1diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h linux-4.14/arch/alpha/include/asm/spinlock_types.h
2--- linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
3+++ linux-4.14/arch/alpha/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
4@@ -2,10 +2,6 @@
5 #ifndef _ALPHA_SPINLOCK_TYPES_H
6 #define _ALPHA_SPINLOCK_TYPES_H
7
8-#ifndef __LINUX_SPINLOCK_TYPES_H
9-# error "please don't include this file directly"
10-#endif
11-
12 typedef struct {
13 volatile unsigned int lock;
14 } arch_spinlock_t;
15diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/irq.h linux-4.14/arch/arm/include/asm/irq.h
16--- linux-4.14.orig/arch/arm/include/asm/irq.h 2017-11-12 19:46:13.000000000 +0100
17+++ linux-4.14/arch/arm/include/asm/irq.h 2018-09-05 11:05:07.000000000 +0200
18@@ -23,6 +23,8 @@
19 #endif
20
21 #ifndef __ASSEMBLY__
22+#include <linux/cpumask.h>
23+
24 struct irqaction;
25 struct pt_regs;
26 extern void migrate_irqs(void);
27diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/spinlock_types.h linux-4.14/arch/arm/include/asm/spinlock_types.h
28--- linux-4.14.orig/arch/arm/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
29+++ linux-4.14/arch/arm/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
30@@ -2,10 +2,6 @@
31 #ifndef __ASM_SPINLOCK_TYPES_H
32 #define __ASM_SPINLOCK_TYPES_H
33
34-#ifndef __LINUX_SPINLOCK_TYPES_H
35-# error "please don't include this file directly"
36-#endif
37-
38 #define TICKET_SHIFT 16
39
40 typedef struct {
41diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/switch_to.h linux-4.14/arch/arm/include/asm/switch_to.h
42--- linux-4.14.orig/arch/arm/include/asm/switch_to.h 2017-11-12 19:46:13.000000000 +0100
43+++ linux-4.14/arch/arm/include/asm/switch_to.h 2018-09-05 11:05:07.000000000 +0200
44@@ -4,6 +4,13 @@
45
46 #include <linux/thread_info.h>
47
48+#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
49+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
50+#else
51+static inline void
52+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
53+#endif
54+
55 /*
56 * For v7 SMP cores running a preemptible kernel we may be pre-empted
57 * during a TLB maintenance operation, so execute an inner-shareable dsb
58@@ -26,6 +33,7 @@
59 #define switch_to(prev,next,last) \
60 do { \
61 __complete_pending_tlbi(); \
62+ switch_kmaps(prev, next); \
63 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
64 } while (0)
65
66diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/thread_info.h linux-4.14/arch/arm/include/asm/thread_info.h
67--- linux-4.14.orig/arch/arm/include/asm/thread_info.h 2017-11-12 19:46:13.000000000 +0100
68+++ linux-4.14/arch/arm/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
69@@ -49,6 +49,7 @@
70 struct thread_info {
71 unsigned long flags; /* low level flags */
72 int preempt_count; /* 0 => preemptable, <0 => bug */
73+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
74 mm_segment_t addr_limit; /* address limit */
75 struct task_struct *task; /* main task structure */
76 __u32 cpu; /* cpu */
77@@ -142,7 +143,8 @@
78 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
79 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
80 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
81-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
82+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
83+#define TIF_NEED_RESCHED_LAZY 7
84
85 #define TIF_NOHZ 12 /* in adaptive nohz mode */
86 #define TIF_USING_IWMMXT 17
87@@ -152,6 +154,7 @@
88 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
89 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
90 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
91+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
92 #define _TIF_UPROBE (1 << TIF_UPROBE)
93 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
94 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
95@@ -167,7 +170,8 @@
96 * Change these and you break ASM code in entry-common.S
97 */
98 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
99- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
100+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
101+ _TIF_NEED_RESCHED_LAZY)
102
103 #endif /* __KERNEL__ */
104 #endif /* __ASM_ARM_THREAD_INFO_H */
105diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/Kconfig linux-4.14/arch/arm/Kconfig
106--- linux-4.14.orig/arch/arm/Kconfig 2017-11-12 19:46:13.000000000 +0100
107+++ linux-4.14/arch/arm/Kconfig 2018-09-05 11:05:07.000000000 +0200
108@@ -45,7 +45,7 @@
109 select HARDIRQS_SW_RESEND
110 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
111 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
112- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
113+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
114 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
115 select HAVE_ARCH_MMAP_RND_BITS if MMU
116 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
117@@ -85,6 +85,7 @@
118 select HAVE_PERF_EVENTS
119 select HAVE_PERF_REGS
120 select HAVE_PERF_USER_STACK_DUMP
121+ select HAVE_PREEMPT_LAZY
122 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
123 select HAVE_REGS_AND_STACK_ACCESS_API
124 select HAVE_SYSCALL_TRACEPOINTS
125@@ -2164,7 +2165,7 @@
126
127 config KERNEL_MODE_NEON
128 bool "Support for NEON in kernel mode"
129- depends on NEON && AEABI
130+ depends on NEON && AEABI && !PREEMPT_RT_BASE
131 help
132 Say Y to include support for NEON in kernel mode.
133
134diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/asm-offsets.c linux-4.14/arch/arm/kernel/asm-offsets.c
135--- linux-4.14.orig/arch/arm/kernel/asm-offsets.c 2017-11-12 19:46:13.000000000 +0100
136+++ linux-4.14/arch/arm/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
137@@ -65,6 +65,7 @@
138 BLANK();
139 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
140 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
141+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
142 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
143 DEFINE(TI_TASK, offsetof(struct thread_info, task));
144 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
145diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-armv.S linux-4.14/arch/arm/kernel/entry-armv.S
146--- linux-4.14.orig/arch/arm/kernel/entry-armv.S 2017-11-12 19:46:13.000000000 +0100
147+++ linux-4.14/arch/arm/kernel/entry-armv.S 2018-09-05 11:05:07.000000000 +0200
148@@ -220,11 +220,18 @@
149
150 #ifdef CONFIG_PREEMPT
151 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
152- ldr r0, [tsk, #TI_FLAGS] @ get flags
153 teq r8, #0 @ if preempt count != 0
154+ bne 1f @ return from exception
155+ ldr r0, [tsk, #TI_FLAGS] @ get flags
156+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
157+ blne svc_preempt @ preempt!
158+
159+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
160+ teq r8, #0 @ if preempt lazy count != 0
161 movne r0, #0 @ force flags to 0
162- tst r0, #_TIF_NEED_RESCHED
163+ tst r0, #_TIF_NEED_RESCHED_LAZY
164 blne svc_preempt
165+1:
166 #endif
167
168 svc_exit r5, irq = 1 @ return from exception
169@@ -239,8 +246,14 @@
170 1: bl preempt_schedule_irq @ irq en/disable is done inside
171 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
172 tst r0, #_TIF_NEED_RESCHED
173+ bne 1b
174+ tst r0, #_TIF_NEED_RESCHED_LAZY
175 reteq r8 @ go again
176- b 1b
177+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
178+ teq r0, #0 @ if preempt lazy count != 0
179+ beq 1b
180+ ret r8 @ go again
181+
182 #endif
183
184 __und_fault:
185diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-common.S linux-4.14/arch/arm/kernel/entry-common.S
186--- linux-4.14.orig/arch/arm/kernel/entry-common.S 2017-11-12 19:46:13.000000000 +0100
187+++ linux-4.14/arch/arm/kernel/entry-common.S 2018-09-05 11:05:07.000000000 +0200
188@@ -53,7 +53,9 @@
189 cmp r2, #TASK_SIZE
190 blne addr_limit_check_failed
191 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
192- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
193+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
194+ bne fast_work_pending
195+ tst r1, #_TIF_SECCOMP
196 bne fast_work_pending
197
198
199@@ -83,8 +85,11 @@
200 cmp r2, #TASK_SIZE
201 blne addr_limit_check_failed
202 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
203- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
204+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
205+ bne do_slower_path
206+ tst r1, #_TIF_SECCOMP
207 beq no_work_pending
208+do_slower_path:
209 UNWIND(.fnend )
210 ENDPROC(ret_fast_syscall)
211
212diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/patch.c linux-4.14/arch/arm/kernel/patch.c
213--- linux-4.14.orig/arch/arm/kernel/patch.c 2017-11-12 19:46:13.000000000 +0100
214+++ linux-4.14/arch/arm/kernel/patch.c 2018-09-05 11:05:07.000000000 +0200
215@@ -16,7 +16,7 @@
216 unsigned int insn;
217 };
218
219-static DEFINE_SPINLOCK(patch_lock);
220+static DEFINE_RAW_SPINLOCK(patch_lock);
221
222 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
223 __acquires(&patch_lock)
224@@ -33,7 +33,7 @@
225 return addr;
226
227 if (flags)
228- spin_lock_irqsave(&patch_lock, *flags);
229+ raw_spin_lock_irqsave(&patch_lock, *flags);
230 else
231 __acquire(&patch_lock);
232
233@@ -48,7 +48,7 @@
234 clear_fixmap(fixmap);
235
236 if (flags)
237- spin_unlock_irqrestore(&patch_lock, *flags);
238+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
239 else
240 __release(&patch_lock);
241 }
242diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/process.c linux-4.14/arch/arm/kernel/process.c
243--- linux-4.14.orig/arch/arm/kernel/process.c 2017-11-12 19:46:13.000000000 +0100
244+++ linux-4.14/arch/arm/kernel/process.c 2018-09-05 11:05:07.000000000 +0200
245@@ -325,6 +325,30 @@
246 }
247
248 #ifdef CONFIG_MMU
249+/*
250+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
251+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
252+ * fail.
253+ */
254+static int __init vectors_user_mapping_init_page(void)
255+{
256+ struct page *page;
257+ unsigned long addr = 0xffff0000;
258+ pgd_t *pgd;
259+ pud_t *pud;
260+ pmd_t *pmd;
261+
262+ pgd = pgd_offset_k(addr);
263+ pud = pud_offset(pgd, addr);
264+ pmd = pmd_offset(pud, addr);
265+ page = pmd_page(*(pmd));
266+
267+ pgtable_page_ctor(page);
268+
269+ return 0;
270+}
271+late_initcall(vectors_user_mapping_init_page);
272+
273 #ifdef CONFIG_KUSER_HELPERS
274 /*
275 * The vectors page is always readable from user space for the
276diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/signal.c linux-4.14/arch/arm/kernel/signal.c
277--- linux-4.14.orig/arch/arm/kernel/signal.c 2017-11-12 19:46:13.000000000 +0100
278+++ linux-4.14/arch/arm/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
279@@ -615,7 +615,8 @@
280 */
281 trace_hardirqs_off();
282 do {
283- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
284+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
285+ _TIF_NEED_RESCHED_LAZY))) {
286 schedule();
287 } else {
288 if (unlikely(!user_mode(regs)))
289diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/smp.c linux-4.14/arch/arm/kernel/smp.c
290--- linux-4.14.orig/arch/arm/kernel/smp.c 2017-11-12 19:46:13.000000000 +0100
291+++ linux-4.14/arch/arm/kernel/smp.c 2018-09-05 11:05:07.000000000 +0200
292@@ -236,8 +236,6 @@
293 flush_cache_louis();
294 local_flush_tlb_all();
295
296- clear_tasks_mm_cpumask(cpu);
297-
298 return 0;
299 }
300
301@@ -255,6 +253,7 @@
302 }
303 pr_debug("CPU%u: shutdown\n", cpu);
304
305+ clear_tasks_mm_cpumask(cpu);
306 /*
307 * platform_cpu_kill() is generally expected to do the powering off
308 * and/or cutting of clocks to the dying CPU. Optionally, this may
309diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/unwind.c linux-4.14/arch/arm/kernel/unwind.c
310--- linux-4.14.orig/arch/arm/kernel/unwind.c 2017-11-12 19:46:13.000000000 +0100
311+++ linux-4.14/arch/arm/kernel/unwind.c 2018-09-05 11:05:07.000000000 +0200
312@@ -93,7 +93,7 @@
313 static const struct unwind_idx *__origin_unwind_idx;
314 extern const struct unwind_idx __stop_unwind_idx[];
315
316-static DEFINE_SPINLOCK(unwind_lock);
317+static DEFINE_RAW_SPINLOCK(unwind_lock);
318 static LIST_HEAD(unwind_tables);
319
320 /* Convert a prel31 symbol to an absolute address */
321@@ -201,7 +201,7 @@
322 /* module unwind tables */
323 struct unwind_table *table;
324
325- spin_lock_irqsave(&unwind_lock, flags);
326+ raw_spin_lock_irqsave(&unwind_lock, flags);
327 list_for_each_entry(table, &unwind_tables, list) {
328 if (addr >= table->begin_addr &&
329 addr < table->end_addr) {
330@@ -213,7 +213,7 @@
331 break;
332 }
333 }
334- spin_unlock_irqrestore(&unwind_lock, flags);
335+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
336 }
337
338 pr_debug("%s: idx = %p\n", __func__, idx);
339@@ -529,9 +529,9 @@
340 tab->begin_addr = text_addr;
341 tab->end_addr = text_addr + text_size;
342
343- spin_lock_irqsave(&unwind_lock, flags);
344+ raw_spin_lock_irqsave(&unwind_lock, flags);
345 list_add_tail(&tab->list, &unwind_tables);
346- spin_unlock_irqrestore(&unwind_lock, flags);
347+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
348
349 return tab;
350 }
351@@ -543,9 +543,9 @@
352 if (!tab)
353 return;
354
355- spin_lock_irqsave(&unwind_lock, flags);
356+ raw_spin_lock_irqsave(&unwind_lock, flags);
357 list_del(&tab->list);
358- spin_unlock_irqrestore(&unwind_lock, flags);
359+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
360
361 kfree(tab);
362 }
363diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-exynos/platsmp.c linux-4.14/arch/arm/mach-exynos/platsmp.c
364--- linux-4.14.orig/arch/arm/mach-exynos/platsmp.c 2017-11-12 19:46:13.000000000 +0100
365+++ linux-4.14/arch/arm/mach-exynos/platsmp.c 2018-09-05 11:05:07.000000000 +0200
366@@ -229,7 +229,7 @@
367 return (void __iomem *)(S5P_VA_SCU);
368 }
369
370-static DEFINE_SPINLOCK(boot_lock);
371+static DEFINE_RAW_SPINLOCK(boot_lock);
372
373 static void exynos_secondary_init(unsigned int cpu)
374 {
375@@ -242,8 +242,8 @@
376 /*
377 * Synchronise with the boot thread.
378 */
379- spin_lock(&boot_lock);
380- spin_unlock(&boot_lock);
381+ raw_spin_lock(&boot_lock);
382+ raw_spin_unlock(&boot_lock);
383 }
384
385 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
386@@ -307,7 +307,7 @@
387 * Set synchronisation state between this boot processor
388 * and the secondary one
389 */
390- spin_lock(&boot_lock);
391+ raw_spin_lock(&boot_lock);
392
393 /*
394 * The secondary processor is waiting to be released from
395@@ -334,7 +334,7 @@
396
397 if (timeout == 0) {
398 printk(KERN_ERR "cpu1 power enable failed");
399- spin_unlock(&boot_lock);
400+ raw_spin_unlock(&boot_lock);
401 return -ETIMEDOUT;
402 }
403 }
404@@ -380,7 +380,7 @@
405 * calibrations, then wait for it to finish
406 */
407 fail:
408- spin_unlock(&boot_lock);
409+ raw_spin_unlock(&boot_lock);
410
411 return pen_release != -1 ? ret : 0;
412 }
413diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c linux-4.14/arch/arm/mach-hisi/platmcpm.c
414--- linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c 2017-11-12 19:46:13.000000000 +0100
415+++ linux-4.14/arch/arm/mach-hisi/platmcpm.c 2018-09-05 11:05:07.000000000 +0200
416@@ -61,7 +61,7 @@
417
418 static void __iomem *sysctrl, *fabric;
419 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
420-static DEFINE_SPINLOCK(boot_lock);
421+static DEFINE_RAW_SPINLOCK(boot_lock);
422 static u32 fabric_phys_addr;
423 /*
424 * [0]: bootwrapper physical address
425@@ -113,7 +113,7 @@
426 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
427 return -EINVAL;
428
429- spin_lock_irq(&boot_lock);
430+ raw_spin_lock_irq(&boot_lock);
431
432 if (hip04_cpu_table[cluster][cpu])
433 goto out;
434@@ -147,7 +147,7 @@
435
436 out:
437 hip04_cpu_table[cluster][cpu]++;
438- spin_unlock_irq(&boot_lock);
439+ raw_spin_unlock_irq(&boot_lock);
440
441 return 0;
442 }
443@@ -162,11 +162,11 @@
444 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
445 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
446
447- spin_lock(&boot_lock);
448+ raw_spin_lock(&boot_lock);
449 hip04_cpu_table[cluster][cpu]--;
450 if (hip04_cpu_table[cluster][cpu] == 1) {
451 /* A power_up request went ahead of us. */
452- spin_unlock(&boot_lock);
453+ raw_spin_unlock(&boot_lock);
454 return;
455 } else if (hip04_cpu_table[cluster][cpu] > 1) {
456 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
457@@ -174,7 +174,7 @@
458 }
459
460 last_man = hip04_cluster_is_down(cluster);
461- spin_unlock(&boot_lock);
462+ raw_spin_unlock(&boot_lock);
463 if (last_man) {
464 /* Since it's Cortex A15, disable L2 prefetching. */
465 asm volatile(
466@@ -203,7 +203,7 @@
467 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
468
469 count = TIMEOUT_MSEC / POLL_MSEC;
470- spin_lock_irq(&boot_lock);
471+ raw_spin_lock_irq(&boot_lock);
472 for (tries = 0; tries < count; tries++) {
473 if (hip04_cpu_table[cluster][cpu])
474 goto err;
475@@ -211,10 +211,10 @@
476 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
477 if (data & CORE_WFI_STATUS(cpu))
478 break;
479- spin_unlock_irq(&boot_lock);
480+ raw_spin_unlock_irq(&boot_lock);
481 /* Wait for clean L2 when the whole cluster is down. */
482 msleep(POLL_MSEC);
483- spin_lock_irq(&boot_lock);
484+ raw_spin_lock_irq(&boot_lock);
485 }
486 if (tries >= count)
487 goto err;
488@@ -231,10 +231,10 @@
489 goto err;
490 if (hip04_cluster_is_down(cluster))
491 hip04_set_snoop_filter(cluster, 0);
492- spin_unlock_irq(&boot_lock);
493+ raw_spin_unlock_irq(&boot_lock);
494 return 1;
495 err:
496- spin_unlock_irq(&boot_lock);
497+ raw_spin_unlock_irq(&boot_lock);
498 return 0;
499 }
500 #endif
501diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c linux-4.14/arch/arm/mach-omap2/omap-smp.c
502--- linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c 2018-09-05 11:03:20.000000000 +0200
503+++ linux-4.14/arch/arm/mach-omap2/omap-smp.c 2018-09-05 11:05:07.000000000 +0200
504@@ -69,7 +69,7 @@
505 .startup_addr = omap5_secondary_startup,
506 };
507
508-static DEFINE_SPINLOCK(boot_lock);
509+static DEFINE_RAW_SPINLOCK(boot_lock);
510
511 void __iomem *omap4_get_scu_base(void)
512 {
513@@ -177,8 +177,8 @@
514 /*
515 * Synchronise with the boot thread.
516 */
517- spin_lock(&boot_lock);
518- spin_unlock(&boot_lock);
519+ raw_spin_lock(&boot_lock);
520+ raw_spin_unlock(&boot_lock);
521 }
522
523 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
524@@ -191,7 +191,7 @@
525 * Set synchronisation state between this boot processor
526 * and the secondary one
527 */
528- spin_lock(&boot_lock);
529+ raw_spin_lock(&boot_lock);
530
531 /*
532 * Update the AuxCoreBoot0 with boot state for secondary core.
533@@ -270,7 +270,7 @@
534 * Now the secondary core is starting up let it run its
535 * calibrations, then wait for it to finish
536 */
537- spin_unlock(&boot_lock);
538+ raw_spin_unlock(&boot_lock);
539
540 return 0;
541 }
542diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-prima2/platsmp.c linux-4.14/arch/arm/mach-prima2/platsmp.c
543--- linux-4.14.orig/arch/arm/mach-prima2/platsmp.c 2017-11-12 19:46:13.000000000 +0100
544+++ linux-4.14/arch/arm/mach-prima2/platsmp.c 2018-09-05 11:05:07.000000000 +0200
545@@ -22,7 +22,7 @@
546
547 static void __iomem *clk_base;
548
549-static DEFINE_SPINLOCK(boot_lock);
550+static DEFINE_RAW_SPINLOCK(boot_lock);
551
552 static void sirfsoc_secondary_init(unsigned int cpu)
553 {
554@@ -36,8 +36,8 @@
555 /*
556 * Synchronise with the boot thread.
557 */
558- spin_lock(&boot_lock);
559- spin_unlock(&boot_lock);
560+ raw_spin_lock(&boot_lock);
561+ raw_spin_unlock(&boot_lock);
562 }
563
564 static const struct of_device_id clk_ids[] = {
565@@ -75,7 +75,7 @@
566 /* make sure write buffer is drained */
567 mb();
568
569- spin_lock(&boot_lock);
570+ raw_spin_lock(&boot_lock);
571
572 /*
573 * The secondary processor is waiting to be released from
574@@ -107,7 +107,7 @@
575 * now the secondary core is starting up let it run its
576 * calibrations, then wait for it to finish
577 */
578- spin_unlock(&boot_lock);
579+ raw_spin_unlock(&boot_lock);
580
581 return pen_release != -1 ? -ENOSYS : 0;
582 }
583diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-qcom/platsmp.c linux-4.14/arch/arm/mach-qcom/platsmp.c
584--- linux-4.14.orig/arch/arm/mach-qcom/platsmp.c 2017-11-12 19:46:13.000000000 +0100
585+++ linux-4.14/arch/arm/mach-qcom/platsmp.c 2018-09-05 11:05:07.000000000 +0200
586@@ -46,7 +46,7 @@
587
588 extern void secondary_startup_arm(void);
589
590-static DEFINE_SPINLOCK(boot_lock);
591+static DEFINE_RAW_SPINLOCK(boot_lock);
592
593 #ifdef CONFIG_HOTPLUG_CPU
594 static void qcom_cpu_die(unsigned int cpu)
595@@ -60,8 +60,8 @@
596 /*
597 * Synchronise with the boot thread.
598 */
599- spin_lock(&boot_lock);
600- spin_unlock(&boot_lock);
601+ raw_spin_lock(&boot_lock);
602+ raw_spin_unlock(&boot_lock);
603 }
604
605 static int scss_release_secondary(unsigned int cpu)
606@@ -284,7 +284,7 @@
607 * set synchronisation state between this boot processor
608 * and the secondary one
609 */
610- spin_lock(&boot_lock);
611+ raw_spin_lock(&boot_lock);
612
613 /*
614 * Send the secondary CPU a soft interrupt, thereby causing
615@@ -297,7 +297,7 @@
616 * now the secondary core is starting up let it run its
617 * calibrations, then wait for it to finish
618 */
619- spin_unlock(&boot_lock);
620+ raw_spin_unlock(&boot_lock);
621
622 return ret;
623 }
624diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-spear/platsmp.c linux-4.14/arch/arm/mach-spear/platsmp.c
625--- linux-4.14.orig/arch/arm/mach-spear/platsmp.c 2017-11-12 19:46:13.000000000 +0100
626+++ linux-4.14/arch/arm/mach-spear/platsmp.c 2018-09-05 11:05:07.000000000 +0200
627@@ -32,7 +32,7 @@
628 sync_cache_w(&pen_release);
629 }
630
631-static DEFINE_SPINLOCK(boot_lock);
632+static DEFINE_RAW_SPINLOCK(boot_lock);
633
634 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
635
636@@ -47,8 +47,8 @@
637 /*
638 * Synchronise with the boot thread.
639 */
640- spin_lock(&boot_lock);
641- spin_unlock(&boot_lock);
642+ raw_spin_lock(&boot_lock);
643+ raw_spin_unlock(&boot_lock);
644 }
645
646 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
647@@ -59,7 +59,7 @@
648 * set synchronisation state between this boot processor
649 * and the secondary one
650 */
651- spin_lock(&boot_lock);
652+ raw_spin_lock(&boot_lock);
653
654 /*
655 * The secondary processor is waiting to be released from
656@@ -84,7 +84,7 @@
657 * now the secondary core is starting up let it run its
658 * calibrations, then wait for it to finish
659 */
660- spin_unlock(&boot_lock);
661+ raw_spin_unlock(&boot_lock);
662
663 return pen_release != -1 ? -ENOSYS : 0;
664 }
665diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-sti/platsmp.c linux-4.14/arch/arm/mach-sti/platsmp.c
666--- linux-4.14.orig/arch/arm/mach-sti/platsmp.c 2017-11-12 19:46:13.000000000 +0100
667+++ linux-4.14/arch/arm/mach-sti/platsmp.c 2018-09-05 11:05:07.000000000 +0200
668@@ -35,7 +35,7 @@
669 sync_cache_w(&pen_release);
670 }
671
672-static DEFINE_SPINLOCK(boot_lock);
673+static DEFINE_RAW_SPINLOCK(boot_lock);
674
675 static void sti_secondary_init(unsigned int cpu)
676 {
677@@ -48,8 +48,8 @@
678 /*
679 * Synchronise with the boot thread.
680 */
681- spin_lock(&boot_lock);
682- spin_unlock(&boot_lock);
683+ raw_spin_lock(&boot_lock);
684+ raw_spin_unlock(&boot_lock);
685 }
686
687 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
688@@ -60,7 +60,7 @@
689 * set synchronisation state between this boot processor
690 * and the secondary one
691 */
692- spin_lock(&boot_lock);
693+ raw_spin_lock(&boot_lock);
694
695 /*
696 * The secondary processor is waiting to be released from
697@@ -91,7 +91,7 @@
698 * now the secondary core is starting up let it run its
699 * calibrations, then wait for it to finish
700 */
701- spin_unlock(&boot_lock);
702+ raw_spin_unlock(&boot_lock);
703
704 return pen_release != -1 ? -ENOSYS : 0;
705 }
706diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/fault.c linux-4.14/arch/arm/mm/fault.c
707--- linux-4.14.orig/arch/arm/mm/fault.c 2017-11-12 19:46:13.000000000 +0100
708+++ linux-4.14/arch/arm/mm/fault.c 2018-09-05 11:05:07.000000000 +0200
709@@ -434,6 +434,9 @@
710 if (addr < TASK_SIZE)
711 return do_page_fault(addr, fsr, regs);
712
713+ if (interrupts_enabled(regs))
714+ local_irq_enable();
715+
716 if (user_mode(regs))
717 goto bad_area;
718
719@@ -501,6 +504,9 @@
720 static int
721 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
722 {
723+ if (interrupts_enabled(regs))
724+ local_irq_enable();
725+
726 do_bad_area(addr, fsr, regs);
727 return 0;
728 }
729diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/highmem.c linux-4.14/arch/arm/mm/highmem.c
730--- linux-4.14.orig/arch/arm/mm/highmem.c 2017-11-12 19:46:13.000000000 +0100
731+++ linux-4.14/arch/arm/mm/highmem.c 2018-09-05 11:05:07.000000000 +0200
732@@ -34,6 +34,11 @@
733 return *ptep;
734 }
735
736+static unsigned int fixmap_idx(int type)
737+{
738+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
739+}
740+
741 void *kmap(struct page *page)
742 {
743 might_sleep();
744@@ -54,12 +59,13 @@
745
746 void *kmap_atomic(struct page *page)
747 {
748+ pte_t pte = mk_pte(page, kmap_prot);
749 unsigned int idx;
750 unsigned long vaddr;
751 void *kmap;
752 int type;
753
754- preempt_disable();
755+ preempt_disable_nort();
756 pagefault_disable();
757 if (!PageHighMem(page))
758 return page_address(page);
759@@ -79,7 +85,7 @@
760
761 type = kmap_atomic_idx_push();
762
763- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
764+ idx = fixmap_idx(type);
765 vaddr = __fix_to_virt(idx);
766 #ifdef CONFIG_DEBUG_HIGHMEM
767 /*
768@@ -93,7 +99,10 @@
769 * in place, so the contained TLB flush ensures the TLB is updated
770 * with the new mapping.
771 */
772- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
773+#ifdef CONFIG_PREEMPT_RT_FULL
774+ current->kmap_pte[type] = pte;
775+#endif
776+ set_fixmap_pte(idx, pte);
777
778 return (void *)vaddr;
779 }
780@@ -106,44 +115,75 @@
781
782 if (kvaddr >= (void *)FIXADDR_START) {
783 type = kmap_atomic_idx();
784- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
785+ idx = fixmap_idx(type);
786
787 if (cache_is_vivt())
788 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
789+#ifdef CONFIG_PREEMPT_RT_FULL
790+ current->kmap_pte[type] = __pte(0);
791+#endif
792 #ifdef CONFIG_DEBUG_HIGHMEM
793 BUG_ON(vaddr != __fix_to_virt(idx));
794- set_fixmap_pte(idx, __pte(0));
795 #else
796 (void) idx; /* to kill a warning */
797 #endif
798+ set_fixmap_pte(idx, __pte(0));
799 kmap_atomic_idx_pop();
800 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
801 /* this address was obtained through kmap_high_get() */
802 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
803 }
804 pagefault_enable();
805- preempt_enable();
806+ preempt_enable_nort();
807 }
808 EXPORT_SYMBOL(__kunmap_atomic);
809
810 void *kmap_atomic_pfn(unsigned long pfn)
811 {
812+ pte_t pte = pfn_pte(pfn, kmap_prot);
813 unsigned long vaddr;
814 int idx, type;
815 struct page *page = pfn_to_page(pfn);
816
817- preempt_disable();
818+ preempt_disable_nort();
819 pagefault_disable();
820 if (!PageHighMem(page))
821 return page_address(page);
822
823 type = kmap_atomic_idx_push();
824- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
825+ idx = fixmap_idx(type);
826 vaddr = __fix_to_virt(idx);
827 #ifdef CONFIG_DEBUG_HIGHMEM
828 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
829 #endif
830- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
831+#ifdef CONFIG_PREEMPT_RT_FULL
832+ current->kmap_pte[type] = pte;
833+#endif
834+ set_fixmap_pte(idx, pte);
835
836 return (void *)vaddr;
837 }
838+#if defined CONFIG_PREEMPT_RT_FULL
839+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
840+{
841+ int i;
842+
843+ /*
844+ * Clear @prev's kmap_atomic mappings
845+ */
846+ for (i = 0; i < prev_p->kmap_idx; i++) {
847+ int idx = fixmap_idx(i);
848+
849+ set_fixmap_pte(idx, __pte(0));
850+ }
851+ /*
852+ * Restore @next_p's kmap_atomic mappings
853+ */
854+ for (i = 0; i < next_p->kmap_idx; i++) {
855+ int idx = fixmap_idx(i);
856+
857+ if (!pte_none(next_p->kmap_pte[i]))
858+ set_fixmap_pte(idx, next_p->kmap_pte[i]);
859+ }
860+}
861+#endif
862diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/plat-versatile/platsmp.c linux-4.14/arch/arm/plat-versatile/platsmp.c
863--- linux-4.14.orig/arch/arm/plat-versatile/platsmp.c 2017-11-12 19:46:13.000000000 +0100
864+++ linux-4.14/arch/arm/plat-versatile/platsmp.c 2018-09-05 11:05:07.000000000 +0200
865@@ -32,7 +32,7 @@
866 sync_cache_w(&pen_release);
867 }
868
869-static DEFINE_SPINLOCK(boot_lock);
870+static DEFINE_RAW_SPINLOCK(boot_lock);
871
872 void versatile_secondary_init(unsigned int cpu)
873 {
874@@ -45,8 +45,8 @@
875 /*
876 * Synchronise with the boot thread.
877 */
878- spin_lock(&boot_lock);
879- spin_unlock(&boot_lock);
880+ raw_spin_lock(&boot_lock);
881+ raw_spin_unlock(&boot_lock);
882 }
883
884 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
885@@ -57,7 +57,7 @@
886 * Set synchronisation state between this boot processor
887 * and the secondary one
888 */
889- spin_lock(&boot_lock);
890+ raw_spin_lock(&boot_lock);
891
892 /*
893 * This is really belt and braces; we hold unintended secondary
894@@ -87,7 +87,7 @@
895 * now the secondary core is starting up let it run its
896 * calibrations, then wait for it to finish
897 */
898- spin_unlock(&boot_lock);
899+ raw_spin_unlock(&boot_lock);
900
901 return pen_release != -1 ? -ENOSYS : 0;
902 }
903diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c linux-4.14/arch/arm64/crypto/crc32-ce-glue.c
904--- linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c 2018-09-05 11:03:20.000000000 +0200
905+++ linux-4.14/arch/arm64/crypto/crc32-ce-glue.c 2018-09-05 11:05:07.000000000 +0200
906@@ -208,7 +208,8 @@
907
908 static int __init crc32_pmull_mod_init(void)
909 {
910- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
911+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
912+ !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
913 crc32_pmull_algs[0].update = crc32_pmull_update;
914 crc32_pmull_algs[1].update = crc32c_pmull_update;
915
916diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/Kconfig linux-4.14/arch/arm64/crypto/Kconfig
917--- linux-4.14.orig/arch/arm64/crypto/Kconfig 2017-11-12 19:46:13.000000000 +0100
918+++ linux-4.14/arch/arm64/crypto/Kconfig 2018-09-05 11:05:07.000000000 +0200
919@@ -19,19 +19,19 @@
920
921 config CRYPTO_SHA1_ARM64_CE
922 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
923- depends on KERNEL_MODE_NEON
924+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
925 select CRYPTO_HASH
926 select CRYPTO_SHA1
927
928 config CRYPTO_SHA2_ARM64_CE
929 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
930- depends on KERNEL_MODE_NEON
931+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
932 select CRYPTO_HASH
933 select CRYPTO_SHA256_ARM64
934
935 config CRYPTO_GHASH_ARM64_CE
936 tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
937- depends on KERNEL_MODE_NEON
938+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
939 select CRYPTO_HASH
940 select CRYPTO_GF128MUL
941 select CRYPTO_AES
942@@ -39,7 +39,7 @@
943
944 config CRYPTO_CRCT10DIF_ARM64_CE
945 tristate "CRCT10DIF digest algorithm using PMULL instructions"
946- depends on KERNEL_MODE_NEON && CRC_T10DIF
947+ depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
948 select CRYPTO_HASH
949
950 config CRYPTO_CRC32_ARM64_CE
951@@ -53,13 +53,13 @@
952
953 config CRYPTO_AES_ARM64_CE
954 tristate "AES core cipher using ARMv8 Crypto Extensions"
955- depends on ARM64 && KERNEL_MODE_NEON
956+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
957 select CRYPTO_ALGAPI
958 select CRYPTO_AES_ARM64
959
960 config CRYPTO_AES_ARM64_CE_CCM
961 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
962- depends on ARM64 && KERNEL_MODE_NEON
963+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
964 select CRYPTO_ALGAPI
965 select CRYPTO_AES_ARM64_CE
966 select CRYPTO_AES_ARM64
967@@ -67,7 +67,7 @@
968
969 config CRYPTO_AES_ARM64_CE_BLK
970 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
971- depends on KERNEL_MODE_NEON
972+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
973 select CRYPTO_BLKCIPHER
974 select CRYPTO_AES_ARM64_CE
975 select CRYPTO_AES_ARM64
976@@ -75,7 +75,7 @@
977
978 config CRYPTO_AES_ARM64_NEON_BLK
979 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
980- depends on KERNEL_MODE_NEON
981+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
982 select CRYPTO_BLKCIPHER
983 select CRYPTO_AES_ARM64
984 select CRYPTO_AES
985@@ -83,13 +83,13 @@
986
987 config CRYPTO_CHACHA20_NEON
988 tristate "NEON accelerated ChaCha20 symmetric cipher"
989- depends on KERNEL_MODE_NEON
990+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
991 select CRYPTO_BLKCIPHER
992 select CRYPTO_CHACHA20
993
994 config CRYPTO_AES_ARM64_BS
995 tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
996- depends on KERNEL_MODE_NEON
997+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
998 select CRYPTO_BLKCIPHER
999 select CRYPTO_AES_ARM64_NEON_BLK
1000 select CRYPTO_AES_ARM64
1001diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h linux-4.14/arch/arm64/include/asm/spinlock_types.h
1002--- linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1003+++ linux-4.14/arch/arm64/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1004@@ -16,10 +16,6 @@
1005 #ifndef __ASM_SPINLOCK_TYPES_H
1006 #define __ASM_SPINLOCK_TYPES_H
1007
1008-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
1009-# error "please don't include this file directly"
1010-#endif
1011-
1012 #include <linux/types.h>
1013
1014 #define TICKET_SHIFT 16
1015diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/thread_info.h linux-4.14/arch/arm64/include/asm/thread_info.h
1016--- linux-4.14.orig/arch/arm64/include/asm/thread_info.h 2018-09-05 11:03:20.000000000 +0200
1017+++ linux-4.14/arch/arm64/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
1018@@ -43,6 +43,7 @@
1019 u64 ttbr0; /* saved TTBR0_EL1 */
1020 #endif
1021 int preempt_count; /* 0 => preemptable, <0 => bug */
1022+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1023 };
1024
1025 #define INIT_THREAD_INFO(tsk) \
1026@@ -82,6 +83,7 @@
1027 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
1028 #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
1029 #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
1030+#define TIF_NEED_RESCHED_LAZY 6
1031 #define TIF_NOHZ 7
1032 #define TIF_SYSCALL_TRACE 8
1033 #define TIF_SYSCALL_AUDIT 9
1034@@ -98,6 +100,7 @@
1035 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
1036 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
1037 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
1038+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1039 #define _TIF_NOHZ (1 << TIF_NOHZ)
1040 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
1041 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1042@@ -109,8 +112,9 @@
1043
1044 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1045 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1046- _TIF_UPROBE | _TIF_FSCHECK)
1047+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
1048
1049+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1050 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1051 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1052 _TIF_NOHZ)
1053diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/Kconfig linux-4.14/arch/arm64/Kconfig
1054--- linux-4.14.orig/arch/arm64/Kconfig 2018-09-05 11:03:20.000000000 +0200
1055+++ linux-4.14/arch/arm64/Kconfig 2018-09-05 11:05:07.000000000 +0200
1056@@ -103,6 +103,7 @@
1057 select HAVE_PERF_EVENTS
1058 select HAVE_PERF_REGS
1059 select HAVE_PERF_USER_STACK_DUMP
1060+ select HAVE_PREEMPT_LAZY
1061 select HAVE_REGS_AND_STACK_ACCESS_API
1062 select HAVE_RCU_TABLE_FREE
1063 select HAVE_SYSCALL_TRACEPOINTS
1064@@ -791,7 +792,7 @@
1065
1066 config XEN
1067 bool "Xen guest support on ARM64"
1068- depends on ARM64 && OF
1069+ depends on ARM64 && OF && !PREEMPT_RT_FULL
1070 select SWIOTLB_XEN
1071 select PARAVIRT
1072 help
1073diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/asm-offsets.c linux-4.14/arch/arm64/kernel/asm-offsets.c
1074--- linux-4.14.orig/arch/arm64/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
1075+++ linux-4.14/arch/arm64/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
1076@@ -39,6 +39,7 @@
1077 BLANK();
1078 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
1079 DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
1080+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
1081 DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
1082 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
1083 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
1084diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/entry.S linux-4.14/arch/arm64/kernel/entry.S
1085--- linux-4.14.orig/arch/arm64/kernel/entry.S 2018-09-05 11:03:20.000000000 +0200
1086+++ linux-4.14/arch/arm64/kernel/entry.S 2018-09-05 11:05:07.000000000 +0200
1087@@ -637,11 +637,16 @@
1088
1089 #ifdef CONFIG_PREEMPT
1090 ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
1091- cbnz w24, 1f // preempt count != 0
1092+ cbnz w24, 2f // preempt count != 0
1093 ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
1094- tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
1095- bl el1_preempt
1096+ tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
1097+
1098+ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
1099+ cbnz w24, 2f // preempt lazy count != 0
1100+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
1101 1:
1102+ bl el1_preempt
1103+2:
1104 #endif
1105 #ifdef CONFIG_TRACE_IRQFLAGS
1106 bl trace_hardirqs_on
1107@@ -655,6 +660,7 @@
1108 1: bl preempt_schedule_irq // irq en/disable is done inside
1109 ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
1110 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
1111+ tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
1112 ret x24
1113 #endif
1114
1115diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/signal.c linux-4.14/arch/arm64/kernel/signal.c
1116--- linux-4.14.orig/arch/arm64/kernel/signal.c 2018-09-05 11:03:20.000000000 +0200
1117+++ linux-4.14/arch/arm64/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
1118@@ -756,7 +756,7 @@
1119 /* Check valid user FS if needed */
1120 addr_limit_user_check();
1121
1122- if (thread_flags & _TIF_NEED_RESCHED) {
1123+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1124 schedule();
1125 } else {
1126 local_irq_enable();
1127diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h linux-4.14/arch/blackfin/include/asm/spinlock_types.h
1128--- linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1129+++ linux-4.14/arch/blackfin/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1130@@ -7,10 +7,6 @@
1131 #ifndef __ASM_SPINLOCK_TYPES_H
1132 #define __ASM_SPINLOCK_TYPES_H
1133
1134-#ifndef __LINUX_SPINLOCK_TYPES_H
1135-# error "please don't include this file directly"
1136-#endif
1137-
1138 #include <asm/rwlock.h>
1139
1140 typedef struct {
1141diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h linux-4.14/arch/hexagon/include/asm/spinlock_types.h
1142--- linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1143+++ linux-4.14/arch/hexagon/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1144@@ -21,10 +21,6 @@
1145 #ifndef _ASM_SPINLOCK_TYPES_H
1146 #define _ASM_SPINLOCK_TYPES_H
1147
1148-#ifndef __LINUX_SPINLOCK_TYPES_H
1149-# error "please don't include this file directly"
1150-#endif
1151-
1152 typedef struct {
1153 volatile unsigned int lock;
1154 } arch_spinlock_t;
1155diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h linux-4.14/arch/ia64/include/asm/spinlock_types.h
1156--- linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1157+++ linux-4.14/arch/ia64/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1158@@ -2,10 +2,6 @@
1159 #ifndef _ASM_IA64_SPINLOCK_TYPES_H
1160 #define _ASM_IA64_SPINLOCK_TYPES_H
1161
1162-#ifndef __LINUX_SPINLOCK_TYPES_H
1163-# error "please don't include this file directly"
1164-#endif
1165-
1166 typedef struct {
1167 volatile unsigned int lock;
1168 } arch_spinlock_t;
1169diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/kernel/mca.c linux-4.14/arch/ia64/kernel/mca.c
1170--- linux-4.14.orig/arch/ia64/kernel/mca.c 2017-11-12 19:46:13.000000000 +0100
1171+++ linux-4.14/arch/ia64/kernel/mca.c 2018-09-05 11:05:07.000000000 +0200
1172@@ -1824,7 +1824,7 @@
1173 ti->cpu = cpu;
1174 p->stack = ti;
1175 p->state = TASK_UNINTERRUPTIBLE;
1176- cpumask_set_cpu(cpu, &p->cpus_allowed);
1177+ cpumask_set_cpu(cpu, &p->cpus_mask);
1178 INIT_LIST_HEAD(&p->tasks);
1179 p->parent = p->real_parent = p->group_leader = p;
1180 INIT_LIST_HEAD(&p->children);
1181diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/Kconfig linux-4.14/arch/Kconfig
1182--- linux-4.14.orig/arch/Kconfig 2018-09-05 11:03:20.000000000 +0200
1183+++ linux-4.14/arch/Kconfig 2018-09-05 11:05:07.000000000 +0200
1184@@ -20,6 +20,7 @@
1185 tristate "OProfile system profiling"
1186 depends on PROFILING
1187 depends on HAVE_OPROFILE
1188+ depends on !PREEMPT_RT_FULL
1189 select RING_BUFFER
1190 select RING_BUFFER_ALLOW_SWAP
1191 help
1192diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h linux-4.14/arch/m32r/include/asm/spinlock_types.h
1193--- linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1194+++ linux-4.14/arch/m32r/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1195@@ -2,10 +2,6 @@
1196 #ifndef _ASM_M32R_SPINLOCK_TYPES_H
1197 #define _ASM_M32R_SPINLOCK_TYPES_H
1198
1199-#ifndef __LINUX_SPINLOCK_TYPES_H
1200-# error "please don't include this file directly"
1201-#endif
1202-
1203 typedef struct {
1204 volatile int slock;
1205 } arch_spinlock_t;
1206diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/metag/include/asm/spinlock_types.h linux-4.14/arch/metag/include/asm/spinlock_types.h
1207--- linux-4.14.orig/arch/metag/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1208+++ linux-4.14/arch/metag/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1209@@ -2,10 +2,6 @@
1210 #ifndef _ASM_METAG_SPINLOCK_TYPES_H
1211 #define _ASM_METAG_SPINLOCK_TYPES_H
1212
1213-#ifndef __LINUX_SPINLOCK_TYPES_H
1214-# error "please don't include this file directly"
1215-#endif
1216-
1217 typedef struct {
1218 volatile unsigned int lock;
1219 } arch_spinlock_t;
1220diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/include/asm/switch_to.h linux-4.14/arch/mips/include/asm/switch_to.h
1221--- linux-4.14.orig/arch/mips/include/asm/switch_to.h 2017-11-12 19:46:13.000000000 +0100
1222+++ linux-4.14/arch/mips/include/asm/switch_to.h 2018-09-05 11:05:07.000000000 +0200
1223@@ -42,7 +42,7 @@
1224 * inline to try to keep the overhead down. If we have been forced to run on
1225 * a "CPU" with an FPU because of a previous high level of FP computation,
1226 * but did not actually use the FPU during the most recent time-slice (CU1
1227- * isn't set), we undo the restriction on cpus_allowed.
1228+ * isn't set), we undo the restriction on cpus_mask.
1229 *
1230 * We're not calling set_cpus_allowed() here, because we have no need to
1231 * force prompt migration - we're already switching the current CPU to a
1232@@ -57,7 +57,7 @@
1233 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
1234 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
1235 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
1236- prev->cpus_allowed = prev->thread.user_cpus_allowed; \
1237+ prev->cpus_mask = prev->thread.user_cpus_allowed; \
1238 } \
1239 next->thread.emulated_fp = 0; \
1240 } while(0)
1241diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/Kconfig linux-4.14/arch/mips/Kconfig
1242--- linux-4.14.orig/arch/mips/Kconfig 2018-09-05 11:03:20.000000000 +0200
1243+++ linux-4.14/arch/mips/Kconfig 2018-09-05 11:05:07.000000000 +0200
1244@@ -2519,7 +2519,7 @@
1245 #
1246 config HIGHMEM
1247 bool "High Memory Support"
1248- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1249+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1250
1251 config CPU_SUPPORTS_HIGHMEM
1252 bool
1253diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c linux-4.14/arch/mips/kernel/mips-mt-fpaff.c
1254--- linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c 2017-11-12 19:46:13.000000000 +0100
1255+++ linux-4.14/arch/mips/kernel/mips-mt-fpaff.c 2018-09-05 11:05:07.000000000 +0200
1256@@ -177,7 +177,7 @@
1257 if (retval)
1258 goto out_unlock;
1259
1260- cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
1261+ cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
1262 cpumask_and(&mask, &allowed, cpu_active_mask);
1263
1264 out_unlock:
1265diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/traps.c linux-4.14/arch/mips/kernel/traps.c
1266--- linux-4.14.orig/arch/mips/kernel/traps.c 2018-09-05 11:03:20.000000000 +0200
1267+++ linux-4.14/arch/mips/kernel/traps.c 2018-09-05 11:05:07.000000000 +0200
1268@@ -1193,12 +1193,12 @@
1269 * restricted the allowed set to exclude any CPUs with FPUs,
1270 * we'll skip the procedure.
1271 */
1272- if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
1273+ if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
1274 cpumask_t tmask;
1275
1276 current->thread.user_cpus_allowed
1277- = current->cpus_allowed;
1278- cpumask_and(&tmask, &current->cpus_allowed,
1279+ = current->cpus_mask;
1280+ cpumask_and(&tmask, &current->cpus_mask,
1281 &mt_fpu_cpumask);
1282 set_cpus_allowed_ptr(current, &tmask);
1283 set_thread_flag(TIF_FPUBOUND);
1284diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h linux-4.14/arch/mn10300/include/asm/spinlock_types.h
1285--- linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1286+++ linux-4.14/arch/mn10300/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1287@@ -2,10 +2,6 @@
1288 #ifndef _ASM_SPINLOCK_TYPES_H
1289 #define _ASM_SPINLOCK_TYPES_H
1290
1291-#ifndef __LINUX_SPINLOCK_TYPES_H
1292-# error "please don't include this file directly"
1293-#endif
1294-
1295 typedef struct arch_spinlock {
1296 unsigned int slock;
1297 } arch_spinlock_t;
1298diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h linux-4.14/arch/powerpc/include/asm/spinlock_types.h
1299--- linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1300+++ linux-4.14/arch/powerpc/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1301@@ -2,10 +2,6 @@
1302 #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
1303 #define _ASM_POWERPC_SPINLOCK_TYPES_H
1304
1305-#ifndef __LINUX_SPINLOCK_TYPES_H
1306-# error "please don't include this file directly"
1307-#endif
1308-
1309 typedef struct {
1310 volatile unsigned int slock;
1311 } arch_spinlock_t;
1312diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/thread_info.h linux-4.14/arch/powerpc/include/asm/thread_info.h
1313--- linux-4.14.orig/arch/powerpc/include/asm/thread_info.h 2017-11-12 19:46:13.000000000 +0100
1314+++ linux-4.14/arch/powerpc/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
1315@@ -36,6 +36,8 @@
1316 int cpu; /* cpu we're on */
1317 int preempt_count; /* 0 => preemptable,
1318 <0 => BUG */
1319+ int preempt_lazy_count; /* 0 => preemptable,
1320+ <0 => BUG */
1321 unsigned long local_flags; /* private flags for thread */
1322 #ifdef CONFIG_LIVEPATCH
1323 unsigned long *livepatch_sp;
1324@@ -81,8 +83,7 @@
1325 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
1326 #define TIF_SIGPENDING 1 /* signal pending */
1327 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
1328-#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
1329- TIF_NEED_RESCHED */
1330+#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
1331 #define TIF_32BIT 4 /* 32 bit binary */
1332 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
1333 #define TIF_PATCH_PENDING 6 /* pending live patching update */
1334@@ -101,6 +102,8 @@
1335 #if defined(CONFIG_PPC64)
1336 #define TIF_ELF2ABI 18 /* function descriptors must die! */
1337 #endif
1338+#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
1339+ TIF_NEED_RESCHED */
1340
1341 /* as above, but as bit values */
1342 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
1343@@ -120,14 +123,16 @@
1344 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
1345 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
1346 #define _TIF_NOHZ (1<<TIF_NOHZ)
1347+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1348 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1349 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1350 _TIF_NOHZ)
1351
1352 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1353 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1354- _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
1355+ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
1356 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
1357+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1358
1359 /* Bits in local_flags */
1360 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1361diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/Kconfig linux-4.14/arch/powerpc/Kconfig
1362--- linux-4.14.orig/arch/powerpc/Kconfig 2018-09-05 11:03:20.000000000 +0200
1363+++ linux-4.14/arch/powerpc/Kconfig 2018-09-05 11:05:07.000000000 +0200
1364@@ -111,10 +111,11 @@
1365
1366 config RWSEM_GENERIC_SPINLOCK
1367 bool
1368+ default y if PREEMPT_RT_FULL
1369
1370 config RWSEM_XCHGADD_ALGORITHM
1371 bool
1372- default y
1373+ default y if !PREEMPT_RT_FULL
1374
1375 config GENERIC_LOCKBREAK
1376 bool
1377@@ -215,6 +216,7 @@
1378 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
1379 select HAVE_PERF_REGS
1380 select HAVE_PERF_USER_STACK_DUMP
1381+ select HAVE_PREEMPT_LAZY
1382 select HAVE_RCU_TABLE_FREE if SMP
1383 select HAVE_REGS_AND_STACK_ACCESS_API
1384 select HAVE_SYSCALL_TRACEPOINTS
1385@@ -390,7 +392,7 @@
1386
1387 config HIGHMEM
1388 bool "High memory support"
1389- depends on PPC32
1390+ depends on PPC32 && !PREEMPT_RT_FULL
1391
1392 source kernel/Kconfig.hz
1393 source kernel/Kconfig.preempt
1394diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c linux-4.14/arch/powerpc/kernel/asm-offsets.c
1395--- linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
1396+++ linux-4.14/arch/powerpc/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
1397@@ -156,6 +156,7 @@
1398 OFFSET(TI_FLAGS, thread_info, flags);
1399 OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
1400 OFFSET(TI_PREEMPT, thread_info, preempt_count);
1401+ OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
1402 OFFSET(TI_TASK, thread_info, task);
1403 OFFSET(TI_CPU, thread_info, cpu);
1404
1405diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_32.S linux-4.14/arch/powerpc/kernel/entry_32.S
1406--- linux-4.14.orig/arch/powerpc/kernel/entry_32.S 2017-11-12 19:46:13.000000000 +0100
1407+++ linux-4.14/arch/powerpc/kernel/entry_32.S 2018-09-05 11:05:07.000000000 +0200
1408@@ -866,7 +866,14 @@
1409 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
1410 bne restore
1411 andi. r8,r8,_TIF_NEED_RESCHED
1412+ bne+ 1f
1413+ lwz r0,TI_PREEMPT_LAZY(r9)
1414+ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
1415+ bne restore
1416+ lwz r0,TI_FLAGS(r9)
1417+ andi. r0,r0,_TIF_NEED_RESCHED_LAZY
1418 beq+ restore
1419+1:
1420 lwz r3,_MSR(r1)
1421 andi. r0,r3,MSR_EE /* interrupts off? */
1422 beq restore /* don't schedule if so */
1423@@ -877,11 +884,11 @@
1424 */
1425 bl trace_hardirqs_off
1426 #endif
1427-1: bl preempt_schedule_irq
1428+2: bl preempt_schedule_irq
1429 CURRENT_THREAD_INFO(r9, r1)
1430 lwz r3,TI_FLAGS(r9)
1431- andi. r0,r3,_TIF_NEED_RESCHED
1432- bne- 1b
1433+ andi. r0,r3,_TIF_NEED_RESCHED_MASK
1434+ bne- 2b
1435 #ifdef CONFIG_TRACE_IRQFLAGS
1436 /* And now, to properly rebalance the above, we tell lockdep they
1437 * are being turned back on, which will happen when we return
1438@@ -1204,7 +1211,7 @@
1439 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1440
1441 do_work: /* r10 contains MSR_KERNEL here */
1442- andi. r0,r9,_TIF_NEED_RESCHED
1443+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
1444 beq do_user_signal
1445
1446 do_resched: /* r10 contains MSR_KERNEL here */
1447@@ -1225,7 +1232,7 @@
1448 MTMSRD(r10) /* disable interrupts */
1449 CURRENT_THREAD_INFO(r9, r1)
1450 lwz r9,TI_FLAGS(r9)
1451- andi. r0,r9,_TIF_NEED_RESCHED
1452+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
1453 bne- do_resched
1454 andi. r0,r9,_TIF_USER_WORK_MASK
1455 beq restore_user
1456diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_64.S linux-4.14/arch/powerpc/kernel/entry_64.S
1457--- linux-4.14.orig/arch/powerpc/kernel/entry_64.S 2018-09-05 11:03:20.000000000 +0200
1458+++ linux-4.14/arch/powerpc/kernel/entry_64.S 2018-09-05 11:05:07.000000000 +0200
1459@@ -690,7 +690,7 @@
1460 bl restore_math
1461 b restore
1462 #endif
1463-1: andi. r0,r4,_TIF_NEED_RESCHED
1464+1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
1465 beq 2f
1466 bl restore_interrupts
1467 SCHEDULE_USER
1468@@ -752,10 +752,18 @@
1469
1470 #ifdef CONFIG_PREEMPT
1471 /* Check if we need to preempt */
1472+ lwz r8,TI_PREEMPT(r9)
1473+ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
1474+ bne restore
1475 andi. r0,r4,_TIF_NEED_RESCHED
1476+ bne+ check_count
1477+
1478+ andi. r0,r4,_TIF_NEED_RESCHED_LAZY
1479 beq+ restore
1480+ lwz r8,TI_PREEMPT_LAZY(r9)
1481+
1482 /* Check that preempt_count() == 0 and interrupts are enabled */
1483- lwz r8,TI_PREEMPT(r9)
1484+check_count:
1485 cmpwi cr1,r8,0
1486 ld r0,SOFTE(r1)
1487 cmpdi r0,0
1488@@ -772,7 +780,7 @@
1489 /* Re-test flags and eventually loop */
1490 CURRENT_THREAD_INFO(r9, r1)
1491 ld r4,TI_FLAGS(r9)
1492- andi. r0,r4,_TIF_NEED_RESCHED
1493+ andi. r0,r4,_TIF_NEED_RESCHED_MASK
1494 bne 1b
1495
1496 /*
1497diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/irq.c linux-4.14/arch/powerpc/kernel/irq.c
1498--- linux-4.14.orig/arch/powerpc/kernel/irq.c 2018-09-05 11:03:20.000000000 +0200
1499+++ linux-4.14/arch/powerpc/kernel/irq.c 2018-09-05 11:05:07.000000000 +0200
1500@@ -693,6 +693,7 @@
1501 }
1502 }
1503
1504+#ifndef CONFIG_PREEMPT_RT_FULL
1505 void do_softirq_own_stack(void)
1506 {
1507 struct thread_info *curtp, *irqtp;
1508@@ -710,6 +711,7 @@
1509 if (irqtp->flags)
1510 set_bits(irqtp->flags, &curtp->flags);
1511 }
1512+#endif
1513
1514 irq_hw_number_t virq_to_hw(unsigned int virq)
1515 {
1516diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_32.S linux-4.14/arch/powerpc/kernel/misc_32.S
1517--- linux-4.14.orig/arch/powerpc/kernel/misc_32.S 2017-11-12 19:46:13.000000000 +0100
1518+++ linux-4.14/arch/powerpc/kernel/misc_32.S 2018-09-05 11:05:07.000000000 +0200
1519@@ -41,6 +41,7 @@
1520 * We store the saved ksp_limit in the unused part
1521 * of the STACK_FRAME_OVERHEAD
1522 */
1523+#ifndef CONFIG_PREEMPT_RT_FULL
1524 _GLOBAL(call_do_softirq)
1525 mflr r0
1526 stw r0,4(r1)
1527@@ -57,6 +58,7 @@
1528 stw r10,THREAD+KSP_LIMIT(r2)
1529 mtlr r0
1530 blr
1531+#endif
1532
1533 /*
1534 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1535diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_64.S linux-4.14/arch/powerpc/kernel/misc_64.S
1536--- linux-4.14.orig/arch/powerpc/kernel/misc_64.S 2018-09-05 11:03:20.000000000 +0200
1537+++ linux-4.14/arch/powerpc/kernel/misc_64.S 2018-09-05 11:05:07.000000000 +0200
1538@@ -31,6 +31,7 @@
1539
1540 .text
1541
1542+#ifndef CONFIG_PREEMPT_RT_FULL
1543 _GLOBAL(call_do_softirq)
1544 mflr r0
1545 std r0,16(r1)
1546@@ -41,6 +42,7 @@
1547 ld r0,16(r1)
1548 mtlr r0
1549 blr
1550+#endif
1551
1552 _GLOBAL(call_do_irq)
1553 mflr r0
1554diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kvm/Kconfig linux-4.14/arch/powerpc/kvm/Kconfig
1555--- linux-4.14.orig/arch/powerpc/kvm/Kconfig 2018-09-05 11:03:20.000000000 +0200
1556+++ linux-4.14/arch/powerpc/kvm/Kconfig 2018-09-05 11:05:07.000000000 +0200
1557@@ -177,6 +177,7 @@
1558 config KVM_MPIC
1559 bool "KVM in-kernel MPIC emulation"
1560 depends on KVM && E500
1561+ depends on !PREEMPT_RT_FULL
1562 select HAVE_KVM_IRQCHIP
1563 select HAVE_KVM_IRQFD
1564 select HAVE_KVM_IRQ_ROUTING
1565diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c
1566--- linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c 2017-11-12 19:46:13.000000000 +0100
1567+++ linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c 2018-09-05 11:05:07.000000000 +0200
1568@@ -141,7 +141,7 @@
1569 * runqueue. The context will be rescheduled on the proper node
1570 * if it is timesliced or preempted.
1571 */
1572- cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
1573+ cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
1574
1575 /* Save the current cpu id for spu interrupt routing. */
1576 ctx->last_ran = raw_smp_processor_id();
1577diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c linux-4.14/arch/powerpc/platforms/ps3/device-init.c
1578--- linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c 2017-11-12 19:46:13.000000000 +0100
1579+++ linux-4.14/arch/powerpc/platforms/ps3/device-init.c 2018-09-05 11:05:07.000000000 +0200
1580@@ -752,7 +752,7 @@
1581 }
1582 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1583
1584- res = wait_event_interruptible(dev->done.wait,
1585+ res = swait_event_interruptible(dev->done.wait,
1586 dev->done.done || kthread_should_stop());
1587 if (kthread_should_stop())
1588 res = -EINTR;
1589diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/s390/include/asm/spinlock_types.h linux-4.14/arch/s390/include/asm/spinlock_types.h
1590--- linux-4.14.orig/arch/s390/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1591+++ linux-4.14/arch/s390/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1592@@ -2,10 +2,6 @@
1593 #ifndef __ASM_SPINLOCK_TYPES_H
1594 #define __ASM_SPINLOCK_TYPES_H
1595
1596-#ifndef __LINUX_SPINLOCK_TYPES_H
1597-# error "please don't include this file directly"
1598-#endif
1599-
1600 typedef struct {
1601 int lock;
1602 } __attribute__ ((aligned (4))) arch_spinlock_t;
1603diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/include/asm/spinlock_types.h linux-4.14/arch/sh/include/asm/spinlock_types.h
1604--- linux-4.14.orig/arch/sh/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1605+++ linux-4.14/arch/sh/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1606@@ -2,10 +2,6 @@
1607 #ifndef __ASM_SH_SPINLOCK_TYPES_H
1608 #define __ASM_SH_SPINLOCK_TYPES_H
1609
1610-#ifndef __LINUX_SPINLOCK_TYPES_H
1611-# error "please don't include this file directly"
1612-#endif
1613-
1614 typedef struct {
1615 volatile unsigned int lock;
1616 } arch_spinlock_t;
1617diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/kernel/irq.c linux-4.14/arch/sh/kernel/irq.c
1618--- linux-4.14.orig/arch/sh/kernel/irq.c 2017-11-12 19:46:13.000000000 +0100
1619+++ linux-4.14/arch/sh/kernel/irq.c 2018-09-05 11:05:07.000000000 +0200
1620@@ -148,6 +148,7 @@
1621 hardirq_ctx[cpu] = NULL;
1622 }
1623
1624+#ifndef CONFIG_PREEMPT_RT_FULL
1625 void do_softirq_own_stack(void)
1626 {
1627 struct thread_info *curctx;
1628@@ -175,6 +176,7 @@
1629 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1630 );
1631 }
1632+#endif
1633 #else
1634 static inline void handle_one_irq(unsigned int irq)
1635 {
1636diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/Kconfig linux-4.14/arch/sparc/Kconfig
1637--- linux-4.14.orig/arch/sparc/Kconfig 2017-11-12 19:46:13.000000000 +0100
1638+++ linux-4.14/arch/sparc/Kconfig 2018-09-05 11:05:07.000000000 +0200
1639@@ -206,12 +206,10 @@
1640 source kernel/Kconfig.hz
1641
1642 config RWSEM_GENERIC_SPINLOCK
1643- bool
1644- default y if SPARC32
1645+ def_bool PREEMPT_RT_FULL
1646
1647 config RWSEM_XCHGADD_ALGORITHM
1648- bool
1649- default y if SPARC64
1650+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1651
1652 config GENERIC_HWEIGHT
1653 bool
1654diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/kernel/irq_64.c linux-4.14/arch/sparc/kernel/irq_64.c
1655--- linux-4.14.orig/arch/sparc/kernel/irq_64.c 2017-11-12 19:46:13.000000000 +0100
1656+++ linux-4.14/arch/sparc/kernel/irq_64.c 2018-09-05 11:05:07.000000000 +0200
1657@@ -855,6 +855,7 @@
1658 set_irq_regs(old_regs);
1659 }
1660
1661+#ifndef CONFIG_PREEMPT_RT_FULL
1662 void do_softirq_own_stack(void)
1663 {
1664 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1665@@ -869,6 +870,7 @@
1666 __asm__ __volatile__("mov %0, %%sp"
1667 : : "r" (orig_sp));
1668 }
1669+#endif
1670
1671 #ifdef CONFIG_HOTPLUG_CPU
1672 void fixup_irqs(void)
1673diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/setup.h linux-4.14/arch/tile/include/asm/setup.h
1674--- linux-4.14.orig/arch/tile/include/asm/setup.h 2017-11-12 19:46:13.000000000 +0100
1675+++ linux-4.14/arch/tile/include/asm/setup.h 2018-09-05 11:05:07.000000000 +0200
1676@@ -49,7 +49,7 @@
1677
1678 /* Hook hardwall code into changes in affinity. */
1679 #define arch_set_cpus_allowed(p, new_mask) do { \
1680- if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
1681+ if (!cpumask_equal(p->cpus_ptr, new_mask)) \
1682 hardwall_deactivate_all(p); \
1683 } while (0)
1684 #endif
1685diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/spinlock_types.h linux-4.14/arch/tile/include/asm/spinlock_types.h
1686--- linux-4.14.orig/arch/tile/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
1687+++ linux-4.14/arch/tile/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
1688@@ -15,10 +15,6 @@
1689 #ifndef _ASM_TILE_SPINLOCK_TYPES_H
1690 #define _ASM_TILE_SPINLOCK_TYPES_H
1691
1692-#ifndef __LINUX_SPINLOCK_TYPES_H
1693-# error "please don't include this file directly"
1694-#endif
1695-
1696 #ifdef __tilegx__
1697
1698 /* Low 15 bits are "next"; high 15 bits are "current". */
1699diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/kernel/hardwall.c linux-4.14/arch/tile/kernel/hardwall.c
1700--- linux-4.14.orig/arch/tile/kernel/hardwall.c 2017-11-12 19:46:13.000000000 +0100
1701+++ linux-4.14/arch/tile/kernel/hardwall.c 2018-09-05 11:05:07.000000000 +0200
1702@@ -590,12 +590,12 @@
1703 * Get our affinity; if we're not bound to this tile uniquely,
1704 * we can't access the network registers.
1705 */
1706- if (cpumask_weight(&p->cpus_allowed) != 1)
1707+ if (p->nr_cpus_allowed != 1)
1708 return -EPERM;
1709
1710 /* Make sure we are bound to a cpu assigned to this resource. */
1711 cpu = smp_processor_id();
1712- BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
1713+ BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
1714 if (!cpumask_test_cpu(cpu, &info->cpumask))
1715 return -EINVAL;
1716
1717@@ -621,17 +621,17 @@
1718 * Deactivate a task's hardwall. Must hold lock for hardwall_type.
1719 * This method may be called from exit_thread(), so we don't want to
1720 * rely on too many fields of struct task_struct still being valid.
1721- * We assume the cpus_allowed, pid, and comm fields are still valid.
1722+ * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
1723 */
1724 static void _hardwall_deactivate(struct hardwall_type *hwt,
1725 struct task_struct *task)
1726 {
1727 struct thread_struct *ts = &task->thread;
1728
1729- if (cpumask_weight(&task->cpus_allowed) != 1) {
1730+ if (task->nr_cpus_allowed != 1) {
1731 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
1732 task->pid, task->comm, hwt->name,
1733- cpumask_weight(&task->cpus_allowed));
1734+ task->nr_cpus_allowed);
1735 BUG();
1736 }
1737
1738diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c linux-4.14/arch/x86/crypto/aesni-intel_glue.c
1739--- linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c 2018-09-05 11:03:20.000000000 +0200
1740+++ linux-4.14/arch/x86/crypto/aesni-intel_glue.c 2018-09-05 11:05:07.000000000 +0200
1741@@ -387,14 +387,14 @@
1742
1743 err = skcipher_walk_virt(&walk, req, true);
1744
1745- kernel_fpu_begin();
1746 while ((nbytes = walk.nbytes)) {
1747+ kernel_fpu_begin();
1748 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1749 nbytes & AES_BLOCK_MASK);
1750+ kernel_fpu_end();
1751 nbytes &= AES_BLOCK_SIZE - 1;
1752 err = skcipher_walk_done(&walk, nbytes);
1753 }
1754- kernel_fpu_end();
1755
1756 return err;
1757 }
1758@@ -409,14 +409,14 @@
1759
1760 err = skcipher_walk_virt(&walk, req, true);
1761
1762- kernel_fpu_begin();
1763 while ((nbytes = walk.nbytes)) {
1764+ kernel_fpu_begin();
1765 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1766 nbytes & AES_BLOCK_MASK);
1767+ kernel_fpu_end();
1768 nbytes &= AES_BLOCK_SIZE - 1;
1769 err = skcipher_walk_done(&walk, nbytes);
1770 }
1771- kernel_fpu_end();
1772
1773 return err;
1774 }
1775@@ -431,14 +431,14 @@
1776
1777 err = skcipher_walk_virt(&walk, req, true);
1778
1779- kernel_fpu_begin();
1780 while ((nbytes = walk.nbytes)) {
1781+ kernel_fpu_begin();
1782 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1783 nbytes & AES_BLOCK_MASK, walk.iv);
1784+ kernel_fpu_end();
1785 nbytes &= AES_BLOCK_SIZE - 1;
1786 err = skcipher_walk_done(&walk, nbytes);
1787 }
1788- kernel_fpu_end();
1789
1790 return err;
1791 }
1792@@ -453,14 +453,14 @@
1793
1794 err = skcipher_walk_virt(&walk, req, true);
1795
1796- kernel_fpu_begin();
1797 while ((nbytes = walk.nbytes)) {
1798+ kernel_fpu_begin();
1799 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1800 nbytes & AES_BLOCK_MASK, walk.iv);
1801+ kernel_fpu_end();
1802 nbytes &= AES_BLOCK_SIZE - 1;
1803 err = skcipher_walk_done(&walk, nbytes);
1804 }
1805- kernel_fpu_end();
1806
1807 return err;
1808 }
1809@@ -510,18 +510,20 @@
1810
1811 err = skcipher_walk_virt(&walk, req, true);
1812
1813- kernel_fpu_begin();
1814 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1815+ kernel_fpu_begin();
1816 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1817 nbytes & AES_BLOCK_MASK, walk.iv);
1818+ kernel_fpu_end();
1819 nbytes &= AES_BLOCK_SIZE - 1;
1820 err = skcipher_walk_done(&walk, nbytes);
1821 }
1822 if (walk.nbytes) {
1823+ kernel_fpu_begin();
1824 ctr_crypt_final(ctx, &walk);
1825+ kernel_fpu_end();
1826 err = skcipher_walk_done(&walk, 0);
1827 }
1828- kernel_fpu_end();
1829
1830 return err;
1831 }
1832diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c
1833--- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c 2017-11-12 19:46:13.000000000 +0100
1834+++ linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c 2018-09-05 11:05:07.000000000 +0200
1835@@ -206,6 +206,20 @@
1836 bool fpu_enabled;
1837 };
1838
1839+#ifdef CONFIG_PREEMPT_RT_FULL
1840+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1841+{
1842+ bool fpu_enabled = ctx->fpu_enabled;
1843+
1844+ if (!fpu_enabled)
1845+ return;
1846+ camellia_fpu_end(fpu_enabled);
1847+ ctx->fpu_enabled = false;
1848+}
1849+#else
1850+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1851+#endif
1852+
1853 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1854 {
1855 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1856@@ -221,16 +235,19 @@
1857 }
1858
1859 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1860+ kernel_fpu_resched();
1861 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
1862 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1863 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1864 }
1865
1866 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1867+ kernel_fpu_resched();
1868 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1869 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1870 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1871 }
1872+ camellia_fpu_end_rt(ctx);
1873
1874 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1875 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1876@@ -251,16 +268,19 @@
1877 }
1878
1879 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1880+ kernel_fpu_resched();
1881 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
1882 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1883 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1884 }
1885
1886 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1887+ kernel_fpu_resched();
1888 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1889 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1890 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1891 }
1892+ camellia_fpu_end_rt(ctx);
1893
1894 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1895 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
1896diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c
1897--- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
1898+++ linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
1899@@ -210,6 +210,21 @@
1900 bool fpu_enabled;
1901 };
1902
1903+#ifdef CONFIG_PREEMPT_RT_FULL
1904+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1905+{
1906+ bool fpu_enabled = ctx->fpu_enabled;
1907+
1908+ if (!fpu_enabled)
1909+ return;
1910+ camellia_fpu_end(fpu_enabled);
1911+ ctx->fpu_enabled = false;
1912+}
1913+
1914+#else
1915+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1916+#endif
1917+
1918 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1919 {
1920 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1921@@ -225,10 +240,12 @@
1922 }
1923
1924 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1925+ kernel_fpu_resched();
1926 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1927 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1928 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1929 }
1930+ camellia_fpu_end_rt(ctx);
1931
1932 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1933 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1934@@ -249,10 +266,12 @@
1935 }
1936
1937 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1938+ kernel_fpu_resched();
1939 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1940 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1941 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1942 }
1943+ camellia_fpu_end_rt(ctx);
1944
1945 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1946 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
1947diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c linux-4.14/arch/x86/crypto/cast5_avx_glue.c
1948--- linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c 2018-09-05 11:03:20.000000000 +0200
1949+++ linux-4.14/arch/x86/crypto/cast5_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
1950@@ -59,7 +59,7 @@
1951 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1952 bool enc)
1953 {
1954- bool fpu_enabled = false;
1955+ bool fpu_enabled;
1956 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1957 const unsigned int bsize = CAST5_BLOCK_SIZE;
1958 unsigned int nbytes;
1959@@ -73,7 +73,7 @@
1960 u8 *wsrc = walk->src.virt.addr;
1961 u8 *wdst = walk->dst.virt.addr;
1962
1963- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1964+ fpu_enabled = cast5_fpu_begin(false, nbytes);
1965
1966 /* Process multi-block batch */
1967 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1968@@ -102,10 +102,9 @@
1969 } while (nbytes >= bsize);
1970
1971 done:
1972+ cast5_fpu_end(fpu_enabled);
1973 err = blkcipher_walk_done(desc, walk, nbytes);
1974 }
1975-
1976- cast5_fpu_end(fpu_enabled);
1977 return err;
1978 }
1979
1980@@ -226,7 +225,7 @@
1981 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1982 struct scatterlist *src, unsigned int nbytes)
1983 {
1984- bool fpu_enabled = false;
1985+ bool fpu_enabled;
1986 struct blkcipher_walk walk;
1987 int err;
1988
1989@@ -235,12 +234,11 @@
1990 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1991
1992 while ((nbytes = walk.nbytes)) {
1993- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1994+ fpu_enabled = cast5_fpu_begin(false, nbytes);
1995 nbytes = __cbc_decrypt(desc, &walk);
1996+ cast5_fpu_end(fpu_enabled);
1997 err = blkcipher_walk_done(desc, &walk, nbytes);
1998 }
1999-
2000- cast5_fpu_end(fpu_enabled);
2001 return err;
2002 }
2003
2004@@ -309,7 +307,7 @@
2005 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2006 struct scatterlist *src, unsigned int nbytes)
2007 {
2008- bool fpu_enabled = false;
2009+ bool fpu_enabled;
2010 struct blkcipher_walk walk;
2011 int err;
2012
2013@@ -318,13 +316,12 @@
2014 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2015
2016 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2017- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2018+ fpu_enabled = cast5_fpu_begin(false, nbytes);
2019 nbytes = __ctr_crypt(desc, &walk);
2020+ cast5_fpu_end(fpu_enabled);
2021 err = blkcipher_walk_done(desc, &walk, nbytes);
2022 }
2023
2024- cast5_fpu_end(fpu_enabled);
2025-
2026 if (walk.nbytes) {
2027 ctr_crypt_final(desc, &walk);
2028 err = blkcipher_walk_done(desc, &walk, 0);
2029diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c linux-4.14/arch/x86/crypto/cast6_avx_glue.c
2030--- linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2031+++ linux-4.14/arch/x86/crypto/cast6_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2032@@ -205,19 +205,33 @@
2033 bool fpu_enabled;
2034 };
2035
2036+#ifdef CONFIG_PREEMPT_RT_FULL
2037+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
2038+{
2039+ bool fpu_enabled = ctx->fpu_enabled;
2040+
2041+ if (!fpu_enabled)
2042+ return;
2043+ cast6_fpu_end(fpu_enabled);
2044+ ctx->fpu_enabled = false;
2045+}
2046+
2047+#else
2048+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
2049+#endif
2050+
2051 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2052 {
2053 const unsigned int bsize = CAST6_BLOCK_SIZE;
2054 struct crypt_priv *ctx = priv;
2055 int i;
2056
2057- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2058-
2059 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2060+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2061 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2062+ cast6_fpu_end_rt(ctx);
2063 return;
2064 }
2065-
2066 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2067 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
2068 }
2069@@ -228,10 +242,10 @@
2070 struct crypt_priv *ctx = priv;
2071 int i;
2072
2073- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2074-
2075 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2076+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2077 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2078+ cast6_fpu_end_rt(ctx);
2079 return;
2080 }
2081
2082diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/chacha20_glue.c linux-4.14/arch/x86/crypto/chacha20_glue.c
2083--- linux-4.14.orig/arch/x86/crypto/chacha20_glue.c 2017-11-12 19:46:13.000000000 +0100
2084+++ linux-4.14/arch/x86/crypto/chacha20_glue.c 2018-09-05 11:05:07.000000000 +0200
2085@@ -81,23 +81,24 @@
2086
2087 crypto_chacha20_init(state, ctx, walk.iv);
2088
2089- kernel_fpu_begin();
2090-
2091 while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
2092+ kernel_fpu_begin();
2093+
2094 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2095 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
2096+ kernel_fpu_end();
2097 err = skcipher_walk_done(&walk,
2098 walk.nbytes % CHACHA20_BLOCK_SIZE);
2099 }
2100
2101 if (walk.nbytes) {
2102+ kernel_fpu_begin();
2103 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2104 walk.nbytes);
2105+ kernel_fpu_end();
2106 err = skcipher_walk_done(&walk, 0);
2107 }
2108
2109- kernel_fpu_end();
2110-
2111 return err;
2112 }
2113
2114diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/glue_helper.c linux-4.14/arch/x86/crypto/glue_helper.c
2115--- linux-4.14.orig/arch/x86/crypto/glue_helper.c 2017-11-12 19:46:13.000000000 +0100
2116+++ linux-4.14/arch/x86/crypto/glue_helper.c 2018-09-05 11:05:07.000000000 +0200
2117@@ -40,7 +40,7 @@
2118 void *ctx = crypto_blkcipher_ctx(desc->tfm);
2119 const unsigned int bsize = 128 / 8;
2120 unsigned int nbytes, i, func_bytes;
2121- bool fpu_enabled = false;
2122+ bool fpu_enabled;
2123 int err;
2124
2125 err = blkcipher_walk_virt(desc, walk);
2126@@ -50,7 +50,7 @@
2127 u8 *wdst = walk->dst.virt.addr;
2128
2129 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2130- desc, fpu_enabled, nbytes);
2131+ desc, false, nbytes);
2132
2133 for (i = 0; i < gctx->num_funcs; i++) {
2134 func_bytes = bsize * gctx->funcs[i].num_blocks;
2135@@ -72,10 +72,10 @@
2136 }
2137
2138 done:
2139+ glue_fpu_end(fpu_enabled);
2140 err = blkcipher_walk_done(desc, walk, nbytes);
2141 }
2142
2143- glue_fpu_end(fpu_enabled);
2144 return err;
2145 }
2146
2147@@ -192,7 +192,7 @@
2148 struct scatterlist *src, unsigned int nbytes)
2149 {
2150 const unsigned int bsize = 128 / 8;
2151- bool fpu_enabled = false;
2152+ bool fpu_enabled;
2153 struct blkcipher_walk walk;
2154 int err;
2155
2156@@ -201,12 +201,12 @@
2157
2158 while ((nbytes = walk.nbytes)) {
2159 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2160- desc, fpu_enabled, nbytes);
2161+ desc, false, nbytes);
2162 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2163+ glue_fpu_end(fpu_enabled);
2164 err = blkcipher_walk_done(desc, &walk, nbytes);
2165 }
2166
2167- glue_fpu_end(fpu_enabled);
2168 return err;
2169 }
2170 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2171@@ -275,7 +275,7 @@
2172 struct scatterlist *src, unsigned int nbytes)
2173 {
2174 const unsigned int bsize = 128 / 8;
2175- bool fpu_enabled = false;
2176+ bool fpu_enabled;
2177 struct blkcipher_walk walk;
2178 int err;
2179
2180@@ -284,13 +284,12 @@
2181
2182 while ((nbytes = walk.nbytes) >= bsize) {
2183 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2184- desc, fpu_enabled, nbytes);
2185+ desc, false, nbytes);
2186 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2187+ glue_fpu_end(fpu_enabled);
2188 err = blkcipher_walk_done(desc, &walk, nbytes);
2189 }
2190
2191- glue_fpu_end(fpu_enabled);
2192-
2193 if (walk.nbytes) {
2194 glue_ctr_crypt_final_128bit(
2195 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2196@@ -380,7 +379,7 @@
2197 void *tweak_ctx, void *crypt_ctx)
2198 {
2199 const unsigned int bsize = 128 / 8;
2200- bool fpu_enabled = false;
2201+ bool fpu_enabled;
2202 struct blkcipher_walk walk;
2203 int err;
2204
2205@@ -393,21 +392,21 @@
2206
2207 /* set minimum length to bsize, for tweak_fn */
2208 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2209- desc, fpu_enabled,
2210+ desc, false,
2211 nbytes < bsize ? bsize : nbytes);
2212-
2213 /* calculate first value of T */
2214 tweak_fn(tweak_ctx, walk.iv, walk.iv);
2215+ glue_fpu_end(fpu_enabled);
2216
2217 while (nbytes) {
2218+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2219+ desc, false, nbytes);
2220 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2221
2222+ glue_fpu_end(fpu_enabled);
2223 err = blkcipher_walk_done(desc, &walk, nbytes);
2224 nbytes = walk.nbytes;
2225 }
2226-
2227- glue_fpu_end(fpu_enabled);
2228-
2229 return err;
2230 }
2231 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2232diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c linux-4.14/arch/x86/crypto/serpent_avx2_glue.c
2233--- linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c 2017-11-12 19:46:13.000000000 +0100
2234+++ linux-4.14/arch/x86/crypto/serpent_avx2_glue.c 2018-09-05 11:05:07.000000000 +0200
2235@@ -184,6 +184,21 @@
2236 bool fpu_enabled;
2237 };
2238
2239+#ifdef CONFIG_PREEMPT_RT_FULL
2240+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2241+{
2242+ bool fpu_enabled = ctx->fpu_enabled;
2243+
2244+ if (!fpu_enabled)
2245+ return;
2246+ serpent_fpu_end(fpu_enabled);
2247+ ctx->fpu_enabled = false;
2248+}
2249+
2250+#else
2251+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2252+#endif
2253+
2254 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2255 {
2256 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2257@@ -199,10 +214,12 @@
2258 }
2259
2260 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2261+ kernel_fpu_resched();
2262 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2263 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2264 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2265 }
2266+ serpent_fpu_end_rt(ctx);
2267
2268 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2269 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
2270@@ -223,10 +240,12 @@
2271 }
2272
2273 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2274+ kernel_fpu_resched();
2275 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2276 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2277 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2278 }
2279+ serpent_fpu_end_rt(ctx);
2280
2281 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2282 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
2283diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c linux-4.14/arch/x86/crypto/serpent_avx_glue.c
2284--- linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2285+++ linux-4.14/arch/x86/crypto/serpent_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2286@@ -218,16 +218,31 @@
2287 bool fpu_enabled;
2288 };
2289
2290+#ifdef CONFIG_PREEMPT_RT_FULL
2291+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2292+{
2293+ bool fpu_enabled = ctx->fpu_enabled;
2294+
2295+ if (!fpu_enabled)
2296+ return;
2297+ serpent_fpu_end(fpu_enabled);
2298+ ctx->fpu_enabled = false;
2299+}
2300+
2301+#else
2302+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2303+#endif
2304+
2305 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2306 {
2307 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2308 struct crypt_priv *ctx = priv;
2309 int i;
2310
2311- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2312-
2313 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2314+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2315 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2316+ serpent_fpu_end_rt(ctx);
2317 return;
2318 }
2319
2320@@ -241,10 +256,10 @@
2321 struct crypt_priv *ctx = priv;
2322 int i;
2323
2324- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2325-
2326 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2327+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2328 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2329+ serpent_fpu_end_rt(ctx);
2330 return;
2331 }
2332
2333diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c linux-4.14/arch/x86/crypto/serpent_sse2_glue.c
2334--- linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c 2017-11-12 19:46:13.000000000 +0100
2335+++ linux-4.14/arch/x86/crypto/serpent_sse2_glue.c 2018-09-05 11:05:07.000000000 +0200
2336@@ -187,16 +187,31 @@
2337 bool fpu_enabled;
2338 };
2339
2340+#ifdef CONFIG_PREEMPT_RT_FULL
2341+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2342+{
2343+ bool fpu_enabled = ctx->fpu_enabled;
2344+
2345+ if (!fpu_enabled)
2346+ return;
2347+ serpent_fpu_end(fpu_enabled);
2348+ ctx->fpu_enabled = false;
2349+}
2350+
2351+#else
2352+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2353+#endif
2354+
2355 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2356 {
2357 const unsigned int bsize = SERPENT_BLOCK_SIZE;
2358 struct crypt_priv *ctx = priv;
2359 int i;
2360
2361- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2362-
2363 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2364+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2365 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
2366+ serpent_fpu_end_rt(ctx);
2367 return;
2368 }
2369
2370@@ -210,10 +225,10 @@
2371 struct crypt_priv *ctx = priv;
2372 int i;
2373
2374- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2375-
2376 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2377+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2378 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
2379+ serpent_fpu_end_rt(ctx);
2380 return;
2381 }
2382
2383diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c linux-4.14/arch/x86/crypto/twofish_avx_glue.c
2384--- linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c 2017-11-12 19:46:13.000000000 +0100
2385+++ linux-4.14/arch/x86/crypto/twofish_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2386@@ -218,6 +218,21 @@
2387 bool fpu_enabled;
2388 };
2389
2390+#ifdef CONFIG_PREEMPT_RT_FULL
2391+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
2392+{
2393+ bool fpu_enabled = ctx->fpu_enabled;
2394+
2395+ if (!fpu_enabled)
2396+ return;
2397+ twofish_fpu_end(fpu_enabled);
2398+ ctx->fpu_enabled = false;
2399+}
2400+
2401+#else
2402+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
2403+#endif
2404+
2405 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2406 {
2407 const unsigned int bsize = TF_BLOCK_SIZE;
2408@@ -228,12 +243,16 @@
2409
2410 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2411 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2412+ twofish_fpu_end_rt(ctx);
2413 return;
2414 }
2415
2416- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2417+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2418+ kernel_fpu_resched();
2419 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
2420+ }
2421
2422+ twofish_fpu_end_rt(ctx);
2423 nbytes %= bsize * 3;
2424
2425 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2426@@ -250,11 +269,15 @@
2427
2428 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2429 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2430+ twofish_fpu_end_rt(ctx);
2431 return;
2432 }
2433
2434- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2435+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2436+ kernel_fpu_resched();
2437 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
2438+ }
2439+ twofish_fpu_end_rt(ctx);
2440
2441 nbytes %= bsize * 3;
2442
2443diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/common.c linux-4.14/arch/x86/entry/common.c
2444--- linux-4.14.orig/arch/x86/entry/common.c 2018-09-05 11:03:20.000000000 +0200
2445+++ linux-4.14/arch/x86/entry/common.c 2018-09-05 11:05:07.000000000 +0200
2446@@ -133,7 +133,7 @@
2447
2448 #define EXIT_TO_USERMODE_LOOP_FLAGS \
2449 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
2450- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2451+ _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2452
2453 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2454 {
2455@@ -148,9 +148,16 @@
2456 /* We have work to do. */
2457 local_irq_enable();
2458
2459- if (cached_flags & _TIF_NEED_RESCHED)
2460+ if (cached_flags & _TIF_NEED_RESCHED_MASK)
2461 schedule();
2462
2463+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2464+ if (unlikely(current->forced_info.si_signo)) {
2465+ struct task_struct *t = current;
2466+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2467+ t->forced_info.si_signo = 0;
2468+ }
2469+#endif
2470 if (cached_flags & _TIF_UPROBE)
2471 uprobe_notify_resume(regs);
2472
2473diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_32.S linux-4.14/arch/x86/entry/entry_32.S
2474--- linux-4.14.orig/arch/x86/entry/entry_32.S 2018-09-05 11:03:20.000000000 +0200
2475+++ linux-4.14/arch/x86/entry/entry_32.S 2018-09-05 11:05:07.000000000 +0200
2476@@ -350,8 +350,25 @@
2477 ENTRY(resume_kernel)
2478 DISABLE_INTERRUPTS(CLBR_ANY)
e4b2b4a8 2479 .Lneed_resched:
1a6e0f06
JK
2480+ # preempt count == 0 + NEED_RS set?
2481 cmpl $0, PER_CPU_VAR(__preempt_count)
2482+#ifndef CONFIG_PREEMPT_LAZY
2483 jnz restore_all
2484+#else
2485+ jz test_int_off
2486+
2487+ # atleast preempt count == 0 ?
2488+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2489+ jne restore_all
2490+
2491+ movl PER_CPU_VAR(current_task), %ebp
2492+ cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
2493+ jnz restore_all
2494+
2495+ testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2496+ jz restore_all
2497+test_int_off:
2498+#endif
2499 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2500 jz restore_all
2501 call preempt_schedule_irq
2502diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_64.S linux-4.14/arch/x86/entry/entry_64.S
2503--- linux-4.14.orig/arch/x86/entry/entry_64.S 2018-09-05 11:03:20.000000000 +0200
2504+++ linux-4.14/arch/x86/entry/entry_64.S 2018-09-05 11:05:07.000000000 +0200
2505@@ -633,7 +633,23 @@
2506 bt $9, EFLAGS(%rsp) /* were interrupts off? */
2507 jnc 1f
2508 0: cmpl $0, PER_CPU_VAR(__preempt_count)
2509+#ifndef CONFIG_PREEMPT_LAZY
2510+ jnz 1f
2511+#else
2512+ jz do_preempt_schedule_irq
2513+
2514+ # atleast preempt count == 0 ?
2515+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2516+ jnz 1f
2517+
2518+ movq PER_CPU_VAR(current_task), %rcx
2519+ cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
2520 jnz 1f
2521+
2522+ bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2523+ jnc 1f
2524+do_preempt_schedule_irq:
2525+#endif
2526 call preempt_schedule_irq
2527 jmp 0b
2528 1:
2529@@ -988,6 +1004,7 @@
2530 jmp 2b
2531 .previous
2532
2533+#ifndef CONFIG_PREEMPT_RT_FULL
2534 /* Call softirq on interrupt stack. Interrupts are off. */
2535 ENTRY(do_softirq_own_stack)
2536 pushq %rbp
2537@@ -998,6 +1015,7 @@
2538 leaveq
2539 ret
2540 ENDPROC(do_softirq_own_stack)
2541+#endif
2542
2543 #ifdef CONFIG_XEN
2544 idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2545diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/fpu/api.h linux-4.14/arch/x86/include/asm/fpu/api.h
2546--- linux-4.14.orig/arch/x86/include/asm/fpu/api.h 2017-11-12 19:46:13.000000000 +0100
2547+++ linux-4.14/arch/x86/include/asm/fpu/api.h 2018-09-05 11:05:07.000000000 +0200
2548@@ -25,6 +25,7 @@
2549 extern void __kernel_fpu_end(void);
2550 extern void kernel_fpu_begin(void);
2551 extern void kernel_fpu_end(void);
2552+extern void kernel_fpu_resched(void);
2553 extern bool irq_fpu_usable(void);
2554
2555 /*
2556diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/preempt.h linux-4.14/arch/x86/include/asm/preempt.h
2557--- linux-4.14.orig/arch/x86/include/asm/preempt.h 2017-11-12 19:46:13.000000000 +0100
2558+++ linux-4.14/arch/x86/include/asm/preempt.h 2018-09-05 11:05:07.000000000 +0200
2559@@ -86,17 +86,46 @@
2560 * a decrement which hits zero means we have no preempt_count and should
2561 * reschedule.
2562 */
2563-static __always_inline bool __preempt_count_dec_and_test(void)
2564+static __always_inline bool ____preempt_count_dec_and_test(void)
2565 {
2566 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2567 }
2568
2569+static __always_inline bool __preempt_count_dec_and_test(void)
2570+{
2571+ if (____preempt_count_dec_and_test())
2572+ return true;
2573+#ifdef CONFIG_PREEMPT_LAZY
2574+ if (current_thread_info()->preempt_lazy_count)
2575+ return false;
2576+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2577+#else
2578+ return false;
2579+#endif
2580+}
2581+
2582 /*
2583 * Returns true when we need to resched and can (barring IRQ state).
2584 */
2585 static __always_inline bool should_resched(int preempt_offset)
2586 {
2587+#ifdef CONFIG_PREEMPT_LAZY
2588+ u32 tmp;
2589+
2590+ tmp = raw_cpu_read_4(__preempt_count);
2591+ if (tmp == preempt_offset)
2592+ return true;
2593+
2594+ /* preempt count == 0 ? */
2595+ tmp &= ~PREEMPT_NEED_RESCHED;
2596+ if (tmp)
2597+ return false;
2598+ if (current_thread_info()->preempt_lazy_count)
2599+ return false;
2600+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2601+#else
2602 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2603+#endif
2604 }
2605
2606 #ifdef CONFIG_PREEMPT
2607diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/signal.h linux-4.14/arch/x86/include/asm/signal.h
2608--- linux-4.14.orig/arch/x86/include/asm/signal.h 2017-11-12 19:46:13.000000000 +0100
2609+++ linux-4.14/arch/x86/include/asm/signal.h 2018-09-05 11:05:07.000000000 +0200
2610@@ -28,6 +28,19 @@
2611 #define SA_IA32_ABI 0x02000000u
2612 #define SA_X32_ABI 0x01000000u
2613
2614+/*
2615+ * Because some traps use the IST stack, we must keep preemption
2616+ * disabled while calling do_trap(), but do_trap() may call
2617+ * force_sig_info() which will grab the signal spin_locks for the
2618+ * task, which in PREEMPT_RT_FULL are mutexes. By defining
2619+ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2620+ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2621+ * trap.
2622+ */
2623+#if defined(CONFIG_PREEMPT_RT_FULL)
2624+#define ARCH_RT_DELAYS_SIGNAL_SEND
2625+#endif
2626+
2627 #ifndef CONFIG_COMPAT
2628 typedef sigset_t compat_sigset_t;
2629 #endif
2630diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/stackprotector.h linux-4.14/arch/x86/include/asm/stackprotector.h
2631--- linux-4.14.orig/arch/x86/include/asm/stackprotector.h 2017-11-12 19:46:13.000000000 +0100
2632+++ linux-4.14/arch/x86/include/asm/stackprotector.h 2018-09-05 11:05:07.000000000 +0200
2633@@ -60,7 +60,7 @@
2634 */
2635 static __always_inline void boot_init_stack_canary(void)
2636 {
2637- u64 canary;
2638+ u64 uninitialized_var(canary);
2639 u64 tsc;
2640
2641 #ifdef CONFIG_X86_64
2642@@ -71,8 +71,14 @@
2643 * of randomness. The TSC only matters for very early init,
2644 * there it already has some randomness on most systems. Later
2645 * on during the bootup the random pool has true entropy too.
2646+ * For preempt-rt we need to weaken the randomness a bit, as
2647+ * we can't call into the random generator from atomic context
2648+ * due to locking constraints. We just leave canary
2649+ * uninitialized and use the TSC based randomness on top of it.
2650 */
2651+#ifndef CONFIG_PREEMPT_RT_FULL
2652 get_random_bytes(&canary, sizeof(canary));
2653+#endif
2654 tsc = rdtsc();
2655 canary += tsc + (tsc << 32UL);
2656 canary &= CANARY_MASK;
2657diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/thread_info.h linux-4.14/arch/x86/include/asm/thread_info.h
2658--- linux-4.14.orig/arch/x86/include/asm/thread_info.h 2018-09-05 11:03:20.000000000 +0200
2659+++ linux-4.14/arch/x86/include/asm/thread_info.h 2018-09-05 11:05:07.000000000 +0200
2660@@ -56,11 +56,14 @@
2661 struct thread_info {
2662 unsigned long flags; /* low level flags */
2663 u32 status; /* thread synchronous flags */
2664+ int preempt_lazy_count; /* 0 => lazy preemptable
2665+ <0 => BUG */
2666 };
2667
2668 #define INIT_THREAD_INFO(tsk) \
c7c16703
JK
2669 { \
2670 .flags = 0, \
2671+ .preempt_lazy_count = 0, \
2672 }
2673
2674 #define init_stack (init_thread_union.stack)
2675@@ -69,6 +72,10 @@
2676
2677 #include <asm/asm-offsets.h>
2678
2679+#define GET_THREAD_INFO(reg) \
2680+ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2681+ _ASM_SUB $(THREAD_SIZE),reg ;
2682+
2683 #endif
2684
2685 /*
2686@@ -85,6 +92,7 @@
2687 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
2688 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
2689 #define TIF_SECCOMP 8 /* secure computing */
2690+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
2691 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
2692 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
2693 #define TIF_PATCH_PENDING 13 /* pending live patching update */
2694@@ -112,6 +120,7 @@
2695 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
2696 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
2697 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
2698+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2699 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
2700 #define _TIF_UPROBE (1 << TIF_UPROBE)
2701 #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
2702@@ -153,6 +162,8 @@
2703 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2704 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2705
2706+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2707+
2708 #define STACK_WARN (THREAD_SIZE/8)
2709
2710 /*
e4b2b4a8
JK
2711diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h linux-4.14/arch/x86/include/asm/uv/uv_bau.h
2712--- linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h 2017-11-12 19:46:13.000000000 +0100
2713+++ linux-4.14/arch/x86/include/asm/uv/uv_bau.h 2018-09-05 11:05:07.000000000 +0200
2714@@ -643,9 +643,9 @@
1a6e0f06
JK
2715 cycles_t send_message;
2716 cycles_t period_end;
2717 cycles_t period_time;
2718- spinlock_t uvhub_lock;
2719- spinlock_t queue_lock;
2720- spinlock_t disable_lock;
2721+ raw_spinlock_t uvhub_lock;
2722+ raw_spinlock_t queue_lock;
2723+ raw_spinlock_t disable_lock;
2724 /* tunables */
2725 int max_concurr;
2726 int max_concurr_const;
e4b2b4a8 2727@@ -847,15 +847,15 @@
1a6e0f06
JK
2728 * to be lowered below the current 'v'. atomic_add_unless can only stop
2729 * on equal.
2730 */
2731-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2732+static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2733 {
2734- spin_lock(lock);
2735+ raw_spin_lock(lock);
2736 if (atomic_read(v) >= u) {
2737- spin_unlock(lock);
2738+ raw_spin_unlock(lock);
2739 return 0;
2740 }
2741 atomic_inc(v);
2742- spin_unlock(lock);
2743+ raw_spin_unlock(lock);
2744 return 1;
2745 }
2746
e4b2b4a8
JK
2747diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/Kconfig linux-4.14/arch/x86/Kconfig
2748--- linux-4.14.orig/arch/x86/Kconfig 2018-09-05 11:03:20.000000000 +0200
2749+++ linux-4.14/arch/x86/Kconfig 2018-09-05 11:05:07.000000000 +0200
2750@@ -169,6 +169,7 @@
2751 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
2752 select HAVE_PERF_REGS
2753 select HAVE_PERF_USER_STACK_DUMP
2754+ select HAVE_PREEMPT_LAZY
2755 select HAVE_RCU_TABLE_FREE
2756 select HAVE_REGS_AND_STACK_ACCESS_API
2757 select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
2758@@ -256,8 +257,11 @@
2759 def_bool y
2760 depends on ISA_DMA_API
2761
2762+config RWSEM_GENERIC_SPINLOCK
2763+ def_bool PREEMPT_RT_FULL
2764+
2765 config RWSEM_XCHGADD_ALGORITHM
2766- def_bool y
2767+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2768
2769 config GENERIC_CALIBRATE_DELAY
2770 def_bool y
2771@@ -932,7 +936,7 @@
2772 config MAXSMP
2773 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2774 depends on X86_64 && SMP && DEBUG_KERNEL
2775- select CPUMASK_OFFSTACK
2776+ select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2777 ---help---
2778 Enable maximum number of CPUS and NUMA Nodes for this architecture.
2779 If unsure, say N.
2780diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/apic/io_apic.c linux-4.14/arch/x86/kernel/apic/io_apic.c
2781--- linux-4.14.orig/arch/x86/kernel/apic/io_apic.c 2018-09-05 11:03:20.000000000 +0200
2782+++ linux-4.14/arch/x86/kernel/apic/io_apic.c 2018-09-05 11:05:07.000000000 +0200
2783@@ -1691,7 +1691,8 @@
2784 static inline bool ioapic_irqd_mask(struct irq_data *data)
2785 {
2786 /* If we are moving the irq we need to mask it */
2787- if (unlikely(irqd_is_setaffinity_pending(data))) {
2788+ if (unlikely(irqd_is_setaffinity_pending(data) &&
2789+ !irqd_irq_inprogress(data))) {
2790 mask_ioapic_irq(data);
2791 return true;
2792 }
e4b2b4a8
JK
2793diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/asm-offsets.c linux-4.14/arch/x86/kernel/asm-offsets.c
2794--- linux-4.14.orig/arch/x86/kernel/asm-offsets.c 2018-09-05 11:03:20.000000000 +0200
2795+++ linux-4.14/arch/x86/kernel/asm-offsets.c 2018-09-05 11:05:07.000000000 +0200
2796@@ -38,6 +38,7 @@
2797
2798 BLANK();
2799 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2800+ OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2801 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2802
2803 BLANK();
2804@@ -94,6 +95,7 @@
2805
2806 BLANK();
2807 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2808+ DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2809
2810 /* TLB state for the entry code */
2811 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
2812diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
2813--- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c 2017-11-12 19:46:13.000000000 +0100
2814+++ linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c 2018-09-05 11:05:07.000000000 +0200
2815@@ -14,6 +14,7 @@
2816 #include <linux/slab.h>
2817 #include <linux/kmod.h>
2818 #include <linux/poll.h>
2819+#include <linux/swork.h>
2820
2821 #include "mce-internal.h"
2822
2823@@ -86,13 +87,43 @@
2824
2825 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2826
2827-
2828-void mce_work_trigger(void)
2829+static void __mce_work_trigger(struct swork_event *event)
2830 {
2831 if (mce_helper[0])
2832 schedule_work(&mce_trigger_work);
2833 }
2834
2835+#ifdef CONFIG_PREEMPT_RT_FULL
2836+static bool notify_work_ready __read_mostly;
2837+static struct swork_event notify_work;
2838+
2839+static int mce_notify_work_init(void)
2840+{
2841+ int err;
2842+
2843+ err = swork_get();
2844+ if (err)
2845+ return err;
2846+
2847+ INIT_SWORK(&notify_work, __mce_work_trigger);
2848+ notify_work_ready = true;
2849+ return 0;
2850+}
2851+
2852+void mce_work_trigger(void)
2853+{
2854+ if (notify_work_ready)
2855+ swork_queue(&notify_work);
2856+}
2857+
2858+#else
2859+void mce_work_trigger(void)
2860+{
2861+ __mce_work_trigger(NULL);
2862+}
2863+static inline int mce_notify_work_init(void) { return 0; }
2864+#endif
2865+
2866 static ssize_t
2867 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2868 {
2869@@ -356,7 +387,7 @@
2870
2871 return err;
2872 }
2873-
2874+ mce_notify_work_init();
2875 mce_register_decode_chain(&dev_mcelog_nb);
2876 return 0;
2877 }
2878diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c
2879--- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c 2018-09-05 11:03:20.000000000 +0200
2880+++ linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c 2018-09-05 11:05:07.000000000 +0200
2881@@ -42,6 +42,7 @@
2882 #include <linux/debugfs.h>
2883 #include <linux/irq_work.h>
2884 #include <linux/export.h>
2885+#include <linux/jiffies.h>
2886 #include <linux/jump_label.h>
2887
2888 #include <asm/intel-family.h>
2889@@ -1365,7 +1366,7 @@
2890 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2891
2892 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2893-static DEFINE_PER_CPU(struct timer_list, mce_timer);
2894+static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2895
2896 static unsigned long mce_adjust_timer_default(unsigned long interval)
2897 {
2898@@ -1374,27 +1375,19 @@
2899
2900 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2901
2902-static void __start_timer(struct timer_list *t, unsigned long interval)
2903+static void __start_timer(struct hrtimer *t, unsigned long iv)
2904 {
2905- unsigned long when = jiffies + interval;
2906- unsigned long flags;
2907-
2908- local_irq_save(flags);
2909-
2910- if (!timer_pending(t) || time_before(when, t->expires))
2911- mod_timer(t, round_jiffies(when));
2912+ if (!iv)
2913+ return;
2914
2915- local_irq_restore(flags);
2916+ hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2917+ 0, HRTIMER_MODE_REL_PINNED);
2918 }
2919
2920-static void mce_timer_fn(unsigned long data)
2921+static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2922 {
2923- struct timer_list *t = this_cpu_ptr(&mce_timer);
2924- int cpu = smp_processor_id();
2925 unsigned long iv;
2926
2927- WARN_ON(cpu != data);
2928-
2929 iv = __this_cpu_read(mce_next_interval);
2930
2931 if (mce_available(this_cpu_ptr(&cpu_info))) {
2932@@ -1417,7 +1410,11 @@
2933
2934 done:
2935 __this_cpu_write(mce_next_interval, iv);
2936- __start_timer(t, iv);
2937+ if (!iv)
2938+ return HRTIMER_NORESTART;
2939+
2940+ hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
2941+ return HRTIMER_RESTART;
2942 }
2943
2944 /*
2945@@ -1425,7 +1422,7 @@
2946 */
2947 void mce_timer_kick(unsigned long interval)
2948 {
2949- struct timer_list *t = this_cpu_ptr(&mce_timer);
2950+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
2951 unsigned long iv = __this_cpu_read(mce_next_interval);
2952
2953 __start_timer(t, interval);
2954@@ -1440,7 +1437,7 @@
2955 int cpu;
2956
2957 for_each_online_cpu(cpu)
2958- del_timer_sync(&per_cpu(mce_timer, cpu));
2959+ hrtimer_cancel(&per_cpu(mce_timer, cpu));
2960 }
2961
2962 /*
2963@@ -1769,7 +1766,7 @@
2964 }
2965 }
2966
2967-static void mce_start_timer(struct timer_list *t)
2968+static void mce_start_timer(struct hrtimer *t)
2969 {
2970 unsigned long iv = check_interval * HZ;
2971
2972@@ -1782,18 +1779,19 @@
2973
2974 static void __mcheck_cpu_setup_timer(void)
1a6e0f06
JK
2975 {
2976- struct timer_list *t = this_cpu_ptr(&mce_timer);
2977- unsigned int cpu = smp_processor_id();
2978+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
2979
2980- setup_pinned_timer(t, mce_timer_fn, cpu);
2981+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2982+ t->function = mce_timer_fn;
2983 }
2984
2985 static void __mcheck_cpu_init_timer(void)
2986 {
2987- struct timer_list *t = this_cpu_ptr(&mce_timer);
2988- unsigned int cpu = smp_processor_id();
2989+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
2990+
2991+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2992+ t->function = mce_timer_fn;
2993
2994- setup_pinned_timer(t, mce_timer_fn, cpu);
2995 mce_start_timer(t);
2996 }
2997
2998@@ -2309,7 +2307,7 @@
2999
3000 static int mce_cpu_online(unsigned int cpu)
3001 {
3002- struct timer_list *t = this_cpu_ptr(&mce_timer);
3003+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
3004 int ret;
3005
3006 mce_device_create(cpu);
3007@@ -2326,10 +2324,10 @@
3008
3009 static int mce_cpu_pre_down(unsigned int cpu)
3010 {
3011- struct timer_list *t = this_cpu_ptr(&mce_timer);
3012+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
3013
3014 mce_disable_cpu();
3015- del_timer_sync(t);
3016+ hrtimer_cancel(t);
3017 mce_threshold_remove_device(cpu);
3018 mce_device_remove(cpu);
3019 return 0;
3020diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/fpu/core.c linux-4.14/arch/x86/kernel/fpu/core.c
3021--- linux-4.14.orig/arch/x86/kernel/fpu/core.c 2018-09-05 11:03:20.000000000 +0200
3022+++ linux-4.14/arch/x86/kernel/fpu/core.c 2018-09-05 11:05:07.000000000 +0200
3023@@ -138,6 +138,18 @@
3024 }
3025 EXPORT_SYMBOL_GPL(kernel_fpu_end);
3026
3027+void kernel_fpu_resched(void)
3028+{
3029+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
3030+
3031+ if (should_resched(PREEMPT_OFFSET)) {
3032+ kernel_fpu_end();
3033+ cond_resched();
3034+ kernel_fpu_begin();
3035+ }
3036+}
3037+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
3038+
3039 /*
3040 * Save the FPU state (mark it for reload if necessary):
3041 *
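Note on kernel_fpu_resched(), added above: it lets a long-running kernel-FPU section drop the FPU context and reschedule when a reschedule is due, instead of keeping preemption off for the whole run. A rough usage sketch, assuming a hypothetical SIMD block loop (process_one_block() and BLOCK_SIZE are not real symbols):

        #include <linux/types.h>
        #include <asm/fpu/api.h>

        static void crunch_blocks(const u8 *src, u8 *dst, int nblocks)
        {
                kernel_fpu_begin();
                while (nblocks--) {
                        process_one_block(src, dst);    /* hypothetical SIMD helper */
                        src += BLOCK_SIZE;
                        dst += BLOCK_SIZE;
                        /* ends and re-enters the FPU section only if a reschedule is pending */
                        kernel_fpu_resched();
                }
                kernel_fpu_end();
        }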
3042diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/irq_32.c linux-4.14/arch/x86/kernel/irq_32.c
3043--- linux-4.14.orig/arch/x86/kernel/irq_32.c 2018-09-05 11:03:20.000000000 +0200
3044+++ linux-4.14/arch/x86/kernel/irq_32.c 2018-09-05 11:05:07.000000000 +0200
3045@@ -130,6 +130,7 @@
1a6e0f06
JK
3046 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
3047 }
3048
3049+#ifndef CONFIG_PREEMPT_RT_FULL
3050 void do_softirq_own_stack(void)
3051 {
3052 struct irq_stack *irqstk;
e4b2b4a8 3053@@ -146,6 +147,7 @@
1a6e0f06
JK
3054
3055 call_on_stack(__do_softirq, isp);
3056 }
3057+#endif
3058
3059 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3060 {
e4b2b4a8
JK
3061diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/process_32.c linux-4.14/arch/x86/kernel/process_32.c
3062--- linux-4.14.orig/arch/x86/kernel/process_32.c 2018-09-05 11:03:20.000000000 +0200
3063+++ linux-4.14/arch/x86/kernel/process_32.c 2018-09-05 11:05:07.000000000 +0200
3064@@ -38,6 +38,7 @@
1a6e0f06
JK
3065 #include <linux/io.h>
3066 #include <linux/kdebug.h>
e4b2b4a8 3067 #include <linux/syscalls.h>
1a6e0f06
JK
3068+#include <linux/highmem.h>
3069
3070 #include <asm/pgtable.h>
3071 #include <asm/ldt.h>
e4b2b4a8 3072@@ -198,6 +199,35 @@
1a6e0f06
JK
3073 }
3074 EXPORT_SYMBOL_GPL(start_thread);
3075
3076+#ifdef CONFIG_PREEMPT_RT_FULL
3077+static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3078+{
3079+ int i;
3080+
3081+ /*
3082+ * Clear @prev's kmap_atomic mappings
3083+ */
3084+ for (i = 0; i < prev_p->kmap_idx; i++) {
3085+ int idx = i + KM_TYPE_NR * smp_processor_id();
3086+ pte_t *ptep = kmap_pte - idx;
3087+
3088+ kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3089+ }
3090+ /*
3091+ * Restore @next_p's kmap_atomic mappings
3092+ */
3093+ for (i = 0; i < next_p->kmap_idx; i++) {
3094+ int idx = i + KM_TYPE_NR * smp_processor_id();
3095+
3096+ if (!pte_none(next_p->kmap_pte[i]))
3097+ set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3098+ }
3099+}
3100+#else
3101+static inline void
3102+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3103+#endif
3104+
3105
3106 /*
3107 * switch_to(x,y) should switch tasks from x to y.
e4b2b4a8 3108@@ -273,6 +303,8 @@
1a6e0f06
JK
3109 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3110 __switch_to_xtra(prev_p, next_p, tss);
3111
3112+ switch_kmaps(prev_p, next_p);
3113+
3114 /*
3115 * Leave lazy mode, flushing any hypercalls made here.
3116 * This must be done before restoring TLS segments so
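Note on switch_kmaps() above: with CONFIG_PREEMPT_RT_FULL, kmap_atomic() no longer disables preemption, so each task's active fixmap ptes are also recorded in current->kmap_pte (see the highmem_32.c and iomap_32.c hunks below) and replayed here on the next context switch. A small illustration of the usage this enables (the function is illustrative, not from the patch):

        #include <linux/highmem.h>
        #include <linux/string.h>

        static void copy_from_highpage(struct page *page, void *dst, size_t len)
        {
                char *src = kmap_atomic(page);

                /* On PREEMPT_RT_FULL the task may be preempted here; switch_kmaps()
                 * restores the fixmap slot when the task runs again, so @src stays
                 * usable until kunmap_atomic(). */
                memcpy(dst, src, len);
                kunmap_atomic(src);
        }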
e4b2b4a8
JK
3117diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/lapic.c linux-4.14/arch/x86/kvm/lapic.c
3118--- linux-4.14.orig/arch/x86/kvm/lapic.c 2018-09-05 11:03:20.000000000 +0200
3119+++ linux-4.14/arch/x86/kvm/lapic.c 2018-09-05 11:05:07.000000000 +0200
3120@@ -2120,7 +2120,7 @@
3121 apic->vcpu = vcpu;
3122
1a6e0f06 3123 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
e4b2b4a8
JK
3124- HRTIMER_MODE_ABS_PINNED);
3125+ HRTIMER_MODE_ABS_PINNED_HARD);
1a6e0f06 3126 apic->lapic_timer.timer.function = apic_timer_fn;
1a6e0f06
JK
3127
3128 /*
e4b2b4a8
JK
3129diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/x86.c linux-4.14/arch/x86/kvm/x86.c
3130--- linux-4.14.orig/arch/x86/kvm/x86.c 2018-09-05 11:03:20.000000000 +0200
3131+++ linux-4.14/arch/x86/kvm/x86.c 2018-09-05 11:05:07.000000000 +0200
3132@@ -6285,6 +6285,13 @@
1a6e0f06
JK
3133 goto out;
3134 }
3135
3136+#ifdef CONFIG_PREEMPT_RT_FULL
3137+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3138+ printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3139+ return -EOPNOTSUPP;
3140+ }
3141+#endif
3142+
3143 r = kvm_mmu_module_init();
3144 if (r)
3145 goto out_free_percpu;
e4b2b4a8
JK
3146diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/highmem_32.c linux-4.14/arch/x86/mm/highmem_32.c
3147--- linux-4.14.orig/arch/x86/mm/highmem_32.c 2017-11-12 19:46:13.000000000 +0100
3148+++ linux-4.14/arch/x86/mm/highmem_32.c 2018-09-05 11:05:07.000000000 +0200
3149@@ -32,10 +32,11 @@
1a6e0f06
JK
3150 */
3151 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3152 {
3153+ pte_t pte = mk_pte(page, prot);
3154 unsigned long vaddr;
3155 int idx, type;
3156
3157- preempt_disable();
3158+ preempt_disable_nort();
3159 pagefault_disable();
3160
3161 if (!PageHighMem(page))
e4b2b4a8 3162@@ -45,7 +46,10 @@
1a6e0f06
JK
3163 idx = type + KM_TYPE_NR*smp_processor_id();
3164 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3165 BUG_ON(!pte_none(*(kmap_pte-idx)));
3166- set_pte(kmap_pte-idx, mk_pte(page, prot));
3167+#ifdef CONFIG_PREEMPT_RT_FULL
3168+ current->kmap_pte[type] = pte;
3169+#endif
3170+ set_pte(kmap_pte-idx, pte);
3171 arch_flush_lazy_mmu_mode();
3172
3173 return (void *)vaddr;
e4b2b4a8 3174@@ -88,6 +92,9 @@
1a6e0f06
JK
3175 * is a bad idea also, in case the page changes cacheability
3176 * attributes or becomes a protected page in a hypervisor.
3177 */
3178+#ifdef CONFIG_PREEMPT_RT_FULL
3179+ current->kmap_pte[type] = __pte(0);
3180+#endif
3181 kpte_clear_flush(kmap_pte-idx, vaddr);
3182 kmap_atomic_idx_pop();
3183 arch_flush_lazy_mmu_mode();
e4b2b4a8 3184@@ -100,7 +107,7 @@
1a6e0f06
JK
3185 #endif
3186
3187 pagefault_enable();
3188- preempt_enable();
3189+ preempt_enable_nort();
3190 }
3191 EXPORT_SYMBOL(__kunmap_atomic);
3192
e4b2b4a8
JK
3193diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/iomap_32.c linux-4.14/arch/x86/mm/iomap_32.c
3194--- linux-4.14.orig/arch/x86/mm/iomap_32.c 2017-11-12 19:46:13.000000000 +0100
3195+++ linux-4.14/arch/x86/mm/iomap_32.c 2018-09-05 11:05:07.000000000 +0200
3196@@ -56,6 +56,7 @@
1a6e0f06
JK
3197
3198 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3199 {
3200+ pte_t pte = pfn_pte(pfn, prot);
3201 unsigned long vaddr;
3202 int idx, type;
3203
e4b2b4a8 3204@@ -65,7 +66,12 @@
1a6e0f06
JK
3205 type = kmap_atomic_idx_push();
3206 idx = type + KM_TYPE_NR * smp_processor_id();
3207 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3208- set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3209+ WARN_ON(!pte_none(*(kmap_pte - idx)));
3210+
3211+#ifdef CONFIG_PREEMPT_RT_FULL
3212+ current->kmap_pte[type] = pte;
3213+#endif
3214+ set_pte(kmap_pte - idx, pte);
3215 arch_flush_lazy_mmu_mode();
3216
3217 return (void *)vaddr;
e4b2b4a8 3218@@ -113,6 +119,9 @@
1a6e0f06
JK
3219 * is a bad idea also, in case the page changes cacheability
3220 * attributes or becomes a protected page in a hypervisor.
3221 */
3222+#ifdef CONFIG_PREEMPT_RT_FULL
3223+ current->kmap_pte[type] = __pte(0);
3224+#endif
3225 kpte_clear_flush(kmap_pte-idx, vaddr);
3226 kmap_atomic_idx_pop();
3227 }
e4b2b4a8
JK
3228diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c linux-4.14/arch/x86/platform/uv/tlb_uv.c
3229--- linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c 2018-09-05 11:03:20.000000000 +0200
3230+++ linux-4.14/arch/x86/platform/uv/tlb_uv.c 2018-09-05 11:05:07.000000000 +0200
3231@@ -740,9 +740,9 @@
1a6e0f06
JK
3232
3233 quiesce_local_uvhub(hmaster);
3234
3235- spin_lock(&hmaster->queue_lock);
3236+ raw_spin_lock(&hmaster->queue_lock);
3237 reset_with_ipi(&bau_desc->distribution, bcp);
3238- spin_unlock(&hmaster->queue_lock);
3239+ raw_spin_unlock(&hmaster->queue_lock);
3240
3241 end_uvhub_quiesce(hmaster);
3242
e4b2b4a8 3243@@ -762,9 +762,9 @@
1a6e0f06
JK
3244
3245 quiesce_local_uvhub(hmaster);
3246
3247- spin_lock(&hmaster->queue_lock);
3248+ raw_spin_lock(&hmaster->queue_lock);
3249 reset_with_ipi(&bau_desc->distribution, bcp);
3250- spin_unlock(&hmaster->queue_lock);
3251+ raw_spin_unlock(&hmaster->queue_lock);
3252
3253 end_uvhub_quiesce(hmaster);
3254
e4b2b4a8 3255@@ -785,7 +785,7 @@
1a6e0f06
JK
3256 cycles_t tm1;
3257
3258 hmaster = bcp->uvhub_master;
3259- spin_lock(&hmaster->disable_lock);
3260+ raw_spin_lock(&hmaster->disable_lock);
3261 if (!bcp->baudisabled) {
3262 stat->s_bau_disabled++;
3263 tm1 = get_cycles();
e4b2b4a8 3264@@ -798,7 +798,7 @@
1a6e0f06
JK
3265 }
3266 }
3267 }
3268- spin_unlock(&hmaster->disable_lock);
3269+ raw_spin_unlock(&hmaster->disable_lock);
3270 }
3271
3272 static void count_max_concurr(int stat, struct bau_control *bcp,
e4b2b4a8 3273@@ -861,7 +861,7 @@
1a6e0f06
JK
3274 */
3275 static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3276 {
3277- spinlock_t *lock = &hmaster->uvhub_lock;
3278+ raw_spinlock_t *lock = &hmaster->uvhub_lock;
3279 atomic_t *v;
3280
3281 v = &hmaster->active_descriptor_count;
e4b2b4a8 3282@@ -995,7 +995,7 @@
1a6e0f06
JK
3283 struct bau_control *hmaster;
3284
3285 hmaster = bcp->uvhub_master;
3286- spin_lock(&hmaster->disable_lock);
3287+ raw_spin_lock(&hmaster->disable_lock);
3288 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3289 stat->s_bau_reenabled++;
3290 for_each_present_cpu(tcpu) {
e4b2b4a8 3291@@ -1007,10 +1007,10 @@
1a6e0f06
JK
3292 tbcp->period_giveups = 0;
3293 }
3294 }
3295- spin_unlock(&hmaster->disable_lock);
3296+ raw_spin_unlock(&hmaster->disable_lock);
3297 return 0;
3298 }
3299- spin_unlock(&hmaster->disable_lock);
3300+ raw_spin_unlock(&hmaster->disable_lock);
3301 return -1;
3302 }
3303
e4b2b4a8 3304@@ -1942,9 +1942,9 @@
1a6e0f06 3305 bcp->cong_reps = congested_reps;
c7c16703
JK
3306 bcp->disabled_period = sec_2_cycles(disabled_period);
3307 bcp->giveup_limit = giveup_limit;
1a6e0f06
JK
3308- spin_lock_init(&bcp->queue_lock);
3309- spin_lock_init(&bcp->uvhub_lock);
3310- spin_lock_init(&bcp->disable_lock);
3311+ raw_spin_lock_init(&bcp->queue_lock);
3312+ raw_spin_lock_init(&bcp->uvhub_lock);
3313+ raw_spin_lock_init(&bcp->disable_lock);
3314 }
3315 }
3316
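Note on the tlb_uv.c conversion above: queue_lock, uvhub_lock and disable_lock become raw_spinlock_t because, on PREEMPT_RT, a plain spinlock_t turns into a sleeping lock and cannot be taken in the non-preemptible BAU paths. A minimal sketch of the raw-lock pattern, with an illustrative lock name:

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(example_lock);       /* not a lock from tlb_uv.c */

        static void example_critical_section(void)
        {
                unsigned long flags;

                /* raw_spin_lock_irqsave() keeps true spinning semantics even with
                 * PREEMPT_RT, so it is safe in sections that must not sleep; the
                 * critical section should therefore stay short and bounded. */
                raw_spin_lock_irqsave(&example_lock, flags);
                /* ... short critical section ... */
                raw_spin_unlock_irqrestore(&example_lock, flags);
        }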
e4b2b4a8
JK
3317diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/uv_time.c linux-4.14/arch/x86/platform/uv/uv_time.c
3318--- linux-4.14.orig/arch/x86/platform/uv/uv_time.c 2017-11-12 19:46:13.000000000 +0100
3319+++ linux-4.14/arch/x86/platform/uv/uv_time.c 2018-09-05 11:05:07.000000000 +0200
3320@@ -57,7 +57,7 @@
1a6e0f06
JK
3321
3322 /* There is one of these allocated per node */
3323 struct uv_rtc_timer_head {
3324- spinlock_t lock;
3325+ raw_spinlock_t lock;
3326 /* next cpu waiting for timer, local node relative: */
3327 int next_cpu;
3328 /* number of cpus on this node: */
e4b2b4a8 3329@@ -177,7 +177,7 @@
1a6e0f06
JK
3330 uv_rtc_deallocate_timers();
3331 return -ENOMEM;
3332 }
3333- spin_lock_init(&head->lock);
3334+ raw_spin_lock_init(&head->lock);
3335 head->ncpus = uv_blade_nr_possible_cpus(bid);
3336 head->next_cpu = -1;
3337 blade_info[bid] = head;
e4b2b4a8 3338@@ -231,7 +231,7 @@
1a6e0f06
JK
3339 unsigned long flags;
3340 int next_cpu;
3341
3342- spin_lock_irqsave(&head->lock, flags);
3343+ raw_spin_lock_irqsave(&head->lock, flags);
3344
3345 next_cpu = head->next_cpu;
3346 *t = expires;
e4b2b4a8 3347@@ -243,12 +243,12 @@
1a6e0f06
JK
3348 if (uv_setup_intr(cpu, expires)) {
3349 *t = ULLONG_MAX;
3350 uv_rtc_find_next_timer(head, pnode);
3351- spin_unlock_irqrestore(&head->lock, flags);
3352+ raw_spin_unlock_irqrestore(&head->lock, flags);
3353 return -ETIME;
3354 }
3355 }
3356
3357- spin_unlock_irqrestore(&head->lock, flags);
3358+ raw_spin_unlock_irqrestore(&head->lock, flags);
3359 return 0;
3360 }
3361
e4b2b4a8 3362@@ -267,7 +267,7 @@
1a6e0f06
JK
3363 unsigned long flags;
3364 int rc = 0;
3365
3366- spin_lock_irqsave(&head->lock, flags);
3367+ raw_spin_lock_irqsave(&head->lock, flags);
3368
3369 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3370 rc = 1;
e4b2b4a8 3371@@ -279,7 +279,7 @@
1a6e0f06
JK
3372 uv_rtc_find_next_timer(head, pnode);
3373 }
3374
3375- spin_unlock_irqrestore(&head->lock, flags);
3376+ raw_spin_unlock_irqrestore(&head->lock, flags);
3377
3378 return rc;
3379 }
e4b2b4a8
JK
3380@@ -299,13 +299,17 @@
3381 static u64 uv_read_rtc(struct clocksource *cs)
1a6e0f06
JK
3382 {
3383 unsigned long offset;
e4b2b4a8 3384+ u64 cycles;
1a6e0f06
JK
3385
3386+ preempt_disable();
3387 if (uv_get_min_hub_revision_id() == 1)
3388 offset = 0;
3389 else
3390 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3391
e4b2b4a8
JK
3392- return (u64)uv_read_local_mmr(UVH_RTC | offset);
3393+ cycles = (u64)uv_read_local_mmr(UVH_RTC | offset);
1a6e0f06 3394+ preempt_enable();
1a6e0f06
JK
3395+ return cycles;
3396 }
3397
3398 /*
e4b2b4a8
JK
3399diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h linux-4.14/arch/xtensa/include/asm/spinlock_types.h
3400--- linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
3401+++ linux-4.14/arch/xtensa/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
3402@@ -2,10 +2,6 @@
3403 #ifndef __ASM_SPINLOCK_TYPES_H
3404 #define __ASM_SPINLOCK_TYPES_H
3405
3406-#ifndef __LINUX_SPINLOCK_TYPES_H
3407-# error "please don't include this file directly"
3408-#endif
3409-
3410 typedef struct {
3411 volatile unsigned int slock;
3412 } arch_spinlock_t;
3413diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-core.c linux-4.14/block/blk-core.c
3414--- linux-4.14.orig/block/blk-core.c 2018-09-05 11:03:20.000000000 +0200
3415+++ linux-4.14/block/blk-core.c 2018-09-05 11:05:07.000000000 +0200
3416@@ -116,6 +116,9 @@
1a6e0f06
JK
3417
3418 INIT_LIST_HEAD(&rq->queuelist);
3419 INIT_LIST_HEAD(&rq->timeout_list);
3420+#ifdef CONFIG_PREEMPT_RT_FULL
3421+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3422+#endif
3423 rq->cpu = -1;
3424 rq->q = q;
3425 rq->__sector = (sector_t) -1;
e4b2b4a8 3426@@ -280,7 +283,7 @@
1a6e0f06
JK
3427 void blk_start_queue(struct request_queue *q)
3428 {
e4b2b4a8
JK
3429 lockdep_assert_held(q->queue_lock);
3430- WARN_ON(!in_interrupt() && !irqs_disabled());
3431+ WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
3432 WARN_ON_ONCE(q->mq_ops);
1a6e0f06
JK
3433
3434 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
e4b2b4a8
JK
3435@@ -808,12 +811,21 @@
3436 percpu_ref_put(&q->q_usage_counter);
3437 }
3438
3439+static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
3440+{
3441+ struct request_queue *q =
3442+ container_of(sev, struct request_queue, mq_pcpu_wake);
3443+
3444+ wake_up_all(&q->mq_freeze_wq);
3445+}
3446+
3447 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3448 {
1a6e0f06
JK
3449 struct request_queue *q =
3450 container_of(ref, struct request_queue, q_usage_counter);
3451
3452- wake_up_all(&q->mq_freeze_wq);
e4b2b4a8
JK
3453+ if (wq_has_sleeper(&q->mq_freeze_wq))
3454+ swork_queue(&q->mq_pcpu_wake);
1a6e0f06
JK
3455 }
3456
3457 static void blk_rq_timed_out_timer(unsigned long data)
e4b2b4a8 3458@@ -890,6 +902,7 @@
1a6e0f06
JK
3459 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3460
e4b2b4a8
JK
3461 init_waitqueue_head(&q->mq_freeze_wq);
3462+ INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
1a6e0f06
JK
3463
3464 /*
3465 * Init percpu_ref in atomic mode so that it's faster to shutdown.
e4b2b4a8 3466@@ -3308,7 +3321,7 @@
1a6e0f06
JK
3467 blk_run_queue_async(q);
3468 else
3469 __blk_run_queue(q);
3470- spin_unlock(q->queue_lock);
3471+ spin_unlock_irq(q->queue_lock);
3472 }
3473
3474 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
e4b2b4a8 3475@@ -3356,7 +3369,6 @@
1a6e0f06
JK
3476 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3477 {
3478 struct request_queue *q;
3479- unsigned long flags;
3480 struct request *rq;
3481 LIST_HEAD(list);
3482 unsigned int depth;
e4b2b4a8 3483@@ -3376,11 +3388,6 @@
1a6e0f06
JK
3484 q = NULL;
3485 depth = 0;
3486
3487- /*
3488- * Save and disable interrupts here, to avoid doing it for every
3489- * queue lock we have to take.
3490- */
3491- local_irq_save(flags);
3492 while (!list_empty(&list)) {
3493 rq = list_entry_rq(list.next);
3494 list_del_init(&rq->queuelist);
e4b2b4a8 3495@@ -3393,7 +3400,7 @@
1a6e0f06
JK
3496 queue_unplugged(q, depth, from_schedule);
3497 q = rq->q;
3498 depth = 0;
3499- spin_lock(q->queue_lock);
3500+ spin_lock_irq(q->queue_lock);
3501 }
3502
3503 /*
e4b2b4a8 3504@@ -3420,8 +3427,6 @@
1a6e0f06
JK
3505 */
3506 if (q)
3507 queue_unplugged(q, depth, from_schedule);
3508-
3509- local_irq_restore(flags);
3510 }
3511
3512 void blk_finish_plug(struct blk_plug *plug)
e4b2b4a8
JK
3513@@ -3631,6 +3636,8 @@
3514 if (!kblockd_workqueue)
3515 panic("Failed to create kblockd\n");
3516
3517+ BUG_ON(swork_get());
3518+
3519 request_cachep = kmem_cache_create("blkdev_requests",
3520 sizeof(struct request), 0, SLAB_PANIC, NULL);
3521
3522diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-ioc.c linux-4.14/block/blk-ioc.c
3523--- linux-4.14.orig/block/blk-ioc.c 2017-11-12 19:46:13.000000000 +0100
3524+++ linux-4.14/block/blk-ioc.c 2018-09-05 11:05:07.000000000 +0200
3525@@ -9,6 +9,7 @@
1a6e0f06
JK
3526 #include <linux/blkdev.h>
3527 #include <linux/slab.h>
e4b2b4a8 3528 #include <linux/sched/task.h>
1a6e0f06
JK
3529+#include <linux/delay.h>
3530
3531 #include "blk.h"
3532
e4b2b4a8 3533@@ -118,7 +119,7 @@
1a6e0f06
JK
3534 spin_unlock(q->queue_lock);
3535 } else {
3536 spin_unlock_irqrestore(&ioc->lock, flags);
3537- cpu_relax();
3538+ cpu_chill();
3539 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3540 }
3541 }
e4b2b4a8
JK
3542@@ -202,7 +203,7 @@
3543 spin_unlock(icq->q->queue_lock);
3544 } else {
3545 spin_unlock_irqrestore(&ioc->lock, flags);
3546- cpu_relax();
3547+ cpu_chill();
3548 goto retry;
3549 }
1a6e0f06 3550 }
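Note on the blk-ioc.c retry loops above: cpu_relax() is swapped for cpu_chill(), which this patch set provides via linux/delay.h; on PREEMPT_RT_FULL it sleeps for a tick instead of busy-waiting, so a preempted holder of the contended resource can run. A generic sketch of the same retry shape (wait_for_done() and the flag are illustrative):

        #include <linux/atomic.h>
        #include <linux/delay.h>        /* cpu_chill() on kernels carrying this patch set */

        static void wait_for_done(atomic_t *done)
        {
                while (!atomic_read(done)) {
                        /* On PREEMPT_RT_FULL this sleeps briefly rather than spinning,
                         * giving the task that will set @done a chance to run. */
                        cpu_chill();
                }
        }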
e4b2b4a8
JK
3551diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.c linux-4.14/block/blk-mq.c
3552--- linux-4.14.orig/block/blk-mq.c 2018-09-05 11:03:20.000000000 +0200
3553+++ linux-4.14/block/blk-mq.c 2018-09-05 11:05:07.000000000 +0200
3554@@ -339,6 +339,9 @@
3555 /* tag was already set */
3556 rq->extra_len = 0;
1a6e0f06
JK
3557
3558+#ifdef CONFIG_PREEMPT_RT_FULL
3559+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3560+#endif
3561 INIT_LIST_HEAD(&rq->timeout_list);
3562 rq->timeout = 0;
3563
e4b2b4a8 3564@@ -533,12 +536,24 @@
1a6e0f06
JK
3565 }
3566 EXPORT_SYMBOL(blk_mq_end_request);
3567
3568+#ifdef CONFIG_PREEMPT_RT_FULL
3569+
3570+void __blk_mq_complete_request_remote_work(struct work_struct *work)
3571+{
3572+ struct request *rq = container_of(work, struct request, work);
3573+
3574+ rq->q->softirq_done_fn(rq);
3575+}
3576+
3577+#else
3578+
3579 static void __blk_mq_complete_request_remote(void *data)
3580 {
3581 struct request *rq = data;
e4b2b4a8 3582
1a6e0f06
JK
3583 rq->q->softirq_done_fn(rq);
3584 }
1a6e0f06 3585+#endif
e4b2b4a8
JK
3586
3587 static void __blk_mq_complete_request(struct request *rq)
1a6e0f06 3588 {
e4b2b4a8 3589@@ -558,19 +573,27 @@
1a6e0f06
JK
3590 return;
3591 }
3592
3593- cpu = get_cpu();
3594+ cpu = get_cpu_light();
3595 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3596 shared = cpus_share_cache(cpu, ctx->cpu);
3597
3598 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3599+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8
JK
3600+ /*
3601+	 * We could force QUEUE_FLAG_SAME_FORCE, then we would not get in
3602+	 * here. But we could try to invoke it on the CPU like this.
3603+ */
1a6e0f06
JK
3604+ schedule_work_on(ctx->cpu, &rq->work);
3605+#else
3606 rq->csd.func = __blk_mq_complete_request_remote;
3607 rq->csd.info = rq;
3608 rq->csd.flags = 0;
3609 smp_call_function_single_async(ctx->cpu, &rq->csd);
3610+#endif
3611 } else {
3612 rq->q->softirq_done_fn(rq);
3613 }
3614- put_cpu();
3615+ put_cpu_light();
3616 }
3617
e4b2b4a8
JK
3618 /**
3619@@ -1238,14 +1261,14 @@
1a6e0f06
JK
3620 return;
3621
c7c16703 3622 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1a6e0f06
JK
3623- int cpu = get_cpu();
3624+ int cpu = get_cpu_light();
3625 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3626 __blk_mq_run_hw_queue(hctx);
3627- put_cpu();
3628+ put_cpu_light();
3629 return;
3630 }
3631
3632- put_cpu();
3633+ put_cpu_light();
3634 }
3635
e4b2b4a8
JK
3636 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
3637@@ -2863,10 +2886,9 @@
3638 kt = nsecs;
3639
3640 mode = HRTIMER_MODE_REL;
3641- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3642+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3643 hrtimer_set_expires(&hs.timer, kt);
3644
3645- hrtimer_init_sleeper(&hs, current);
3646 do {
3647 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
3648 break;
3649diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.h linux-4.14/block/blk-mq.h
3650--- linux-4.14.orig/block/blk-mq.h 2018-09-05 11:03:20.000000000 +0200
3651+++ linux-4.14/block/blk-mq.h 2018-09-05 11:05:07.000000000 +0200
3652@@ -98,12 +98,12 @@
1a6e0f06
JK
3653 */
3654 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3655 {
3656- return __blk_mq_get_ctx(q, get_cpu());
3657+ return __blk_mq_get_ctx(q, get_cpu_light());
3658 }
3659
3660 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3661 {
3662- put_cpu();
3663+ put_cpu_light();
3664 }
3665
3666 struct blk_mq_alloc_data {
e4b2b4a8
JK
3667diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-softirq.c linux-4.14/block/blk-softirq.c
3668--- linux-4.14.orig/block/blk-softirq.c 2017-11-12 19:46:13.000000000 +0100
3669+++ linux-4.14/block/blk-softirq.c 2018-09-05 11:05:07.000000000 +0200
3670@@ -53,6 +53,7 @@
1a6e0f06
JK
3671 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3672
3673 local_irq_restore(flags);
3674+ preempt_check_resched_rt();
3675 }
3676
3677 /*
e4b2b4a8 3678@@ -91,6 +92,7 @@
c7c16703
JK
3679 this_cpu_ptr(&blk_cpu_done));
3680 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3681 local_irq_enable();
3682+ preempt_check_resched_rt();
1a6e0f06 3683
c7c16703
JK
3684 return 0;
3685 }
e4b2b4a8 3686@@ -143,6 +145,7 @@
1a6e0f06
JK
3687 goto do_local;
3688
3689 local_irq_restore(flags);
3690+ preempt_check_resched_rt();
3691 }
3692
3693 /**
e4b2b4a8
JK
3694diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/bounce.c linux-4.14/block/bounce.c
3695--- linux-4.14.orig/block/bounce.c 2018-09-05 11:03:20.000000000 +0200
3696+++ linux-4.14/block/bounce.c 2018-09-05 11:05:07.000000000 +0200
3697@@ -66,11 +66,11 @@
1a6e0f06
JK
3698 unsigned long flags;
3699 unsigned char *vto;
3700
3701- local_irq_save(flags);
3702+ local_irq_save_nort(flags);
3703 vto = kmap_atomic(to->bv_page);
3704 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3705 kunmap_atomic(vto);
3706- local_irq_restore(flags);
3707+ local_irq_restore_nort(flags);
3708 }
3709
3710 #else /* CONFIG_HIGHMEM */
e4b2b4a8
JK
3711diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/algapi.c linux-4.14/crypto/algapi.c
3712--- linux-4.14.orig/crypto/algapi.c 2018-09-05 11:03:20.000000000 +0200
3713+++ linux-4.14/crypto/algapi.c 2018-09-05 11:05:07.000000000 +0200
3714@@ -731,13 +731,13 @@
1a6e0f06
JK
3715
3716 int crypto_register_notifier(struct notifier_block *nb)
3717 {
3718- return blocking_notifier_chain_register(&crypto_chain, nb);
3719+ return srcu_notifier_chain_register(&crypto_chain, nb);
3720 }
3721 EXPORT_SYMBOL_GPL(crypto_register_notifier);
3722
3723 int crypto_unregister_notifier(struct notifier_block *nb)
3724 {
3725- return blocking_notifier_chain_unregister(&crypto_chain, nb);
3726+ return srcu_notifier_chain_unregister(&crypto_chain, nb);
3727 }
3728 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3729
e4b2b4a8
JK
3730diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/api.c linux-4.14/crypto/api.c
3731--- linux-4.14.orig/crypto/api.c 2017-11-12 19:46:13.000000000 +0100
3732+++ linux-4.14/crypto/api.c 2018-09-05 11:05:07.000000000 +0200
3733@@ -31,7 +31,7 @@
1a6e0f06
JK
3734 DECLARE_RWSEM(crypto_alg_sem);
3735 EXPORT_SYMBOL_GPL(crypto_alg_sem);
3736
3737-BLOCKING_NOTIFIER_HEAD(crypto_chain);
3738+SRCU_NOTIFIER_HEAD(crypto_chain);
3739 EXPORT_SYMBOL_GPL(crypto_chain);
3740
3741 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
e4b2b4a8 3742@@ -236,10 +236,10 @@
1a6e0f06
JK
3743 {
3744 int ok;
3745
3746- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3747+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3748 if (ok == NOTIFY_DONE) {
3749 request_module("cryptomgr");
3750- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3751+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3752 }
3753
3754 return ok;
e4b2b4a8
JK
3755diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/internal.h linux-4.14/crypto/internal.h
3756--- linux-4.14.orig/crypto/internal.h 2017-11-12 19:46:13.000000000 +0100
3757+++ linux-4.14/crypto/internal.h 2018-09-05 11:05:07.000000000 +0200
3758@@ -47,7 +47,7 @@
1a6e0f06
JK
3759
3760 extern struct list_head crypto_alg_list;
3761 extern struct rw_semaphore crypto_alg_sem;
3762-extern struct blocking_notifier_head crypto_chain;
3763+extern struct srcu_notifier_head crypto_chain;
3764
3765 #ifdef CONFIG_PROC_FS
3766 void __init crypto_init_proc(void);
e4b2b4a8 3767@@ -143,7 +143,7 @@
1a6e0f06
JK
3768
3769 static inline void crypto_notify(unsigned long val, void *v)
3770 {
3771- blocking_notifier_call_chain(&crypto_chain, val, v);
3772+ srcu_notifier_call_chain(&crypto_chain, val, v);
3773 }
3774
3775 #endif /* _CRYPTO_INTERNAL_H */
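Note on the three crypto hunks above: crypto_chain is converted from a blocking notifier to an SRCU notifier chain, so invoking the chain no longer takes the rw_semaphore that a blocking notifier acquires around every call. For reference, the same API in a self-contained sketch with illustrative names:

        #include <linux/notifier.h>

        SRCU_NOTIFIER_HEAD(example_chain);      /* illustrative, not crypto_chain */

        static int example_event_cb(struct notifier_block *nb,
                                    unsigned long event, void *data)
        {
                return NOTIFY_OK;
        }

        static struct notifier_block example_nb = {
                .notifier_call = example_event_cb,
        };

        static void example_use(void)
        {
                srcu_notifier_chain_register(&example_chain, &example_nb);
                srcu_notifier_call_chain(&example_chain, 0, NULL);
                srcu_notifier_chain_unregister(&example_chain, &example_nb);
        }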
e4b2b4a8
JK
3776diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/events.txt linux-4.14/Documentation/trace/events.txt
3777--- linux-4.14.orig/Documentation/trace/events.txt 2017-11-12 19:46:13.000000000 +0100
3778+++ linux-4.14/Documentation/trace/events.txt 2018-09-05 11:05:07.000000000 +0200
3779@@ -517,1550 +517,4 @@
3780 totals derived from one or more trace event format fields and/or
3781 event counts (hitcount).
3782
3783- The format of a hist trigger is as follows:
3784-
3785- hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
3786- [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
3787- [:clear][:name=histname1] [if <filter>]
3788-
3789- When a matching event is hit, an entry is added to a hash table
3790- using the key(s) and value(s) named. Keys and values correspond to
3791- fields in the event's format description. Values must correspond to
3792- numeric fields - on an event hit, the value(s) will be added to a
3793- sum kept for that field. The special string 'hitcount' can be used
3794- in place of an explicit value field - this is simply a count of
3795- event hits. If 'values' isn't specified, an implicit 'hitcount'
3796- value will be automatically created and used as the only value.
3797- Keys can be any field, or the special string 'stacktrace', which
3798- will use the event's kernel stacktrace as the key. The keywords
3799- 'keys' or 'key' can be used to specify keys, and the keywords
3800- 'values', 'vals', or 'val' can be used to specify values. Compound
3801- keys consisting of up to two fields can be specified by the 'keys'
3802- keyword. Hashing a compound key produces a unique entry in the
3803- table for each unique combination of component keys, and can be
3804- useful for providing more fine-grained summaries of event data.
3805- Additionally, sort keys consisting of up to two fields can be
3806- specified by the 'sort' keyword. If more than one field is
3807- specified, the result will be a 'sort within a sort': the first key
3808- is taken to be the primary sort key and the second the secondary
3809- key. If a hist trigger is given a name using the 'name' parameter,
3810- its histogram data will be shared with other triggers of the same
3811- name, and trigger hits will update this common data. Only triggers
3812- with 'compatible' fields can be combined in this way; triggers are
3813- 'compatible' if the fields named in the trigger share the same
3814- number and type of fields and those fields also have the same names.
3815- Note that any two events always share the compatible 'hitcount' and
3816- 'stacktrace' fields and can therefore be combined using those
3817- fields, however pointless that may be.
3818-
3819- 'hist' triggers add a 'hist' file to each event's subdirectory.
3820- Reading the 'hist' file for the event will dump the hash table in
3821- its entirety to stdout. If there are multiple hist triggers
3822- attached to an event, there will be a table for each trigger in the
3823- output. The table displayed for a named trigger will be the same as
3824- any other instance having the same name. Each printed hash table
3825- entry is a simple list of the keys and values comprising the entry;
3826- keys are printed first and are delineated by curly braces, and are
3827- followed by the set of value fields for the entry. By default,
3828- numeric fields are displayed as base-10 integers. This can be
3829- modified by appending any of the following modifiers to the field
3830- name:
3831-
3832- .hex display a number as a hex value
3833- .sym display an address as a symbol
3834- .sym-offset display an address as a symbol and offset
3835- .syscall display a syscall id as a system call name
3836- .execname display a common_pid as a program name
3837-
3838- Note that in general the semantics of a given field aren't
3839- interpreted when applying a modifier to it, but there are some
3840- restrictions to be aware of in this regard:
3841-
3842- - only the 'hex' modifier can be used for values (because values
3843- are essentially sums, and the other modifiers don't make sense
3844- in that context).
3845- - the 'execname' modifier can only be used on a 'common_pid'. The
3846- reason for this is that the execname is simply the 'comm' value
3847- saved for the 'current' process when an event was triggered,
3848- which is the same as the common_pid value saved by the event
3849- tracing code. Trying to apply that comm value to other pid
3850- values wouldn't be correct, and typically events that care save
3851- pid-specific comm fields in the event itself.
3852-
3853- A typical usage scenario would be the following to enable a hist
3854- trigger, read its current contents, and then turn it off:
3855-
3856- # echo 'hist:keys=skbaddr.hex:vals=len' > \
3857- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3858-
3859- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3860-
3861- # echo '!hist:keys=skbaddr.hex:vals=len' > \
3862- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3863-
3864- The trigger file itself can be read to show the details of the
3865- currently attached hist trigger. This information is also displayed
3866- at the top of the 'hist' file when read.
3867-
3868- By default, the size of the hash table is 2048 entries. The 'size'
3869- parameter can be used to specify more or fewer than that. The units
3870- are in terms of hashtable entries - if a run uses more entries than
3871- specified, the results will show the number of 'drops', the number
3872- of hits that were ignored. The size should be a power of 2 between
3873- 128 and 131072 (any non- power-of-2 number specified will be rounded
3874- up).
3875-
3876- The 'sort' parameter can be used to specify a value field to sort
3877- on. The default if unspecified is 'hitcount' and the default sort
3878- order is 'ascending'. To sort in the opposite direction, append
3879- .descending' to the sort key.
3880-
3881- The 'pause' parameter can be used to pause an existing hist trigger
3882- or to start a hist trigger but not log any events until told to do
3883- so. 'continue' or 'cont' can be used to start or restart a paused
3884- hist trigger.
3885-
3886- The 'clear' parameter will clear the contents of a running hist
3887- trigger and leave its current paused/active state.
3888-
3889- Note that the 'pause', 'cont', and 'clear' parameters should be
3890- applied using 'append' shell operator ('>>') if applied to an
3891- existing trigger, rather than via the '>' operator, which will cause
3892- the trigger to be removed through truncation.
3893-
3894-- enable_hist/disable_hist
3895-
3896- The enable_hist and disable_hist triggers can be used to have one
3897- event conditionally start and stop another event's already-attached
3898- hist trigger. Any number of enable_hist and disable_hist triggers
3899- can be attached to a given event, allowing that event to kick off
3900- and stop aggregations on a host of other events.
3901-
3902- The format is very similar to the enable/disable_event triggers:
3903-
3904- enable_hist:<system>:<event>[:count]
3905- disable_hist:<system>:<event>[:count]
3906-
3907- Instead of enabling or disabling the tracing of the target event
3908- into the trace buffer as the enable/disable_event triggers do, the
3909- enable/disable_hist triggers enable or disable the aggregation of
3910- the target event into a hash table.
3911-
3912- A typical usage scenario for the enable_hist/disable_hist triggers
3913- would be to first set up a paused hist trigger on some event,
3914- followed by an enable_hist/disable_hist pair that turns the hist
3915- aggregation on and off when conditions of interest are hit:
3916-
3917- # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
3918- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
3919-
3920- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
3921- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
3922-
3923- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
3924- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
3925-
3926- The above sets up an initially paused hist trigger which is unpaused
3927- and starts aggregating events when a given program is executed, and
3928- which stops aggregating when the process exits and the hist trigger
3929- is paused again.
3930-
3931- The examples below provide a more concrete illustration of the
3932- concepts and typical usage patterns discussed above.
3933-
3934-
3935-6.2 'hist' trigger examples
3936----------------------------
3937-
3938- The first set of examples creates aggregations using the kmalloc
3939- event. The fields that can be used for the hist trigger are listed
3940- in the kmalloc event's format file:
3941-
3942- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
3943- name: kmalloc
3944- ID: 374
3945- format:
3946- field:unsigned short common_type; offset:0; size:2; signed:0;
3947- field:unsigned char common_flags; offset:2; size:1; signed:0;
3948- field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
3949- field:int common_pid; offset:4; size:4; signed:1;
3950-
3951- field:unsigned long call_site; offset:8; size:8; signed:0;
3952- field:const void * ptr; offset:16; size:8; signed:0;
3953- field:size_t bytes_req; offset:24; size:8; signed:0;
3954- field:size_t bytes_alloc; offset:32; size:8; signed:0;
3955- field:gfp_t gfp_flags; offset:40; size:4; signed:0;
3956-
3957- We'll start by creating a hist trigger that generates a simple table
3958- that lists the total number of bytes requested for each function in
3959- the kernel that made one or more calls to kmalloc:
3960-
3961- # echo 'hist:key=call_site:val=bytes_req' > \
3962- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
3963-
3964- This tells the tracing system to create a 'hist' trigger using the
3965- call_site field of the kmalloc event as the key for the table, which
3966- just means that each unique call_site address will have an entry
3967- created for it in the table. The 'val=bytes_req' parameter tells
3968- the hist trigger that for each unique entry (call_site) in the
3969- table, it should keep a running total of the number of bytes
3970- requested by that call_site.
3971-
3972- We'll let it run for awhile and then dump the contents of the 'hist'
3973- file in the kmalloc event's subdirectory (for readability, a number
3974- of entries have been omitted):
3975-
3976- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
3977- # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
3978-
3979- { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
3980- { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
3981- { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
3982- { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
3983- { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
3984- { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
3985- { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
3986- { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
3987- { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
3988- { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
3989- .
3990- .
3991- .
3992- { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
3993- { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
3994- { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
3995- { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
3996- { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
3997- { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
3998- { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
3999- { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
4000- { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
4001- { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
4002- { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
4003- { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
4004-
4005- Totals:
4006- Hits: 4610
4007- Entries: 45
4008- Dropped: 0
4009-
4010- The output displays a line for each entry, beginning with the key
4011- specified in the trigger, followed by the value(s) also specified in
4012- the trigger. At the beginning of the output is a line that displays
4013- the trigger info, which can also be displayed by reading the
4014- 'trigger' file:
4015-
4016- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4017- hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
4018-
4019- At the end of the output are a few lines that display the overall
4020- totals for the run. The 'Hits' field shows the total number of
4021- times the event trigger was hit, the 'Entries' field shows the total
4022- number of used entries in the hash table, and the 'Dropped' field
4023- shows the number of hits that were dropped because the number of
4024- used entries for the run exceeded the maximum number of entries
4025- allowed for the table (normally 0, but if not a hint that you may
4026- want to increase the size of the table using the 'size' parameter).
4027-
4028- Notice in the above output that there's an extra field, 'hitcount',
4029- which wasn't specified in the trigger. Also notice that in the
4030- trigger info output, there's a parameter, 'sort=hitcount', which
4031- wasn't specified in the trigger either. The reason for that is that
4032- every trigger implicitly keeps a count of the total number of hits
4033- attributed to a given entry, called the 'hitcount'. That hitcount
4034- information is explicitly displayed in the output, and in the
4035- absence of a user-specified sort parameter, is used as the default
4036- sort field.
4037-
4038- The value 'hitcount' can be used in place of an explicit value in
4039- the 'values' parameter if you don't really need to have any
4040- particular field summed and are mainly interested in hit
4041- frequencies.
4042-
4043- To turn the hist trigger off, simply call up the trigger in the
4044- command history and re-execute it with a '!' prepended:
4045-
4046- # echo '!hist:key=call_site:val=bytes_req' > \
4047- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4048-
4049- Finally, notice that the call_site as displayed in the output above
4050- isn't really very useful. It's an address, but normally addresses
4051- are displayed in hex. To have a numeric field displayed as a hex
4052- value, simply append '.hex' to the field name in the trigger:
4053-
4054- # echo 'hist:key=call_site.hex:val=bytes_req' > \
4055- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4056-
4057- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4058- # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
4059-
4060- { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
4061- { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
4062- { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
4063- { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
4064- { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
4065- { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
4066- { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
4067- { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
4068- { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
4069- { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
4070- { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
4071- { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
4072- .
4073- .
4074- .
4075- { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
4076- { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
4077- { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
4078- { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
4079- { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
4080- { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
4081- { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
4082- { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
4083- { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
4084- { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
4085- { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
4086-
4087- Totals:
4088- Hits: 4775
4089- Entries: 46
4090- Dropped: 0
4091-
4092- Even that's only marginally more useful - while hex values do look
4093- more like addresses, what users are typically more interested in
4094- when looking at text addresses are the corresponding symbols
4095- instead. To have an address displayed as symbolic value instead,
4096- simply append '.sym' or '.sym-offset' to the field name in the
4097- trigger:
4098-
4099- # echo 'hist:key=call_site.sym:val=bytes_req' > \
4100- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4101-
4102- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4103- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
4104-
4105- { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
4106- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
4107- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
4108- { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
4109- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
4110- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
4111- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
4112- { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
4113- { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
4114- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
4115- { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
4116- { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
4117- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
4118- { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
4119- .
4120- .
4121- .
4122- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
4123- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
4124- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
4125- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
4126- { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
4127- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
4128- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
4129- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
4130- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
4131- { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
4132- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
4133- { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
4134- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
4135-
4136- Totals:
4137- Hits: 109928
4138- Entries: 71
4139- Dropped: 0
4140-
4141- Because the default sort key above is 'hitcount', the above shows a
4142- the list of call_sites by increasing hitcount, so that at the bottom
4143- we see the functions that made the most kmalloc calls during the
4144- run. If instead we we wanted to see the top kmalloc callers in
4145- terms of the number of bytes requested rather than the number of
4146- calls, and we wanted the top caller to appear at the top, we can use
4147- the 'sort' parameter, along with the 'descending' modifier:
4148-
4149- # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
4150- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4151-
4152- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4153- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4154-
4155- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
4156- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
4157- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
4158- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
4159- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
4160- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
4161- { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
4162- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
4163- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
4164- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
4165- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
4166- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
4167- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
4168- .
4169- .
4170- .
4171- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
4172- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
4173- { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
4174- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
4175- { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
4176- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
4177- { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
4178- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
4179- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
4180- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
4181-
4182- Totals:
4183- Hits: 32133
4184- Entries: 81
4185- Dropped: 0
4186-
4187- To display the offset and size information in addition to the symbol
4188- name, just use 'sym-offset' instead:
4189-
4190- # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
4191- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4192-
4193- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4194- # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4195-
4196- { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
4197- { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
4198- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
4199- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
4200- { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
4201- { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
4202- { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
4203- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
4204- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
4205- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
4206- { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
4207- { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
4208- .
4209- .
4210- .
4211- { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
4212- { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
4213- { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
4214- { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
4215- { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
4216- { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
4217- { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
4218-
4219- Totals:
4220- Hits: 26098
4221- Entries: 64
4222- Dropped: 0
4223-
4224- We can also add multiple fields to the 'values' parameter. For
4225- example, we might want to see the total number of bytes allocated
4226- alongside bytes requested, and display the result sorted by bytes
4227- allocated in a descending order:
4228-
4229- # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
4230- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4231-
4232- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4233- # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
4234-
4235- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
4236- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
4237- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
4238- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
4239- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
4240- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
4241- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
4242- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
4243- { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
4244- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
4245- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
4246- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
4247- .
4248- .
4249- .
4250- { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
4251- { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4252- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4253- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4254- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
4255- { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
4256- { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
4257- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
4258- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
4259- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
4260-
4261- Totals:
4262- Hits: 66598
4263- Entries: 65
4264- Dropped: 0
4265-
4266- Finally, to finish off our kmalloc example, instead of simply having
4267- the hist trigger display symbolic call_sites, we can have the hist
4268- trigger additionally display the complete set of kernel stack traces
4269- that led to each call_site. To do that, we simply use the special
4270- value 'stacktrace' for the key parameter:
4271-
4272- # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
4273- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4274-
4275- The above trigger will use the kernel stack trace in effect when an
4276- event is triggered as the key for the hash table. This allows the
4277- enumeration of every kernel callpath that led up to a particular
4278- event, along with a running total of any of the event fields for
4279- that event. Here we tally bytes requested and bytes allocated for
4280- every callpath in the system that led up to a kmalloc (in this case
4281- every callpath to a kmalloc for a kernel compile):
4282-
4283- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4284- # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
4285-
4286- { stacktrace:
4287- __kmalloc_track_caller+0x10b/0x1a0
4288- kmemdup+0x20/0x50
4289- hidraw_report_event+0x8a/0x120 [hid]
4290- hid_report_raw_event+0x3ea/0x440 [hid]
4291- hid_input_report+0x112/0x190 [hid]
4292- hid_irq_in+0xc2/0x260 [usbhid]
4293- __usb_hcd_giveback_urb+0x72/0x120
4294- usb_giveback_urb_bh+0x9e/0xe0
4295- tasklet_hi_action+0xf8/0x100
4296- __do_softirq+0x114/0x2c0
4297- irq_exit+0xa5/0xb0
4298- do_IRQ+0x5a/0xf0
4299- ret_from_intr+0x0/0x30
4300- cpuidle_enter+0x17/0x20
4301- cpu_startup_entry+0x315/0x3e0
4302- rest_init+0x7c/0x80
4303- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
4304- { stacktrace:
4305- __kmalloc_track_caller+0x10b/0x1a0
4306- kmemdup+0x20/0x50
4307- hidraw_report_event+0x8a/0x120 [hid]
4308- hid_report_raw_event+0x3ea/0x440 [hid]
4309- hid_input_report+0x112/0x190 [hid]
4310- hid_irq_in+0xc2/0x260 [usbhid]
4311- __usb_hcd_giveback_urb+0x72/0x120
4312- usb_giveback_urb_bh+0x9e/0xe0
4313- tasklet_hi_action+0xf8/0x100
4314- __do_softirq+0x114/0x2c0
4315- irq_exit+0xa5/0xb0
4316- do_IRQ+0x5a/0xf0
4317- ret_from_intr+0x0/0x30
4318- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
4319- { stacktrace:
4320- kmem_cache_alloc_trace+0xeb/0x150
4321- aa_alloc_task_context+0x27/0x40
4322- apparmor_cred_prepare+0x1f/0x50
4323- security_prepare_creds+0x16/0x20
4324- prepare_creds+0xdf/0x1a0
4325- SyS_capset+0xb5/0x200
4326- system_call_fastpath+0x12/0x6a
4327- } hitcount: 1 bytes_req: 32 bytes_alloc: 32
4328- .
4329- .
4330- .
4331- { stacktrace:
4332- __kmalloc+0x11b/0x1b0
4333- i915_gem_execbuffer2+0x6c/0x2c0 [i915]
4334- drm_ioctl+0x349/0x670 [drm]
4335- do_vfs_ioctl+0x2f0/0x4f0
4336- SyS_ioctl+0x81/0xa0
4337- system_call_fastpath+0x12/0x6a
4338- } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
4339- { stacktrace:
4340- __kmalloc+0x11b/0x1b0
4341- load_elf_phdrs+0x76/0xa0
4342- load_elf_binary+0x102/0x1650
4343- search_binary_handler+0x97/0x1d0
4344- do_execveat_common.isra.34+0x551/0x6e0
4345- SyS_execve+0x3a/0x50
4346- return_from_execve+0x0/0x23
4347- } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
4348- { stacktrace:
4349- kmem_cache_alloc_trace+0xeb/0x150
4350- apparmor_file_alloc_security+0x27/0x40
4351- security_file_alloc+0x16/0x20
4352- get_empty_filp+0x93/0x1c0
4353- path_openat+0x31/0x5f0
4354- do_filp_open+0x3a/0x90
4355- do_sys_open+0x128/0x220
4356- SyS_open+0x1e/0x20
4357- system_call_fastpath+0x12/0x6a
4358- } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
4359- { stacktrace:
4360- __kmalloc+0x11b/0x1b0
4361- seq_buf_alloc+0x1b/0x50
4362- seq_read+0x2cc/0x370
4363- proc_reg_read+0x3d/0x80
4364- __vfs_read+0x28/0xe0
4365- vfs_read+0x86/0x140
4366- SyS_read+0x46/0xb0
4367- system_call_fastpath+0x12/0x6a
4368- } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
4369-
4370- Totals:
4371- Hits: 6085872
4372- Entries: 253
4373- Dropped: 0
4374-
4375- If you key a hist trigger on common_pid, for example in order to
4376- gather and display sorted totals for each process, you can use the
4377- special .execname modifier to display the executable names for the
4378- processes in the table rather than raw pids. The example below
4379- keeps a per-process sum of total bytes read:
4380-
4381- # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
4382- /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
4383-
4384- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
4385- # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
4386-
4387- { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
4388- { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
4389- { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
4390- { common_pid: bash [ 8710] } hitcount: 3 count: 66369
4391- { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
4392- { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
4393- { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
4394- { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
4395- { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
4396- { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
4397- { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
4398- .
4399- .
4400- .
4401- { common_pid: postgres [ 1892] } hitcount: 2 count: 32
4402- { common_pid: postgres [ 1891] } hitcount: 2 count: 32
4403- { common_pid: gmain [ 8704] } hitcount: 2 count: 32
4404- { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
4405- { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
4406- { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
4407- { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
4408- { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
4409- { common_pid: init [ 1] } hitcount: 2 count: 2
4410-
4411- Totals:
4412- Hits: 2116
4413- Entries: 51
4414- Dropped: 0
4415-
4416- Similarly, if you key a hist trigger on syscall id, for example to
4417- gather and display a list of systemwide syscall hits, you can use
4418- the special .syscall modifier to display the syscall names rather
4419- than raw ids. The example below keeps a running total of syscall
4420- counts for the system during the run:
4421-
4422- # echo 'hist:key=id.syscall:val=hitcount' > \
4423- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4424-
4425- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4426- # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
4427-
4428- { id: sys_fsync [ 74] } hitcount: 1
4429- { id: sys_newuname [ 63] } hitcount: 1
4430- { id: sys_prctl [157] } hitcount: 1
4431- { id: sys_statfs [137] } hitcount: 1
4432- { id: sys_symlink [ 88] } hitcount: 1
4433- { id: sys_sendmmsg [307] } hitcount: 1
4434- { id: sys_semctl [ 66] } hitcount: 1
4435- { id: sys_readlink [ 89] } hitcount: 3
4436- { id: sys_bind [ 49] } hitcount: 3
4437- { id: sys_getsockname [ 51] } hitcount: 3
4438- { id: sys_unlink [ 87] } hitcount: 3
4439- { id: sys_rename [ 82] } hitcount: 4
4440- { id: unknown_syscall [ 58] } hitcount: 4
4441- { id: sys_connect [ 42] } hitcount: 4
4442- { id: sys_getpid [ 39] } hitcount: 4
4443- .
4444- .
4445- .
4446- { id: sys_rt_sigprocmask [ 14] } hitcount: 952
4447- { id: sys_futex [202] } hitcount: 1534
4448- { id: sys_write [ 1] } hitcount: 2689
4449- { id: sys_setitimer [ 38] } hitcount: 2797
4450- { id: sys_read [ 0] } hitcount: 3202
4451- { id: sys_select [ 23] } hitcount: 3773
4452- { id: sys_writev [ 20] } hitcount: 4531
4453- { id: sys_poll [ 7] } hitcount: 8314
4454- { id: sys_recvmsg [ 47] } hitcount: 13738
4455- { id: sys_ioctl [ 16] } hitcount: 21843
4456-
4457- Totals:
4458- Hits: 67612
4459- Entries: 72
4460- Dropped: 0
4461-
4462- The syscall counts above provide a rough overall picture of system
4463- call activity on the system; we can see for example that the most
4464- popular system call on this system was the 'sys_ioctl' system call.
4465-
4466- We can use 'compound' keys to refine that number and provide some
4467- further insight as to exactly which processes contribute to the
4468- overall ioctl count.
4469-
4470- The command below keeps a hitcount for every unique combination of
4471- system call id and pid - the end result is essentially a table
4472- that keeps a per-pid sum of system call hits. The results are
4473- sorted using the system call id as the primary key, and the
4474- hitcount sum as the secondary key:
4475-
4476- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
4477- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4478-
4479- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4480- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
4481-
4482- { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
4483- { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
4484- { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
4485- { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
4486- { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
4487- { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
4488- { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
4489- { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
4490- { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
4491- { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
4492- .
4493- .
4494- .
4495- { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
4496- { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
4497- { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
4498- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
4499- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
4500- .
4501- .
4502- .
4503- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
4504- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
4505- { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
4506- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
4507- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
4508- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
4509- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
4510- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
4511- { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
4512- { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
4513- { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
4514-
4515- Totals:
4516- Hits: 31536
4517- Entries: 323
4518- Dropped: 0
4519-
4520- The above list does give us a breakdown of the ioctl syscall by
4521- pid, but it also gives us quite a bit more than that, which we
4522- don't really care about at the moment. Since we know the syscall
4523- id for sys_ioctl (16, displayed next to the sys_ioctl name), we
4524- can use that to filter out all the other syscalls:
4525-
4526- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
4527- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4528-
4529- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4530- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
4531-
4532- { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
4533- { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
4534- { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
4535- { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
4536- { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
4537- { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
4538- { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
4539- { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
4540- { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
4541- .
4542- .
4543- .
4544- { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
4545- { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
4546- { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
4547- { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
4548- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
4549- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
4550-
4551- Totals:
4552- Hits: 101162
4553- Entries: 103
4554- Dropped: 0
4555-
4556- The above output shows that 'compiz' and 'Xorg' are far and away
4557- the heaviest ioctl callers (which might lead to questions about
4558- whether they really need to be making all those calls and to
4559- possible avenues for further investigation).
4560-
4561- The compound key examples used a key and a sum value (hitcount) to
4562- sort the output, but we can just as easily use two keys instead.
4563- Here's an example where we use a compound key composed of the
4564- common_pid and size event fields. Sorting with pid as the primary
4565- key and 'size' as the secondary key allows us to display an
4566- ordered summary of the recvfrom sizes, with counts, received by
4567- each process:
4568-
4569- # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
4570- /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
4571-
4572- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
4573- # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
4574-
4575- { common_pid: smbd [ 784], size: 4 } hitcount: 1
4576- { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
4577- { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
4578- { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
4579- { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
4580- { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
4581- { common_pid: compiz [ 2994], size: 8 } hitcount: 1
4582- { common_pid: compiz [ 2994], size: 20 } hitcount: 11
4583- { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
4584- { common_pid: firefox [ 8817], size: 4 } hitcount: 1
4585- { common_pid: firefox [ 8817], size: 8 } hitcount: 5
4586- { common_pid: firefox [ 8817], size: 588 } hitcount: 2
4587- { common_pid: firefox [ 8817], size: 628 } hitcount: 1
4588- { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
4589- { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
4590- { common_pid: firefox [ 8822], size: 8 } hitcount: 2
4591- { common_pid: firefox [ 8822], size: 160 } hitcount: 2
4592- { common_pid: firefox [ 8822], size: 320 } hitcount: 2
4593- { common_pid: firefox [ 8822], size: 352 } hitcount: 1
4594- .
4595- .
4596- .
4597- { common_pid: pool [ 8923], size: 1960 } hitcount: 10
4598- { common_pid: pool [ 8923], size: 2048 } hitcount: 10
4599- { common_pid: pool [ 8924], size: 1960 } hitcount: 10
4600- { common_pid: pool [ 8924], size: 2048 } hitcount: 10
4601- { common_pid: pool [ 8928], size: 1964 } hitcount: 4
4602- { common_pid: pool [ 8928], size: 1965 } hitcount: 2
4603- { common_pid: pool [ 8928], size: 2048 } hitcount: 6
4604- { common_pid: pool [ 8929], size: 1982 } hitcount: 1
4605- { common_pid: pool [ 8929], size: 2048 } hitcount: 1
4606-
4607- Totals:
4608- Hits: 2016
4609- Entries: 224
4610- Dropped: 0
4611-
4612- The above example also illustrates the fact that although a compound
4613- key is treated as a single entity for hashing purposes, the sub-keys
4614- it's composed of can be accessed independently.
4615-
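- As an aside, an illustrative (untested) way to see that independence
- is to sort the same compound key on just the 'size' sub-key:
-
-    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size' > \
-          /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
-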
4616- The next example uses a string field as the hash key and
4617- demonstrates how you can manually pause and continue a hist trigger.
4618- In this example, we'll aggregate fork counts and don't expect a
4619- large number of entries in the hash table, so we'll drop it to a
4620- much smaller number, say 256:
4621-
4622- # echo 'hist:key=child_comm:val=hitcount:size=256' > \
4623- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4624-
4625- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4626- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4627-
4628- { child_comm: dconf worker } hitcount: 1
4629- { child_comm: ibus-daemon } hitcount: 1
4630- { child_comm: whoopsie } hitcount: 1
4631- { child_comm: smbd } hitcount: 1
4632- { child_comm: gdbus } hitcount: 1
4633- { child_comm: kthreadd } hitcount: 1
4634- { child_comm: dconf worker } hitcount: 1
4635- { child_comm: evolution-alarm } hitcount: 2
4636- { child_comm: Socket Thread } hitcount: 2
4637- { child_comm: postgres } hitcount: 2
4638- { child_comm: bash } hitcount: 3
4639- { child_comm: compiz } hitcount: 3
4640- { child_comm: evolution-sourc } hitcount: 4
4641- { child_comm: dhclient } hitcount: 4
4642- { child_comm: pool } hitcount: 5
4643- { child_comm: nm-dispatcher.a } hitcount: 8
4644- { child_comm: firefox } hitcount: 8
4645- { child_comm: dbus-daemon } hitcount: 8
4646- { child_comm: glib-pacrunner } hitcount: 10
4647- { child_comm: evolution } hitcount: 23
4648-
4649- Totals:
4650- Hits: 89
4651- Entries: 20
4652- Dropped: 0
4653-
4654- If we want to pause the hist trigger, we can simply append :pause to
4655- the command that started the trigger. Notice that the trigger info
4656- displays as [paused]:
4657-
4658- # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
4659- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4660-
4661- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4662- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
4663-
4664- { child_comm: dconf worker } hitcount: 1
4665- { child_comm: kthreadd } hitcount: 1
4666- { child_comm: dconf worker } hitcount: 1
4667- { child_comm: gdbus } hitcount: 1
4668- { child_comm: ibus-daemon } hitcount: 1
4669- { child_comm: Socket Thread } hitcount: 2
4670- { child_comm: evolution-alarm } hitcount: 2
4671- { child_comm: smbd } hitcount: 2
4672- { child_comm: bash } hitcount: 3
4673- { child_comm: whoopsie } hitcount: 3
4674- { child_comm: compiz } hitcount: 3
4675- { child_comm: evolution-sourc } hitcount: 4
4676- { child_comm: pool } hitcount: 5
4677- { child_comm: postgres } hitcount: 6
4678- { child_comm: firefox } hitcount: 8
4679- { child_comm: dhclient } hitcount: 10
4680- { child_comm: emacs } hitcount: 12
4681- { child_comm: dbus-daemon } hitcount: 20
4682- { child_comm: nm-dispatcher.a } hitcount: 20
4683- { child_comm: evolution } hitcount: 35
4684- { child_comm: glib-pacrunner } hitcount: 59
4685-
4686- Totals:
4687- Hits: 199
4688- Entries: 21
4689- Dropped: 0
4690-
4691- To manually continue having the trigger aggregate events, append
4692- :cont instead. Notice that the trigger info displays as [active]
4693- again, and the data has changed:
4694-
4695- # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
4696- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4697-
4698- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4699- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4700-
4701- { child_comm: dconf worker } hitcount: 1
4702- { child_comm: dconf worker } hitcount: 1
4703- { child_comm: kthreadd } hitcount: 1
4704- { child_comm: gdbus } hitcount: 1
4705- { child_comm: ibus-daemon } hitcount: 1
4706- { child_comm: Socket Thread } hitcount: 2
4707- { child_comm: evolution-alarm } hitcount: 2
4708- { child_comm: smbd } hitcount: 2
4709- { child_comm: whoopsie } hitcount: 3
4710- { child_comm: compiz } hitcount: 3
4711- { child_comm: evolution-sourc } hitcount: 4
4712- { child_comm: bash } hitcount: 5
4713- { child_comm: pool } hitcount: 5
4714- { child_comm: postgres } hitcount: 6
4715- { child_comm: firefox } hitcount: 8
4716- { child_comm: dhclient } hitcount: 11
4717- { child_comm: emacs } hitcount: 12
4718- { child_comm: dbus-daemon } hitcount: 22
4719- { child_comm: nm-dispatcher.a } hitcount: 22
4720- { child_comm: evolution } hitcount: 35
4721- { child_comm: glib-pacrunner } hitcount: 59
4722-
4723- Totals:
4724- Hits: 206
4725- Entries: 21
4726- Dropped: 0
4727-
4728- The previous example showed how to start and stop a hist trigger by
4729- appending 'pause' and 'continue' to the hist trigger command. A
4730- hist trigger can also be started in a paused state by initially
4731- starting the trigger with ':pause' appended. This allows you to
4732- start the trigger only when you're ready to start collecting data
4733- and not before. For example, you could start the trigger in a
4734- paused state, then unpause it and do something you want to measure,
4735- then pause the trigger again when done.
4736-
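- As a rough sketch of that manual workflow (reusing the fork trigger
- from the previous example; the middle step is whatever workload you
- want to measure):
-
-    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
-          /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
-
-    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
-          /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
-
-      ... run the workload of interest ...
-
-    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
-          /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
-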
4737- Of course, doing this manually can be difficult and error-prone, but
4738- it is possible to automatically start and stop a hist trigger based
4739- on some condition, via the enable_hist and disable_hist triggers.
4740-
4741- For example, suppose we wanted to take a look at the relative
4742- weights in terms of skb length for each callpath that leads to a
4743- netif_receive_skb event when downloading a decent-sized file using
4744- wget.
4745-
4746- First we set up an initially paused stacktrace trigger on the
4747- netif_receive_skb event:
4748-
4749- # echo 'hist:key=stacktrace:vals=len:pause' > \
4750- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4751-
4752- Next, we set up an 'enable_hist' trigger on the sched_process_exec
4753- event, with an 'if filename==/usr/bin/wget' filter. The effect of
4754- this new trigger is that it will 'unpause' the hist trigger we just
4755- set up on netif_receive_skb if and only if it sees a
4756- sched_process_exec event with a filename of '/usr/bin/wget'. When
4757- that happens, all netif_receive_skb events are aggregated into a
4758- hash table keyed on stacktrace:
4759-
4760- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
4761- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4762-
4763- The aggregation continues until the netif_receive_skb hist trigger
4764- is paused again, which is what the following disable_hist trigger does by
4765- creating a similar setup on the sched_process_exit event, using the
4766- filter 'comm==wget':
4767-
4768- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
4769- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4770-
4771- Whenever a process exits and the comm field of the disable_hist
4772- trigger filter matches 'comm==wget', the netif_receive_skb hist
4773- trigger is disabled.
4774-
4775- The overall effect is that netif_receive_skb events are aggregated
4776- into the hash table for only the duration of the wget. Executing a
4777- wget command and then listing the 'hist' file will display the
4778- output generated by the wget command:
4779-
4780- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4781-
4782- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4783- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4784-
4785- { stacktrace:
4786- __netif_receive_skb_core+0x46d/0x990
4787- __netif_receive_skb+0x18/0x60
4788- netif_receive_skb_internal+0x23/0x90
4789- napi_gro_receive+0xc8/0x100
4790- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4791- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4792- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4793- ieee80211_rx+0x31d/0x900 [mac80211]
4794- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4795- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4796- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4797- irq_thread_fn+0x20/0x50
4798- irq_thread+0x11f/0x150
4799- kthread+0xd2/0xf0
4800- ret_from_fork+0x42/0x70
4801- } hitcount: 85 len: 28884
4802- { stacktrace:
4803- __netif_receive_skb_core+0x46d/0x990
4804- __netif_receive_skb+0x18/0x60
4805- netif_receive_skb_internal+0x23/0x90
4806- napi_gro_complete+0xa4/0xe0
4807- dev_gro_receive+0x23a/0x360
4808- napi_gro_receive+0x30/0x100
4809- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4810- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4811- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4812- ieee80211_rx+0x31d/0x900 [mac80211]
4813- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4814- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4815- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4816- irq_thread_fn+0x20/0x50
4817- irq_thread+0x11f/0x150
4818- kthread+0xd2/0xf0
4819- } hitcount: 98 len: 664329
4820- { stacktrace:
4821- __netif_receive_skb_core+0x46d/0x990
4822- __netif_receive_skb+0x18/0x60
4823- process_backlog+0xa8/0x150
4824- net_rx_action+0x15d/0x340
4825- __do_softirq+0x114/0x2c0
4826- do_softirq_own_stack+0x1c/0x30
4827- do_softirq+0x65/0x70
4828- __local_bh_enable_ip+0xb5/0xc0
4829- ip_finish_output+0x1f4/0x840
4830- ip_output+0x6b/0xc0
4831- ip_local_out_sk+0x31/0x40
4832- ip_send_skb+0x1a/0x50
4833- udp_send_skb+0x173/0x2a0
4834- udp_sendmsg+0x2bf/0x9f0
4835- inet_sendmsg+0x64/0xa0
4836- sock_sendmsg+0x3d/0x50
4837- } hitcount: 115 len: 13030
4838- { stacktrace:
4839- __netif_receive_skb_core+0x46d/0x990
4840- __netif_receive_skb+0x18/0x60
4841- netif_receive_skb_internal+0x23/0x90
4842- napi_gro_complete+0xa4/0xe0
4843- napi_gro_flush+0x6d/0x90
4844- iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
4845- irq_thread_fn+0x20/0x50
4846- irq_thread+0x11f/0x150
4847- kthread+0xd2/0xf0
4848- ret_from_fork+0x42/0x70
4849- } hitcount: 934 len: 5512212
4850-
4851- Totals:
4852- Hits: 1232
4853- Entries: 4
4854- Dropped: 0
4855-
4856- The above shows all the netif_receive_skb callpaths and their total
4857- lengths for the duration of the wget command.
4858-
4859- The 'clear' hist trigger param can be used to clear the hash table.
4860- Suppose we wanted to try another run of the previous example but
4861- this time also wanted to see the complete list of events that went
4862- into the histogram. In order to avoid having to set everything up
4863- again, we can just clear the histogram first:
4864-
4865- # echo 'hist:key=stacktrace:vals=len:clear' >> \
4866- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4867-
4868- Just to verify that it is in fact cleared, here's what we now see in
4869- the hist file:
4870-
4871- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4872- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4873-
4874- Totals:
4875- Hits: 0
4876- Entries: 0
4877- Dropped: 0
4878-
4879- Since we want to see the detailed list of every netif_receive_skb
4880- event occurring during the new run - these are in fact the same
4881- events being aggregated into the hash table - we add additional
4882- 'enable_event' and 'disable_event' triggers to the triggering
4883- sched_process_exec and sched_process_exit events, as follows:
4884-
4885- # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
4886- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4887-
4888- # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
4889- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4890-
4891- If you read the trigger files for the sched_process_exec and
4892- sched_process_exit triggers, you should see two triggers for each:
4893- one enabling/disabling the hist aggregation and the other
4894- enabling/disabling the logging of events:
4895-
4896- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4897- enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4898- enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4899-
4900- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4901- enable_event:net:netif_receive_skb:unlimited if comm==wget
4902- disable_hist:net:netif_receive_skb:unlimited if comm==wget
4903-
4904- In other words, whenever either of the sched_process_exec or
4905- sched_process_exit events is hit and matches 'wget', it enables or
4906- disables both the histogram and the event log, and what you end up
4907- with is a hash table and set of events just covering the specified
4908- duration. Run the wget command again:
4909-
4910- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4911-
4912- Displaying the 'hist' file should show something similar to what you
4913- saw in the last run, but this time you should also see the
4914- individual events in the trace file:
4915-
4916- # cat /sys/kernel/debug/tracing/trace
4917-
4918- # tracer: nop
4919- #
4920- # entries-in-buffer/entries-written: 183/1426 #P:4
4921- #
4922- # _-----=> irqs-off
4923- # / _----=> need-resched
4924- # | / _---=> hardirq/softirq
4925- # || / _--=> preempt-depth
4926- # ||| / delay
4927- # TASK-PID CPU# |||| TIMESTAMP FUNCTION
4928- # | | | |||| | |
4929- wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
4930- wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
4931- dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
4932- dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
4933- ##### CPU 2 buffer started ####
4934- irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
4935- irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
4936- irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
4937- irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
4938- irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
4939- .
4940- .
4941- .
4942-
4943- The following example demonstrates how multiple hist triggers can be
4944- attached to a given event. This capability can be useful for
4945- creating a set of different summaries derived from the same set of
4946- events, or for comparing the effects of different filters, among
4947- other things.
4948-
4949- # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
4950- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4951- # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
4952- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4953- # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
4954- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4955- # echo 'hist:keys=skbaddr.hex:vals=len' >> \
4956- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4957- # echo 'hist:keys=len:vals=common_preempt_count' >> \
4958- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4959-
4960- The above set of commands creates four triggers differing only in
4961- their filters, along with a completely different though fairly
4962- nonsensical trigger. Note that in order to append multiple hist
4963- triggers to the same file, you should use the '>>' operator to
4964- append them ('>' will also add the new hist trigger, but will remove
4965- any existing hist triggers beforehand).
4966-
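- As an aside (not done here, so all five histograms are still attached
- below), the usual '!' removal syntax should also let you drop just one
- of the appended triggers rather than all of them - a sketch, where the
- definition given must match the trigger being removed:
-
-    # echo '!hist:keys=skbaddr.hex:vals=len' > \
-          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
-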
4967- Displaying the contents of the 'hist' file for the event shows the
4968- contents of all five histograms:
4969-
4970- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4971-
4972- # event histogram
4973- #
4974- # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
4975- #
4976-
4977- { len: 176 } hitcount: 1 common_preempt_count: 0
4978- { len: 223 } hitcount: 1 common_preempt_count: 0
4979- { len: 4854 } hitcount: 1 common_preempt_count: 0
4980- { len: 395 } hitcount: 1 common_preempt_count: 0
4981- { len: 177 } hitcount: 1 common_preempt_count: 0
4982- { len: 446 } hitcount: 1 common_preempt_count: 0
4983- { len: 1601 } hitcount: 1 common_preempt_count: 0
4984- .
4985- .
4986- .
4987- { len: 1280 } hitcount: 66 common_preempt_count: 0
4988- { len: 116 } hitcount: 81 common_preempt_count: 40
4989- { len: 708 } hitcount: 112 common_preempt_count: 0
4990- { len: 46 } hitcount: 221 common_preempt_count: 0
4991- { len: 1264 } hitcount: 458 common_preempt_count: 0
4992-
4993- Totals:
4994- Hits: 1428
4995- Entries: 147
4996- Dropped: 0
4997-
4998-
4999- # event histogram
5000- #
5001- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5002- #
5003-
5004- { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
5005- { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
5006- { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
5007- { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
5008- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
5009- { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
5010- { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
5011- { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
5012- { skbaddr: ffff880100065900 } hitcount: 1 len: 46
5013- { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
5014- { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
5015- { skbaddr: ffff880100064700 } hitcount: 1 len: 365
5016- { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
5017- .
5018- .
5019- .
5020- { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
5021- { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
5022- { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
5023- { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
5024- { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
5025- { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
5026- { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
5027- { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
5028- { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
5029-
5030- Totals:
5031- Hits: 1451
5032- Entries: 318
5033- Dropped: 0
5034-
5035-
5036- # event histogram
5037- #
5038- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
5039- #
5040-
5041-
5042- Totals:
5043- Hits: 0
5044- Entries: 0
5045- Dropped: 0
5046-
5047-
5048- # event histogram
5049- #
5050- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
5051- #
5052-
5053- { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
5054- { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
5055- { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
5056- { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
5057- { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
5058- { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
5059- { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
5060- { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
5061- { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
5062- { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
5063- { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
5064- { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
5065-
5066- Totals:
5067- Hits: 14
5068- Entries: 12
5069- Dropped: 0
5070-
5071-
5072- # event histogram
5073- #
5074- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
5075- #
5076-
5077-
5078- Totals:
5079- Hits: 0
5080- Entries: 0
5081- Dropped: 0
5082-
5083- Named triggers can be used to have triggers share a common set of
5084- histogram data. This capability is mostly useful for combining the
5085- output of events generated by tracepoints contained inside inline
5086- functions, but names can be used in a hist trigger on any event.
5087- For example, these two triggers when hit will update the same 'len'
5088- field in the shared 'foo' histogram data:
5089-
5090- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5091- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5092- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5093- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5094-
5095- You can see that they're updating common histogram data by reading
5096- each event's hist files at the same time:
5097-
5098- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
5099- cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5100-
5101- # event histogram
5102- #
5103- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5104- #
5105-
5106- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
5107- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
5108- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
5109- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
5110- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
5111- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
5112- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
5113- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
5114- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
5115- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
5116- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
5117- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
5118- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
5119- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
5120- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
5121- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
5122- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
5123- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
5124- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
5125- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
5126- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
5127- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
5128- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
5129- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
5130- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
5131- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
5132- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
5133- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
5134- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
5135- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
5136- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
5137- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
5138- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
5139- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
5140- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
5141- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
5142- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
5143- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
5144- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
5145- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
5146- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
5147- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
5148-
5149- Totals:
5150- Hits: 81
5151- Entries: 42
5152- Dropped: 0
5153- # event histogram
5154- #
5155- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5156- #
5157-
5158- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
5159- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
5160- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
5161- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
5162- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
5163- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
5164- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
5165- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
5166- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
5167- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
5168- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
5169- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
5170- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
5171- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
5172- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
5173- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
5174- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
5175- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
5176- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
5177- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
5178- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
5179- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
5180- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
5181- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
5182- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
5183- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
5184- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
5185- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
5186- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
5187- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
5188- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
5189- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
5190- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
5191- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
5192- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
5193- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
5194- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
5195- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
5196- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
5197- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
5198- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
5199- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
5200-
5201- Totals:
5202- Hits: 81
5203- Entries: 42
5204- Dropped: 0
5205-
5206- And here's an example that shows how to combine histogram data from
5207- any two events even if they don't share any 'compatible' fields
5208- other than 'hitcount' and 'stacktrace'. These commands create a
5209- couple of triggers named 'bar' using those fields:
5210-
5211- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5212- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
5213- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5214- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5215-
5216- And displaying the output of either shows some interesting if
5217- somewhat confusing output:
5218-
5219- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
5220- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5221-
5222- # event histogram
5223- #
5224- # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
5225- #
5226-
5227- { stacktrace:
5228- _do_fork+0x18e/0x330
5229- kernel_thread+0x29/0x30
5230- kthreadd+0x154/0x1b0
5231- ret_from_fork+0x3f/0x70
5232- } hitcount: 1
5233- { stacktrace:
5234- netif_rx_internal+0xb2/0xd0
5235- netif_rx_ni+0x20/0x70
5236- dev_loopback_xmit+0xaa/0xd0
5237- ip_mc_output+0x126/0x240
5238- ip_local_out_sk+0x31/0x40
5239- igmp_send_report+0x1e9/0x230
5240- igmp_timer_expire+0xe9/0x120
5241- call_timer_fn+0x39/0xf0
5242- run_timer_softirq+0x1e1/0x290
5243- __do_softirq+0xfd/0x290
5244- irq_exit+0x98/0xb0
5245- smp_apic_timer_interrupt+0x4a/0x60
5246- apic_timer_interrupt+0x6d/0x80
5247- cpuidle_enter+0x17/0x20
5248- call_cpuidle+0x3b/0x60
5249- cpu_startup_entry+0x22d/0x310
5250- } hitcount: 1
5251- { stacktrace:
5252- netif_rx_internal+0xb2/0xd0
5253- netif_rx_ni+0x20/0x70
5254- dev_loopback_xmit+0xaa/0xd0
5255- ip_mc_output+0x17f/0x240
5256- ip_local_out_sk+0x31/0x40
5257- ip_send_skb+0x1a/0x50
5258- udp_send_skb+0x13e/0x270
5259- udp_sendmsg+0x2bf/0x980
5260- inet_sendmsg+0x67/0xa0
5261- sock_sendmsg+0x38/0x50
5262- SYSC_sendto+0xef/0x170
5263- SyS_sendto+0xe/0x10
5264- entry_SYSCALL_64_fastpath+0x12/0x6a
5265- } hitcount: 2
5266- { stacktrace:
5267- netif_rx_internal+0xb2/0xd0
5268- netif_rx+0x1c/0x60
5269- loopback_xmit+0x6c/0xb0
5270- dev_hard_start_xmit+0x219/0x3a0
5271- __dev_queue_xmit+0x415/0x4f0
5272- dev_queue_xmit_sk+0x13/0x20
5273- ip_finish_output2+0x237/0x340
5274- ip_finish_output+0x113/0x1d0
5275- ip_output+0x66/0xc0
5276- ip_local_out_sk+0x31/0x40
5277- ip_send_skb+0x1a/0x50
5278- udp_send_skb+0x16d/0x270
5279- udp_sendmsg+0x2bf/0x980
5280- inet_sendmsg+0x67/0xa0
5281- sock_sendmsg+0x38/0x50
5282- ___sys_sendmsg+0x14e/0x270
5283- } hitcount: 76
5284- { stacktrace:
5285- netif_rx_internal+0xb2/0xd0
5286- netif_rx+0x1c/0x60
5287- loopback_xmit+0x6c/0xb0
5288- dev_hard_start_xmit+0x219/0x3a0
5289- __dev_queue_xmit+0x415/0x4f0
5290- dev_queue_xmit_sk+0x13/0x20
5291- ip_finish_output2+0x237/0x340
5292- ip_finish_output+0x113/0x1d0
5293- ip_output+0x66/0xc0
5294- ip_local_out_sk+0x31/0x40
5295- ip_send_skb+0x1a/0x50
5296- udp_send_skb+0x16d/0x270
5297- udp_sendmsg+0x2bf/0x980
5298- inet_sendmsg+0x67/0xa0
5299- sock_sendmsg+0x38/0x50
5300- ___sys_sendmsg+0x269/0x270
5301- } hitcount: 77
5302- { stacktrace:
5303- netif_rx_internal+0xb2/0xd0
5304- netif_rx+0x1c/0x60
5305- loopback_xmit+0x6c/0xb0
5306- dev_hard_start_xmit+0x219/0x3a0
5307- __dev_queue_xmit+0x415/0x4f0
5308- dev_queue_xmit_sk+0x13/0x20
5309- ip_finish_output2+0x237/0x340
5310- ip_finish_output+0x113/0x1d0
5311- ip_output+0x66/0xc0
5312- ip_local_out_sk+0x31/0x40
5313- ip_send_skb+0x1a/0x50
5314- udp_send_skb+0x16d/0x270
5315- udp_sendmsg+0x2bf/0x980
5316- inet_sendmsg+0x67/0xa0
5317- sock_sendmsg+0x38/0x50
5318- SYSC_sendto+0xef/0x170
5319- } hitcount: 88
5320- { stacktrace:
5321- _do_fork+0x18e/0x330
5322- SyS_clone+0x19/0x20
5323- entry_SYSCALL_64_fastpath+0x12/0x6a
5324- } hitcount: 244
5325-
5326- Totals:
5327- Hits: 489
5328- Entries: 7
5329- Dropped: 0
5330+ See Documentation/trace/histogram.txt for details and examples.
5331diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/ftrace.txt linux-4.14/Documentation/trace/ftrace.txt
5332--- linux-4.14.orig/Documentation/trace/ftrace.txt 2017-11-12 19:46:13.000000000 +0100
5333+++ linux-4.14/Documentation/trace/ftrace.txt 2018-09-05 11:05:07.000000000 +0200
5334@@ -539,6 +539,30 @@
5335
5336 See events.txt for more information.
5337
5338+ timestamp_mode:
5339+
5340+ Certain tracers may change the timestamp mode used when
5341+ logging trace events into the event buffer. Events with
5342+ different modes can coexist within a buffer but the mode in
5343+ effect when an event is logged determines which timestamp mode
5344+ is used for that event. The default timestamp mode is
5345+ 'delta'.
5346+
5347+ Usual timestamp modes for tracing:
5348+
5349+ # cat timestamp_mode
5350+ [delta] absolute
5351+
5352+ The timestamp mode with the square brackets around it is the
5353+ one in effect.
5354+
5355+ delta: Default timestamp mode - timestamp is a delta against
5356+ a per-buffer timestamp.
5357+
5358+ absolute: The timestamp is a full timestamp, not a delta
5359+ against some other value. As such it takes up more
5360+ space and is less efficient.
5361+
5362 hwlat_detector:
5363
5364 Directory for the Hardware Latency Detector.
5365diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/histogram.txt linux-4.14/Documentation/trace/histogram.txt
5366--- linux-4.14.orig/Documentation/trace/histogram.txt 1970-01-01 01:00:00.000000000 +0100
5367+++ linux-4.14/Documentation/trace/histogram.txt 2018-09-05 11:05:07.000000000 +0200
5368@@ -0,0 +1,1995 @@
5369+ Event Histograms
5370+
5371+ Documentation written by Tom Zanussi
5372+
5373+1. Introduction
5374+===============
5375+
5376+ Histogram triggers are special event triggers that can be used to
5377+ aggregate trace event data into histograms. For information on
5378+ trace events and event triggers, see Documentation/trace/events.txt.
5379+
5380+
5381+2. Histogram Trigger Command
5382+============================
5383+
5384+ A histogram trigger command is an event trigger command that
5385+ aggregates event hits into a hash table keyed on one or more trace
5386+ event format fields (or stacktrace) and a set of running totals
5387+ derived from one or more trace event format fields and/or event
5388+ counts (hitcount).
5389+
5390+ The format of a hist trigger is as follows:
5391+
5392+ hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
5393+ [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
5394+ [:clear][:name=histname1] [if <filter>]
5395+
5396+ When a matching event is hit, an entry is added to a hash table
5397+ using the key(s) and value(s) named. Keys and values correspond to
5398+ fields in the event's format description. Values must correspond to
5399+ numeric fields - on an event hit, the value(s) will be added to a
5400+ sum kept for that field. The special string 'hitcount' can be used
5401+ in place of an explicit value field - this is simply a count of
5402+ event hits. If 'values' isn't specified, an implicit 'hitcount'
5403+ value will be automatically created and used as the only value.
5404+ Keys can be any field, or the special string 'stacktrace', which
5405+ will use the event's kernel stacktrace as the key. The keywords
5406+ 'keys' or 'key' can be used to specify keys, and the keywords
5407+ 'values', 'vals', or 'val' can be used to specify values. Compound
5408+ keys consisting of up to two fields can be specified by the 'keys'
5409+ keyword. Hashing a compound key produces a unique entry in the
5410+ table for each unique combination of component keys, and can be
5411+ useful for providing more fine-grained summaries of event data.
5412+ Additionally, sort keys consisting of up to two fields can be
5413+ specified by the 'sort' keyword. If more than one field is
5414+ specified, the result will be a 'sort within a sort': the first key
5415+ is taken to be the primary sort key and the second the secondary
5416+ key. If a hist trigger is given a name using the 'name' parameter,
5417+ its histogram data will be shared with other triggers of the same
5418+ name, and trigger hits will update this common data. Only triggers
5419+ with 'compatible' fields can be combined in this way; triggers are
5420+ 'compatible' if the fields named in the trigger share the same
5421+ number and type of fields and those fields also have the same names.
5422+ Note that any two events always share the compatible 'hitcount' and
5423+ 'stacktrace' fields and can therefore be combined using those
5424+ fields, however pointless that may be.
5425+
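+ As a quick illustration of the above (the examples section below walks
+ through real runs in detail), a sketch of a trigger using a compound
+ key and a two-level sort might look like this:
+
+    # echo 'hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid,size' > \
+          /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
+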
5426+ 'hist' triggers add a 'hist' file to each event's subdirectory.
5427+ Reading the 'hist' file for the event will dump the hash table in
5428+ its entirety to stdout. If there are multiple hist triggers
5429+ attached to an event, there will be a table for each trigger in the
5430+ output. The table displayed for a named trigger will be the same as
5431+ any other instance having the same name. Each printed hash table
5432+ entry is a simple list of the keys and values comprising the entry;
5433+ keys are printed first and are delineated by curly braces, and are
5434+ followed by the set of value fields for the entry. By default,
5435+ numeric fields are displayed as base-10 integers. This can be
5436+ modified by appending any of the following modifiers to the field
5437+ name:
5438+
5439+ .hex display a number as a hex value
5440+ .sym display an address as a symbol
5441+ .sym-offset display an address as a symbol and offset
5442+ .syscall display a syscall id as a system call name
5443+ .execname display a common_pid as a program name
5444+ .log2 display log2 value rather than raw number
5445+ .usecs display a common_timestamp in microseconds
5446+
5447+ Note that in general the semantics of a given field aren't
5448+ interpreted when applying a modifier to it, but there are some
5449+ restrictions to be aware of in this regard:
5450+
5451+ - only the 'hex' modifier can be used for values (because values
5452+ are essentially sums, and the other modifiers don't make sense
5453+ in that context).
5454+ - the 'execname' modifier can only be used on a 'common_pid'. The
5455+ reason for this is that the execname is simply the 'comm' value
5456+ saved for the 'current' process when an event was triggered,
5457+ which is the same as the common_pid value saved by the event
5458+ tracing code. Trying to apply that comm value to other pid
5459+ values wouldn't be correct, and typically events that care save
5460+ pid-specific comm fields in the event itself.
5461+
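+ For example (sketches only, reusing events that appear in the examples
+ below), modifiers are simply appended to the field name wherever it is
+ used:
+
+    # echo 'hist:keys=call_site.sym:vals=bytes_req' > \
+          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+    # echo 'hist:keys=id.syscall,common_pid.execname:vals=hitcount' > \
+          /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+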
5462+ A typical usage scenario would be the following to enable a hist
5463+ trigger, read its current contents, and then turn it off:
5464+
5465+ # echo 'hist:keys=skbaddr.hex:vals=len' > \
5466+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5467+
5468+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5469+
5470+ # echo '!hist:keys=skbaddr.hex:vals=len' > \
5471+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5472+
5473+ The trigger file itself can be read to show the details of the
5474+ currently attached hist trigger. This information is also displayed
5475+ at the top of the 'hist' file when read.
5476+
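+ For example, with the netif_rx trigger above attached, reading its
+ trigger file shows something like the following (illustrative output):
+
+    # cat /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+    hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
+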
5477+ By default, the size of the hash table is 2048 entries. The 'size'
5478+ parameter can be used to specify more or fewer than that. The units
5479+ are in terms of hashtable entries - if a run uses more entries than
5480+ specified, the results will show the number of 'drops', the number
5481+ of hits that were ignored. The size should be a power of 2 between
5482+ and 131072 (any non-power-of-2 number specified will be rounded
5483+ up).
5484+
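+ For example, to allow up to 8192 entries (an arbitrary power of 2,
+ chosen purely for illustration):
+
+    # echo 'hist:keys=call_site:vals=bytes_req:size=8192' > \
+          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+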
5485+ The 'sort' parameter can be used to specify a value field to sort
5486+ on. The default if unspecified is 'hitcount' and the default sort
5487+ order is 'ascending'. To sort in the opposite direction, append
5488+ '.descending' to the sort key.
5489+
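+ For example, a sketch that sorts the kmalloc table by total bytes
+ requested, largest first:
+
+    # echo 'hist:keys=call_site:vals=bytes_req:sort=bytes_req.descending' > \
+          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+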
5490+ The 'pause' parameter can be used to pause an existing hist trigger
5491+ or to start a hist trigger but not log any events until told to do
5492+ so. 'continue' or 'cont' can be used to start or restart a paused
5493+ hist trigger.
5494+
5495+ The 'clear' parameter will clear the contents of a running hist
5496+ trigger and leave its current paused/active state unchanged.
5497+
5498+ Note that the 'pause', 'cont', and 'clear' parameters should be
5499+ applied to an existing trigger using the 'append' shell operator
5500+ ('>>') rather than via the '>' operator, which will cause
5501+ the trigger to be removed through truncation.
5502+
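+ For example, assuming a trigger like the netif_rx one shown earlier is
+ attached, each of the following sketches leaves the trigger in place
+ and only changes its state or contents:
+
+    # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
+          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+
+    # echo 'hist:keys=skbaddr.hex:vals=len:cont' >> \
+          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+
+    # echo 'hist:keys=skbaddr.hex:vals=len:clear' >> \
+          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+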
5503+- enable_hist/disable_hist
5504+
5505+ The enable_hist and disable_hist triggers can be used to have one
5506+ event conditionally start and stop another event's already-attached
5507+ hist trigger. Any number of enable_hist and disable_hist triggers
5508+ can be attached to a given event, allowing that event to kick off
5509+ and stop aggregations on a host of other events.
5510+
5511+ The format is very similar to the enable/disable_event triggers:
5512+
5513+ enable_hist:<system>:<event>[:count]
5514+ disable_hist:<system>:<event>[:count]
5515+
5516+ Instead of enabling or disabling the tracing of the target event
5517+ into the trace buffer as the enable/disable_event triggers do, the
5518+ enable/disable_hist triggers enable or disable the aggregation of
5519+ the target event into a hash table.
5520+
5521+ A typical usage scenario for the enable_hist/disable_hist triggers
5522+ would be to first set up a paused hist trigger on some event,
5523+ followed by an enable_hist/disable_hist pair that turns the hist
5524+ aggregation on and off when conditions of interest are hit:
5525+
5526+ # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
5527+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5528+
5529+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
5530+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
5531+
5532+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
5533+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
5534+
5535+ The above sets up an initially paused hist trigger which is unpaused
5536+ and starts aggregating events when a given program is executed, and
5537+ which stops aggregating when the process exits and the hist trigger
5538+ is paused again.
5539+
5540+ The examples below provide a more concrete illustration of the
5541+ concepts and typical usage patterns discussed above.
5542+
5543+ 'special' event fields
5544+ ------------------------
5545+
5546+ There are a number of 'special event fields' available for use as
5547+ keys or values in a hist trigger. These look like and behave as if
5548+ they were actual event fields, but aren't really part of the event's
5549+ field definition or format file. They are however available for any
5550+ event, and can be used anywhere an actual event field could be.
5551+ They are:
5552+
5553+ common_timestamp u64 - timestamp (from ring buffer) associated
5554+ with the event, in nanoseconds. May be
5555+ modified by .usecs to have timestamps
5556+ interpreted as microseconds.
5557+ cpu int - the cpu on which the event occurred.
5558+
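+ For example (a sketch, with the event chosen arbitrarily), 'cpu' can be
+ used as a key just like any real event field:
+
+    # echo 'hist:keys=cpu:vals=hitcount' > \
+          /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
+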
5559+ Extended error information
5560+ --------------------------
5561+
5562+ For some error conditions encountered when invoking a hist trigger
5563+ command, extended error information is available via the
5564+ corresponding event's 'hist' file. Reading the hist file after an
5565+ error will display more detailed information about what went wrong,
5566+ if information is available. This extended error information will
5567+ be available until the next hist trigger command for that event.
5568+
5569+ If available for a given error condition, the extended error
5570+ information and usage takes the following form:
5571+
5572+ # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
5573+ echo: write error: Invalid argument
5574+
5575+ # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
5576+ ERROR: Couldn't yyy: zzz
5577+ Last command: xxx
5578+
5579+6.2 'hist' trigger examples
5580+---------------------------
5581+
5582+ The first set of examples creates aggregations using the kmalloc
5583+ event. The fields that can be used for the hist trigger are listed
5584+ in the kmalloc event's format file:
5585+
5586+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
5587+ name: kmalloc
5588+ ID: 374
5589+ format:
5590+ field:unsigned short common_type; offset:0; size:2; signed:0;
5591+ field:unsigned char common_flags; offset:2; size:1; signed:0;
5592+ field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
5593+ field:int common_pid; offset:4; size:4; signed:1;
5594+
5595+ field:unsigned long call_site; offset:8; size:8; signed:0;
5596+ field:const void * ptr; offset:16; size:8; signed:0;
5597+ field:size_t bytes_req; offset:24; size:8; signed:0;
5598+ field:size_t bytes_alloc; offset:32; size:8; signed:0;
5599+ field:gfp_t gfp_flags; offset:40; size:4; signed:0;
5600+
5601+ We'll start by creating a hist trigger that generates a simple table
5602+ that lists the total number of bytes requested for each function in
5603+ the kernel that made one or more calls to kmalloc:
5604+
5605+ # echo 'hist:key=call_site:val=bytes_req' > \
5606+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5607+
5608+ This tells the tracing system to create a 'hist' trigger using the
5609+ call_site field of the kmalloc event as the key for the table, which
5610+ just means that each unique call_site address will have an entry
5611+ created for it in the table. The 'val=bytes_req' parameter tells
5612+ the hist trigger that for each unique entry (call_site) in the
5613+ table, it should keep a running total of the number of bytes
5614+ requested by that call_site.
5615+
5616+ We'll let it run for a while and then dump the contents of the 'hist'
5617+ file in the kmalloc event's subdirectory (for readability, a number
5618+ of entries have been omitted):
5619+
5620+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5621+ # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5622+
5623+ { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
5624+ { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
5625+ { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
5626+ { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
5627+ { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
5628+ { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
5629+ { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
5630+ { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
5631+ { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
5632+ { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
5633+ .
5634+ .
5635+ .
5636+ { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
5637+ { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
5638+ { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
5639+ { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
5640+ { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
5641+ { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
5642+ { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
5643+ { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
5644+ { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
5645+ { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
5646+ { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
5647+ { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
5648+
5649+ Totals:
5650+ Hits: 4610
5651+ Entries: 45
5652+ Dropped: 0
5653+
5654+ The output displays a line for each entry, beginning with the key
5655+ specified in the trigger, followed by the value(s) also specified in
5656+ the trigger. At the beginning of the output is a line that displays
5657+ the trigger info, which can also be displayed by reading the
5658+ 'trigger' file:
5659+
5660+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5661+ hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5662+
5663+ At the end of the output are a few lines that display the overall
5664+ totals for the run. The 'Hits' field shows the total number of
5665+ times the event trigger was hit, the 'Entries' field shows the total
5666+ number of used entries in the hash table, and the 'Dropped' field
5667+ shows the number of hits that were dropped because the number of
5668+ used entries for the run exceeded the maximum number of entries
5669+ allowed for the table (normally 0, but if not, it's a hint that you may
5670+ want to increase the size of the table using the 'size' parameter).
5671+
5672+ Notice in the above output that there's an extra field, 'hitcount',
5673+ which wasn't specified in the trigger. Also notice that in the
5674+ trigger info output, there's a parameter, 'sort=hitcount', which
5675+ wasn't specified in the trigger either. The reason for that is that
5676+ every trigger implicitly keeps a count of the total number of hits
5677+ attributed to a given entry, called the 'hitcount'. That hitcount
5678+ information is explicitly displayed in the output, and in the
5679+ absence of a user-specified sort parameter, is used as the default
5680+ sort field.
5681+
5682+ The value 'hitcount' can be used in place of an explicit value in
5683+ the 'values' parameter if you don't really need to have any
5684+ particular field summed and are mainly interested in hit
5685+ frequencies.
5686+
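+ For instance, the very first trigger above could just as well have
+ been written with 'val=hitcount' (an illustrative variant, output
+ omitted), which would simply count kmalloc hits per call_site without
+ summing bytes_req:
+
+ # echo 'hist:key=call_site:val=hitcount' > \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+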
5687+ To turn the hist trigger off, simply call up the trigger in the
5688+ command history and re-execute it with a '!' prepended:
5689+
5690+ # echo '!hist:key=call_site:val=bytes_req' > \
5691+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5692+
5693+ Finally, notice that the call_site as displayed in the output above
5694+ isn't really very useful. It's an address, but normally addresses
5695+ are displayed in hex. To have a numeric field displayed as a hex
5696+ value, simply append '.hex' to the field name in the trigger:
5697+
5698+ # echo 'hist:key=call_site.hex:val=bytes_req' > \
5699+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5700+
5701+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5702+ # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
5703+
5704+ { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
5705+ { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
5706+ { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
5707+ { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
5708+ { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
5709+ { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
5710+ { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
5711+ { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
5712+ { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
5713+ { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
5714+ { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
5715+ { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
5716+ .
5717+ .
5718+ .
5719+ { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
5720+ { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
5721+ { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
5722+ { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
5723+ { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
5724+ { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
5725+ { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
5726+ { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
5727+ { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
5728+ { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
5729+ { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
5730+
5731+ Totals:
5732+ Hits: 4775
5733+ Entries: 46
5734+ Dropped: 0
5735+
5736+ Even that's only marginally more useful - while hex values do look
5737+ more like addresses, what users are typically more interested in
5738+ when looking at text addresses are the corresponding symbols
5739+ instead. To have an address displayed as a symbolic value,
5740+ simply append '.sym' or '.sym-offset' to the field name in the
5741+ trigger:
5742+
5743+ # echo 'hist:key=call_site.sym:val=bytes_req' > \
5744+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5745+
5746+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5747+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
5748+
5749+ { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
5750+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
5751+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
5752+ { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
5753+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
5754+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
5755+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
5756+ { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
5757+ { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
5758+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
5759+ { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
5760+ { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
5761+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
5762+ { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
5763+ .
5764+ .
5765+ .
5766+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
5767+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
5768+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
5769+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
5770+ { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
5771+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
5772+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
5773+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
5774+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
5775+ { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
5776+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
5777+ { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
5778+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
5779+
5780+ Totals:
5781+ Hits: 109928
5782+ Entries: 71
5783+ Dropped: 0
5784+
5785+ Because the default sort key above is 'hitcount', the above shows the
5786+ list of call_sites by increasing hitcount, so that at the bottom
5787+ we see the functions that made the most kmalloc calls during the
5788+ run. If instead we wanted to see the top kmalloc callers in
5789+ terms of the number of bytes requested rather than the number of
5790+ calls, and we wanted the top caller to appear at the top, we can use
5791+ the 'sort' parameter, along with the 'descending' modifier:
5792+
5793+ # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
5794+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5795+
5796+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5797+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5798+
5799+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
5800+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
5801+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
5802+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
5803+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
5804+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
5805+ { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
5806+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
5807+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
5808+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
5809+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
5810+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
5811+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
5812+ .
5813+ .
5814+ .
5815+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
5816+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
5817+ { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
5818+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
5819+ { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
5820+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
5821+ { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
5822+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
5823+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
5824+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
5825+
5826+ Totals:
5827+ Hits: 32133
5828+ Entries: 81
5829+ Dropped: 0
5830+
5831+ To display the offset and size information in addition to the symbol
5832+ name, just use 'sym-offset' instead:
5833+
5834+ # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
5835+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5836+
5837+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5838+ # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5839+
5840+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
5841+ { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
5842+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
5843+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
5844+ { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
5845+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
5846+ { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
5847+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
5848+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
5849+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
5850+ { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
5851+ { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
5852+ .
5853+ .
5854+ .
5855+ { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
5856+ { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
5857+ { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
5858+ { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
5859+ { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
5860+ { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
5861+ { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
5862+
5863+ Totals:
5864+ Hits: 26098
5865+ Entries: 64
5866+ Dropped: 0
5867+
5868+ We can also add multiple fields to the 'values' parameter. For
5869+ example, we might want to see the total number of bytes allocated
5870+ alongside bytes requested, and display the result sorted by bytes
5871+ allocated in descending order:
5872+
5873+ # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
5874+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5875+
5876+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5877+ # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
5878+
5879+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
5880+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
5881+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
5882+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
5883+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
5884+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
5885+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
5886+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
5887+ { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
5888+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
5889+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
5890+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
5891+ .
5892+ .
5893+ .
5894+ { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
5895+ { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5896+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5897+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5898+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
5899+ { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
5900+ { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
5901+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
5902+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
5903+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
5904+
5905+ Totals:
5906+ Hits: 66598
5907+ Entries: 65
5908+ Dropped: 0
5909+
5910+ Finally, to finish off our kmalloc example, instead of simply having
5911+ the hist trigger display symbolic call_sites, we can have the hist
5912+ trigger additionally display the complete set of kernel stack traces
5913+ that led to each call_site. To do that, we simply use the special
5914+ value 'stacktrace' for the key parameter:
5915+
5916+ # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
5917+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5918+
5919+ The above trigger will use the kernel stack trace in effect when an
5920+ event is triggered as the key for the hash table. This allows the
5921+ enumeration of every kernel callpath that led up to a particular
5922+ event, along with a running total of any of the event fields for
5923+ that event. Here we tally bytes requested and bytes allocated for
5924+ every callpath in the system that led up to a kmalloc (in this case
5925+ every callpath to a kmalloc for a kernel compile):
5926+
5927+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5928+ # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
5929+
5930+ { stacktrace:
5931+ __kmalloc_track_caller+0x10b/0x1a0
5932+ kmemdup+0x20/0x50
5933+ hidraw_report_event+0x8a/0x120 [hid]
5934+ hid_report_raw_event+0x3ea/0x440 [hid]
5935+ hid_input_report+0x112/0x190 [hid]
5936+ hid_irq_in+0xc2/0x260 [usbhid]
5937+ __usb_hcd_giveback_urb+0x72/0x120
5938+ usb_giveback_urb_bh+0x9e/0xe0
5939+ tasklet_hi_action+0xf8/0x100
5940+ __do_softirq+0x114/0x2c0
5941+ irq_exit+0xa5/0xb0
5942+ do_IRQ+0x5a/0xf0
5943+ ret_from_intr+0x0/0x30
5944+ cpuidle_enter+0x17/0x20
5945+ cpu_startup_entry+0x315/0x3e0
5946+ rest_init+0x7c/0x80
5947+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
5948+ { stacktrace:
5949+ __kmalloc_track_caller+0x10b/0x1a0
5950+ kmemdup+0x20/0x50
5951+ hidraw_report_event+0x8a/0x120 [hid]
5952+ hid_report_raw_event+0x3ea/0x440 [hid]
5953+ hid_input_report+0x112/0x190 [hid]
5954+ hid_irq_in+0xc2/0x260 [usbhid]
5955+ __usb_hcd_giveback_urb+0x72/0x120
5956+ usb_giveback_urb_bh+0x9e/0xe0
5957+ tasklet_hi_action+0xf8/0x100
5958+ __do_softirq+0x114/0x2c0
5959+ irq_exit+0xa5/0xb0
5960+ do_IRQ+0x5a/0xf0
5961+ ret_from_intr+0x0/0x30
5962+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
5963+ { stacktrace:
5964+ kmem_cache_alloc_trace+0xeb/0x150
5965+ aa_alloc_task_context+0x27/0x40
5966+ apparmor_cred_prepare+0x1f/0x50
5967+ security_prepare_creds+0x16/0x20
5968+ prepare_creds+0xdf/0x1a0
5969+ SyS_capset+0xb5/0x200
5970+ system_call_fastpath+0x12/0x6a
5971+ } hitcount: 1 bytes_req: 32 bytes_alloc: 32
5972+ .
5973+ .
5974+ .
5975+ { stacktrace:
5976+ __kmalloc+0x11b/0x1b0
5977+ i915_gem_execbuffer2+0x6c/0x2c0 [i915]
5978+ drm_ioctl+0x349/0x670 [drm]
5979+ do_vfs_ioctl+0x2f0/0x4f0
5980+ SyS_ioctl+0x81/0xa0
5981+ system_call_fastpath+0x12/0x6a
5982+ } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
5983+ { stacktrace:
5984+ __kmalloc+0x11b/0x1b0
5985+ load_elf_phdrs+0x76/0xa0
5986+ load_elf_binary+0x102/0x1650
5987+ search_binary_handler+0x97/0x1d0
5988+ do_execveat_common.isra.34+0x551/0x6e0
5989+ SyS_execve+0x3a/0x50
5990+ return_from_execve+0x0/0x23
5991+ } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
5992+ { stacktrace:
5993+ kmem_cache_alloc_trace+0xeb/0x150
5994+ apparmor_file_alloc_security+0x27/0x40
5995+ security_file_alloc+0x16/0x20
5996+ get_empty_filp+0x93/0x1c0
5997+ path_openat+0x31/0x5f0
5998+ do_filp_open+0x3a/0x90
5999+ do_sys_open+0x128/0x220
6000+ SyS_open+0x1e/0x20
6001+ system_call_fastpath+0x12/0x6a
6002+ } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
6003+ { stacktrace:
6004+ __kmalloc+0x11b/0x1b0
6005+ seq_buf_alloc+0x1b/0x50
6006+ seq_read+0x2cc/0x370
6007+ proc_reg_read+0x3d/0x80
6008+ __vfs_read+0x28/0xe0
6009+ vfs_read+0x86/0x140
6010+ SyS_read+0x46/0xb0
6011+ system_call_fastpath+0x12/0x6a
6012+ } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
6013+
6014+ Totals:
6015+ Hits: 6085872
6016+ Entries: 253
6017+ Dropped: 0
6018+
6019+ If you key a hist trigger on common_pid, for example in order to
6020+ gather and display sorted totals for each process, you can use the
6021+ special .execname modifier to display the executable names for the
6022+ processes in the table rather than raw pids. The example below
6023+ keeps a per-process sum of total bytes read:
6024+
6025+ # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
6026+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
6027+
6028+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
6029+ # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
6030+
6031+ { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
6032+ { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
6033+ { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
6034+ { common_pid: bash [ 8710] } hitcount: 3 count: 66369
6035+ { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
6036+ { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
6037+ { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
6038+ { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
6039+ { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
6040+ { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
6041+ { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
6042+ .
6043+ .
6044+ .
6045+ { common_pid: postgres [ 1892] } hitcount: 2 count: 32
6046+ { common_pid: postgres [ 1891] } hitcount: 2 count: 32
6047+ { common_pid: gmain [ 8704] } hitcount: 2 count: 32
6048+ { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
6049+ { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
6050+ { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
6051+ { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
6052+ { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
6053+ { common_pid: init [ 1] } hitcount: 2 count: 2
6054+
6055+ Totals:
6056+ Hits: 2116
6057+ Entries: 51
6058+ Dropped: 0
6059+
6060+ Similarly, if you key a hist trigger on syscall id, for example to
6061+ gather and display a list of systemwide syscall hits, you can use
6062+ the special .syscall modifier to display the syscall names rather
6063+ than raw ids. The example below keeps a running total of syscall
6064+ counts for the system during the run:
6065+
6066+ # echo 'hist:key=id.syscall:val=hitcount' > \
6067+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6068+
6069+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6070+ # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
6071+
6072+ { id: sys_fsync [ 74] } hitcount: 1
6073+ { id: sys_newuname [ 63] } hitcount: 1
6074+ { id: sys_prctl [157] } hitcount: 1
6075+ { id: sys_statfs [137] } hitcount: 1
6076+ { id: sys_symlink [ 88] } hitcount: 1
6077+ { id: sys_sendmmsg [307] } hitcount: 1
6078+ { id: sys_semctl [ 66] } hitcount: 1
6079+ { id: sys_readlink [ 89] } hitcount: 3
6080+ { id: sys_bind [ 49] } hitcount: 3
6081+ { id: sys_getsockname [ 51] } hitcount: 3
6082+ { id: sys_unlink [ 87] } hitcount: 3
6083+ { id: sys_rename [ 82] } hitcount: 4
6084+ { id: unknown_syscall [ 58] } hitcount: 4
6085+ { id: sys_connect [ 42] } hitcount: 4
6086+ { id: sys_getpid [ 39] } hitcount: 4
6087+ .
6088+ .
6089+ .
6090+ { id: sys_rt_sigprocmask [ 14] } hitcount: 952
6091+ { id: sys_futex [202] } hitcount: 1534
6092+ { id: sys_write [ 1] } hitcount: 2689
6093+ { id: sys_setitimer [ 38] } hitcount: 2797
6094+ { id: sys_read [ 0] } hitcount: 3202
6095+ { id: sys_select [ 23] } hitcount: 3773
6096+ { id: sys_writev [ 20] } hitcount: 4531
6097+ { id: sys_poll [ 7] } hitcount: 8314
6098+ { id: sys_recvmsg [ 47] } hitcount: 13738
6099+ { id: sys_ioctl [ 16] } hitcount: 21843
6100+
6101+ Totals:
6102+ Hits: 67612
6103+ Entries: 72
6104+ Dropped: 0
6105+
6106+ The syscall counts above provide a rough overall picture of system
6107+ call activity on the system; we can see for example that the most
6108+ popular system call on this system was 'sys_ioctl'.
6109+
6110+ We can use 'compound' keys to refine that number and provide some
6111+ further insight as to which processes exactly contribute to the
6112+ overall ioctl count.
6113+
6114+ The command below keeps a hitcount for every unique combination of
6115+ system call id and pid - the end result is essentially a table
6116+ that keeps a per-pid sum of system call hits. The results are
6117+ sorted using the system call id as the primary key, and the
6118+ hitcount sum as the secondary key:
6119+
6120+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
6121+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6122+
6123+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6124+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
6125+
6126+ { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
6127+ { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
6128+ { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
6129+ { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
6130+ { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
6131+ { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
6132+ { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
6133+ { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
6134+ { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
6135+ { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
6136+ .
6137+ .
6138+ .
6139+ { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
6140+ { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
6141+ { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
6142+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
6143+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
6144+ .
6145+ .
6146+ .
6147+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
6148+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
6149+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
6150+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
6151+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
6152+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
6153+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
6154+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
6155+ { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
6156+ { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
6157+ { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
6158+
6159+ Totals:
6160+ Hits: 31536
6161+ Entries: 323
6162+ Dropped: 0
6163+
6164+ The above list does give us a breakdown of the ioctl syscall by
6165+ pid, but it also gives us quite a bit more than that, which we
6166+ don't really care about at the moment. Since we know the syscall
6167+ id for sys_ioctl (16, displayed next to the sys_ioctl name), we
6168+ can use that to filter out all the other syscalls:
6169+
6170+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
6171+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6172+
6173+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6174+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
6175+
6176+ { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
6177+ { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
6178+ { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
6179+ { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
6180+ { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
6181+ { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
6182+ { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
6183+ { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
6184+ { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
6185+ .
6186+ .
6187+ .
6188+ { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
6189+ { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
6190+ { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
6191+ { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
6192+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
6193+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
6194+
6195+ Totals:
6196+ Hits: 101162
6197+ Entries: 103
6198+ Dropped: 0
6199+
6200+ The above output shows that 'compiz' and 'Xorg' are far and away
6201+ the heaviest ioctl callers (which might lead to questions about
6202+ whether they really need to be making all those calls and to
6203+ possible avenues for further investigation).
6204+
6205+ The compound key examples used a key and a sum value (hitcount) to
6206+ sort the output, but we can just as easily use two keys instead.
6207+ Here's an example where we use a compound key composed of the
6208+ common_pid and size event fields. Sorting with pid as the primary
6209+ key and 'size' as the secondary key allows us to display an
6210+ ordered summary of the recvfrom sizes, with counts, received by
6211+ each process:
6212+
6213+ # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
6214+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
6215+
6216+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
6217+ # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
6218+
6219+ { common_pid: smbd [ 784], size: 4 } hitcount: 1
6220+ { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
6221+ { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
6222+ { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
6223+ { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
6224+ { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
6225+ { common_pid: compiz [ 2994], size: 8 } hitcount: 1
6226+ { common_pid: compiz [ 2994], size: 20 } hitcount: 11
6227+ { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
6228+ { common_pid: firefox [ 8817], size: 4 } hitcount: 1
6229+ { common_pid: firefox [ 8817], size: 8 } hitcount: 5
6230+ { common_pid: firefox [ 8817], size: 588 } hitcount: 2
6231+ { common_pid: firefox [ 8817], size: 628 } hitcount: 1
6232+ { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
6233+ { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
6234+ { common_pid: firefox [ 8822], size: 8 } hitcount: 2
6235+ { common_pid: firefox [ 8822], size: 160 } hitcount: 2
6236+ { common_pid: firefox [ 8822], size: 320 } hitcount: 2
6237+ { common_pid: firefox [ 8822], size: 352 } hitcount: 1
6238+ .
6239+ .
6240+ .
6241+ { common_pid: pool [ 8923], size: 1960 } hitcount: 10
6242+ { common_pid: pool [ 8923], size: 2048 } hitcount: 10
6243+ { common_pid: pool [ 8924], size: 1960 } hitcount: 10
6244+ { common_pid: pool [ 8924], size: 2048 } hitcount: 10
6245+ { common_pid: pool [ 8928], size: 1964 } hitcount: 4
6246+ { common_pid: pool [ 8928], size: 1965 } hitcount: 2
6247+ { common_pid: pool [ 8928], size: 2048 } hitcount: 6
6248+ { common_pid: pool [ 8929], size: 1982 } hitcount: 1
6249+ { common_pid: pool [ 8929], size: 2048 } hitcount: 1
6250+
6251+ Totals:
6252+ Hits: 2016
6253+ Entries: 224
6254+ Dropped: 0
6255+
6256+ The above example also illustrates the fact that although a compound
6257+ key is treated as a single entity for hashing purposes, the sub-keys
6258+ it's composed of can be accessed independently.
6259+
6260+ The next example uses a string field as the hash key and
6261+ demonstrates how you can manually pause and continue a hist trigger.
6262+ In this example, we'll aggregate fork counts and don't expect a
6263+ large number of entries in the hash table, so we'll drop it to a
6264+ much smaller number, say 256:
6265+
6266+ # echo 'hist:key=child_comm:val=hitcount:size=256' > \
6267+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6268+
6269+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6270+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6271+
6272+ { child_comm: dconf worker } hitcount: 1
6273+ { child_comm: ibus-daemon } hitcount: 1
6274+ { child_comm: whoopsie } hitcount: 1
6275+ { child_comm: smbd } hitcount: 1
6276+ { child_comm: gdbus } hitcount: 1
6277+ { child_comm: kthreadd } hitcount: 1
6278+ { child_comm: dconf worker } hitcount: 1
6279+ { child_comm: evolution-alarm } hitcount: 2
6280+ { child_comm: Socket Thread } hitcount: 2
6281+ { child_comm: postgres } hitcount: 2
6282+ { child_comm: bash } hitcount: 3
6283+ { child_comm: compiz } hitcount: 3
6284+ { child_comm: evolution-sourc } hitcount: 4
6285+ { child_comm: dhclient } hitcount: 4
6286+ { child_comm: pool } hitcount: 5
6287+ { child_comm: nm-dispatcher.a } hitcount: 8
6288+ { child_comm: firefox } hitcount: 8
6289+ { child_comm: dbus-daemon } hitcount: 8
6290+ { child_comm: glib-pacrunner } hitcount: 10
6291+ { child_comm: evolution } hitcount: 23
6292+
6293+ Totals:
6294+ Hits: 89
6295+ Entries: 20
6296+ Dropped: 0
6297+
6298+ If we want to pause the hist trigger, we can simply append :pause to
6299+ the command that started the trigger. Notice that the trigger info
6300+ displays as [paused]:
6301+
6302+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
6303+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6304+
6305+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6306+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
6307+
6308+ { child_comm: dconf worker } hitcount: 1
6309+ { child_comm: kthreadd } hitcount: 1
6310+ { child_comm: dconf worker } hitcount: 1
6311+ { child_comm: gdbus } hitcount: 1
6312+ { child_comm: ibus-daemon } hitcount: 1
6313+ { child_comm: Socket Thread } hitcount: 2
6314+ { child_comm: evolution-alarm } hitcount: 2
6315+ { child_comm: smbd } hitcount: 2
6316+ { child_comm: bash } hitcount: 3
6317+ { child_comm: whoopsie } hitcount: 3
6318+ { child_comm: compiz } hitcount: 3
6319+ { child_comm: evolution-sourc } hitcount: 4
6320+ { child_comm: pool } hitcount: 5
6321+ { child_comm: postgres } hitcount: 6
6322+ { child_comm: firefox } hitcount: 8
6323+ { child_comm: dhclient } hitcount: 10
6324+ { child_comm: emacs } hitcount: 12
6325+ { child_comm: dbus-daemon } hitcount: 20
6326+ { child_comm: nm-dispatcher.a } hitcount: 20
6327+ { child_comm: evolution } hitcount: 35
6328+ { child_comm: glib-pacrunner } hitcount: 59
6329+
6330+ Totals:
6331+ Hits: 199
6332+ Entries: 21
6333+ Dropped: 0
6334+
6335+ To manually continue having the trigger aggregate events, append
6336+ :cont instead. Notice that the trigger info displays as [active]
6337+ again, and the data has changed:
6338+
6339+ # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
6340+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6341+
6342+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6343+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6344+
6345+ { child_comm: dconf worker } hitcount: 1
6346+ { child_comm: dconf worker } hitcount: 1
6347+ { child_comm: kthreadd } hitcount: 1
6348+ { child_comm: gdbus } hitcount: 1
6349+ { child_comm: ibus-daemon } hitcount: 1
6350+ { child_comm: Socket Thread } hitcount: 2
6351+ { child_comm: evolution-alarm } hitcount: 2
6352+ { child_comm: smbd } hitcount: 2
6353+ { child_comm: whoopsie } hitcount: 3
6354+ { child_comm: compiz } hitcount: 3
6355+ { child_comm: evolution-sourc } hitcount: 4
6356+ { child_comm: bash } hitcount: 5
6357+ { child_comm: pool } hitcount: 5
6358+ { child_comm: postgres } hitcount: 6
6359+ { child_comm: firefox } hitcount: 8
6360+ { child_comm: dhclient } hitcount: 11
6361+ { child_comm: emacs } hitcount: 12
6362+ { child_comm: dbus-daemon } hitcount: 22
6363+ { child_comm: nm-dispatcher.a } hitcount: 22
6364+ { child_comm: evolution } hitcount: 35
6365+ { child_comm: glib-pacrunner } hitcount: 59
6366+
6367+ Totals:
6368+ Hits: 206
6369+ Entries: 21
6370+ Dropped: 0
6371+
6372+ The previous example showed how to start and stop a hist trigger by
6373+ appending ':pause' and ':cont' to the hist trigger command. A
6374+ hist trigger can also be started in a paused state by initially
6375+ starting the trigger with ':pause' appended. This allows you to
6376+ start the trigger only when you're ready to start collecting data
6377+ and not before. For example, you could start the trigger in a
6378+ paused state, then unpause it and do something you want to measure,
6379+ then pause the trigger again when done.
6380+
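+ A minimal sketch of that manual workflow (illustrative commands that
+ reuse the sched_process_fork trigger from the earlier example, output
+ omitted, and assuming no other hist trigger is currently attached to
+ the event) could look like this:
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+ ... get the workload you want to measure ready ...
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+ ... run the workload ...
+
+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+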
6381+ Of course, doing this manually can be difficult and error-prone, but
6382+ it is possible to automatically start and stop a hist trigger based
6383+ on some condition, via the enable_hist and disable_hist triggers.
6384+
6385+ For example, suppose we wanted to take a look at the relative
6386+ weights in terms of skb length for each callpath that leads to a
6387+ netif_receive_skb event when downloading a decent-sized file using
6388+ wget.
6389+
6390+ First we set up an initially paused stacktrace trigger on the
6391+ netif_receive_skb event:
6392+
6393+ # echo 'hist:key=stacktrace:vals=len:pause' > \
6394+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6395+
6396+ Next, we set up an 'enable_hist' trigger on the sched_process_exec
6397+ event, with an 'if filename==/usr/bin/wget' filter. The effect of
6398+ this new trigger is that it will 'unpause' the hist trigger we just
6399+ set up on netif_receive_skb if and only if it sees a
6400+ sched_process_exec event with a filename of '/usr/bin/wget'. When
6401+ that happens, all netif_receive_skb events are aggregated into a
6402+ hash table keyed on stacktrace:
6403+
6404+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
6405+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6406+
6407+ The aggregation continues until the netif_receive_skb trigger is paused
6408+ again, which is what the following disable_hist event does by
6409+ creating a similar setup on the sched_process_exit event, using the
6410+ filter 'comm==wget':
6411+
6412+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
6413+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6414+
6415+ Whenever a process exits and the comm field of the disable_hist
6416+ trigger filter matches 'comm==wget', the netif_receive_skb hist
6417+ trigger is disabled.
6418+
6419+ The overall effect is that netif_receive_skb events are aggregated
6420+ into the hash table for only the duration of the wget. Executing a
6421+ wget command and then listing the 'hist' file will display the
6422+ output generated by the wget command:
6423+
6424+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6425+
6426+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6427+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6428+
6429+ { stacktrace:
6430+ __netif_receive_skb_core+0x46d/0x990
6431+ __netif_receive_skb+0x18/0x60
6432+ netif_receive_skb_internal+0x23/0x90
6433+ napi_gro_receive+0xc8/0x100
6434+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6435+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6436+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6437+ ieee80211_rx+0x31d/0x900 [mac80211]
6438+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6439+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6440+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6441+ irq_thread_fn+0x20/0x50
6442+ irq_thread+0x11f/0x150
6443+ kthread+0xd2/0xf0
6444+ ret_from_fork+0x42/0x70
6445+ } hitcount: 85 len: 28884
6446+ { stacktrace:
6447+ __netif_receive_skb_core+0x46d/0x990
6448+ __netif_receive_skb+0x18/0x60
6449+ netif_receive_skb_internal+0x23/0x90
6450+ napi_gro_complete+0xa4/0xe0
6451+ dev_gro_receive+0x23a/0x360
6452+ napi_gro_receive+0x30/0x100
6453+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6454+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6455+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6456+ ieee80211_rx+0x31d/0x900 [mac80211]
6457+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6458+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6459+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6460+ irq_thread_fn+0x20/0x50
6461+ irq_thread+0x11f/0x150
6462+ kthread+0xd2/0xf0
6463+ } hitcount: 98 len: 664329
6464+ { stacktrace:
6465+ __netif_receive_skb_core+0x46d/0x990
6466+ __netif_receive_skb+0x18/0x60
6467+ process_backlog+0xa8/0x150
6468+ net_rx_action+0x15d/0x340
6469+ __do_softirq+0x114/0x2c0
6470+ do_softirq_own_stack+0x1c/0x30
6471+ do_softirq+0x65/0x70
6472+ __local_bh_enable_ip+0xb5/0xc0
6473+ ip_finish_output+0x1f4/0x840
6474+ ip_output+0x6b/0xc0
6475+ ip_local_out_sk+0x31/0x40
6476+ ip_send_skb+0x1a/0x50
6477+ udp_send_skb+0x173/0x2a0
6478+ udp_sendmsg+0x2bf/0x9f0
6479+ inet_sendmsg+0x64/0xa0
6480+ sock_sendmsg+0x3d/0x50
6481+ } hitcount: 115 len: 13030
6482+ { stacktrace:
6483+ __netif_receive_skb_core+0x46d/0x990
6484+ __netif_receive_skb+0x18/0x60
6485+ netif_receive_skb_internal+0x23/0x90
6486+ napi_gro_complete+0xa4/0xe0
6487+ napi_gro_flush+0x6d/0x90
6488+ iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
6489+ irq_thread_fn+0x20/0x50
6490+ irq_thread+0x11f/0x150
6491+ kthread+0xd2/0xf0
6492+ ret_from_fork+0x42/0x70
6493+ } hitcount: 934 len: 5512212
6494+
6495+ Totals:
6496+ Hits: 1232
6497+ Entries: 4
6498+ Dropped: 0
6499+
6500+ The above shows all the netif_receive_skb callpaths and their total
6501+ lengths for the duration of the wget command.
6502+
6503+ The 'clear' hist trigger param can be used to clear the hash table.
6504+ Suppose we wanted to try another run of the previous example but
6505+ this time also wanted to see the complete list of events that went
6506+ into the histogram. In order to avoid having to set everything up
6507+ again, we can just clear the histogram first:
6508+
6509+ # echo 'hist:key=stacktrace:vals=len:clear' >> \
6510+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6511+
6512+ Just to verify that it is in fact cleared, here's what we now see in
6513+ the hist file:
6514+
6515+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6516+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6517+
6518+ Totals:
6519+ Hits: 0
6520+ Entries: 0
6521+ Dropped: 0
6522+
6523+ Since we want to see the detailed list of every netif_receive_skb
6524+ event occurring during the new run (these are in fact the same
6525+ events being aggregated into the hash table), we add some additional
6526+ 'enable_event' triggers to the triggering sched_process_exec and
6527+ sched_process_exit events, as follows:
6528+
6529+ # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
6530+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6531+
6532+ # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
6533+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6534+
6535+ If you read the trigger files for the sched_process_exec and
6536+ sched_process_exit triggers, you should see two triggers for each:
6537+ one enabling/disabling the hist aggregation and the other
6538+ enabling/disabling the logging of events:
6539+
6540+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6541+ enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6542+ enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6543+
6544+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6545+ enable_event:net:netif_receive_skb:unlimited if comm==wget
6546+ disable_hist:net:netif_receive_skb:unlimited if comm==wget
6547+
6548+ In other words, whenever either of the sched_process_exec or
6549+ sched_process_exit events is hit and matches 'wget', it enables or
6550+ disables both the histogram and the event log, and what you end up
6551+ with is a hash table and set of events just covering the specified
6552+ duration. Run the wget command again:
6553+
6554+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6555+
6556+ Displaying the 'hist' file should show something similar to what you
6557+ saw in the last run, but this time you should also see the
6558+ individual events in the trace file:
6559+
6560+ # cat /sys/kernel/debug/tracing/trace
6561+
6562+ # tracer: nop
6563+ #
6564+ # entries-in-buffer/entries-written: 183/1426 #P:4
6565+ #
6566+ # _-----=> irqs-off
6567+ # / _----=> need-resched
6568+ # | / _---=> hardirq/softirq
6569+ # || / _--=> preempt-depth
6570+ # ||| / delay
6571+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
6572+ # | | | |||| | |
6573+ wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
6574+ wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
6575+ dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
6576+ dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
6577+ ##### CPU 2 buffer started ####
6578+ irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
6579+ irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
6580+ irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
6581+ irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
6582+ irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
6583+ .
6584+ .
6585+ .
6586+
6587+ The following example demonstrates how multiple hist triggers can be
6588+ attached to a given event. This capability can be useful for
6589+ creating a set of different summaries derived from the same set of
6590+ events, or for comparing the effects of different filters, among
6591+ other things.
6592+
6593+ # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
6594+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6595+ # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
6596+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6597+ # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
6598+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6599+ # echo 'hist:keys=skbaddr.hex:vals=len' >> \
6600+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6601+ # echo 'hist:keys=len:vals=common_preempt_count' >> \
6602+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6603+
6604+ The above set of commands creates four triggers differing only in
6605+ their filters, along with a completely different though fairly
6606+ nonsensical trigger. Note that in order to append multiple hist
6607+ triggers to the same file, you should use the '>>' operator to
6608+ append them ('>' will also add the new hist trigger, but will remove
6609+ any existing hist triggers beforehand).
6610+
6611+ Displaying the contents of the 'hist' file for the event shows the
6612+ contents of all five histograms:
6613+
6614+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6615+
6616+ # event histogram
6617+ #
6618+ # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
6619+ #
6620+
6621+ { len: 176 } hitcount: 1 common_preempt_count: 0
6622+ { len: 223 } hitcount: 1 common_preempt_count: 0
6623+ { len: 4854 } hitcount: 1 common_preempt_count: 0
6624+ { len: 395 } hitcount: 1 common_preempt_count: 0
6625+ { len: 177 } hitcount: 1 common_preempt_count: 0
6626+ { len: 446 } hitcount: 1 common_preempt_count: 0
6627+ { len: 1601 } hitcount: 1 common_preempt_count: 0
6628+ .
6629+ .
6630+ .
6631+ { len: 1280 } hitcount: 66 common_preempt_count: 0
6632+ { len: 116 } hitcount: 81 common_preempt_count: 40
6633+ { len: 708 } hitcount: 112 common_preempt_count: 0
6634+ { len: 46 } hitcount: 221 common_preempt_count: 0
6635+ { len: 1264 } hitcount: 458 common_preempt_count: 0
6636+
6637+ Totals:
6638+ Hits: 1428
6639+ Entries: 147
6640+ Dropped: 0
6641+
6642+
6643+ # event histogram
6644+ #
6645+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6646+ #
6647+
6648+ { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
6649+ { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
6650+ { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
6651+ { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
6652+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
6653+ { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
6654+ { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
6655+ { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
6656+ { skbaddr: ffff880100065900 } hitcount: 1 len: 46
6657+ { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
6658+ { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
6659+ { skbaddr: ffff880100064700 } hitcount: 1 len: 365
6660+ { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
6661+ .
6662+ .
6663+ .
6664+ { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
6665+ { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
6666+ { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
6667+ { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
6668+ { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
6669+ { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
6670+ { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
6671+ { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
6672+ { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
6673+
6674+ Totals:
6675+ Hits: 1451
6676+ Entries: 318
6677+ Dropped: 0
6678+
6679+
6680+ # event histogram
6681+ #
6682+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
6683+ #
6684+
6685+
6686+ Totals:
6687+ Hits: 0
6688+ Entries: 0
6689+ Dropped: 0
6690+
6691+
6692+ # event histogram
6693+ #
6694+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
6695+ #
6696+
6697+ { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
6698+ { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
6699+ { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
6700+ { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
6701+ { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
6702+ { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
6703+ { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
6704+ { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
6705+ { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
6706+ { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
6707+ { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
6708+ { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
6709+
6710+ Totals:
6711+ Hits: 14
6712+ Entries: 12
6713+ Dropped: 0
6714+
6715+
6716+ # event histogram
6717+ #
6718+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
6719+ #
6720+
6721+
6722+ Totals:
6723+ Hits: 0
6724+ Entries: 0
6725+ Dropped: 0
6726+
6727+ Named triggers can be used to have triggers share a common set of
6728+ histogram data. This capability is mostly useful for combining the
6729+ output of events generated by tracepoints contained inside inline
6730+ functions, but names can be used in a hist trigger on any event.
6731+ For example, these two triggers when hit will update the same 'len'
6732+ field in the shared 'foo' histogram data:
6733+
6734+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6735+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6736+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6737+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6738+
6739+ You can see that they're updating common histogram data by reading
6740+ each event's hist files at the same time:
6741+
6742+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
6743+ cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6744+
6745+ # event histogram
6746+ #
6747+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6748+ #
6749+
6750+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
6751+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
6752+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
6753+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
6754+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
6755+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
6756+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
6757+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
6758+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
6759+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
6760+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
6761+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
6762+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
6763+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
6764+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
6765+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
6766+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
6767+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
6768+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
6769+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
6770+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
6771+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
6772+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
6773+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
6774+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
6775+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
6776+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
6777+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
6778+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
6779+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
6780+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
6781+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
6782+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
6783+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
6784+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
6785+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
6786+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
6787+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
6788+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
6789+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
6790+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
6791+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
6792+
6793+ Totals:
6794+ Hits: 81
6795+ Entries: 42
6796+ Dropped: 0
6797+ # event histogram
6798+ #
6799+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6800+ #
6801+
6802+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
6803+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
6804+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
6805+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
6806+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
6807+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
6808+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
6809+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
6810+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
6811+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
6812+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
6813+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
6814+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
6815+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
6816+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
6817+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
6818+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
6819+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
6820+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
6821+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
6822+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
6823+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
6824+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
6825+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
6826+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
6827+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
6828+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
6829+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
6830+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
6831+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
6832+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
6833+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
6834+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
6835+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
6836+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
6837+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
6838+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
6839+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
6840+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
6841+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
6842+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
6843+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
6844+
6845+ Totals:
6846+ Hits: 81
6847+ Entries: 42
6848+ Dropped: 0
6849+
6850+ And here's an example that shows how to combine histogram data from
6851+ any two events even if they don't share any 'compatible' fields
6852+ other than 'hitcount' and 'stacktrace'. These commands create a
6853+ couple of triggers named 'bar' using those fields:
6854+
6855+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6856+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6857+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6858+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6859+
6860+ And displaying the output of either shows some interesting if
6861+ somewhat confusing output:
6862+
6863+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6864+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6865+
6866+ # event histogram
6867+ #
6868+ # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
6869+ #
6870+
6871+ { stacktrace:
6872+ _do_fork+0x18e/0x330
6873+ kernel_thread+0x29/0x30
6874+ kthreadd+0x154/0x1b0
6875+ ret_from_fork+0x3f/0x70
6876+ } hitcount: 1
6877+ { stacktrace:
6878+ netif_rx_internal+0xb2/0xd0
6879+ netif_rx_ni+0x20/0x70
6880+ dev_loopback_xmit+0xaa/0xd0
6881+ ip_mc_output+0x126/0x240
6882+ ip_local_out_sk+0x31/0x40
6883+ igmp_send_report+0x1e9/0x230
6884+ igmp_timer_expire+0xe9/0x120
6885+ call_timer_fn+0x39/0xf0
6886+ run_timer_softirq+0x1e1/0x290
6887+ __do_softirq+0xfd/0x290
6888+ irq_exit+0x98/0xb0
6889+ smp_apic_timer_interrupt+0x4a/0x60
6890+ apic_timer_interrupt+0x6d/0x80
6891+ cpuidle_enter+0x17/0x20
6892+ call_cpuidle+0x3b/0x60
6893+ cpu_startup_entry+0x22d/0x310
6894+ } hitcount: 1
6895+ { stacktrace:
6896+ netif_rx_internal+0xb2/0xd0
6897+ netif_rx_ni+0x20/0x70
6898+ dev_loopback_xmit+0xaa/0xd0
6899+ ip_mc_output+0x17f/0x240
6900+ ip_local_out_sk+0x31/0x40
6901+ ip_send_skb+0x1a/0x50
6902+ udp_send_skb+0x13e/0x270
6903+ udp_sendmsg+0x2bf/0x980
6904+ inet_sendmsg+0x67/0xa0
6905+ sock_sendmsg+0x38/0x50
6906+ SYSC_sendto+0xef/0x170
6907+ SyS_sendto+0xe/0x10
6908+ entry_SYSCALL_64_fastpath+0x12/0x6a
6909+ } hitcount: 2
6910+ { stacktrace:
6911+ netif_rx_internal+0xb2/0xd0
6912+ netif_rx+0x1c/0x60
6913+ loopback_xmit+0x6c/0xb0
6914+ dev_hard_start_xmit+0x219/0x3a0
6915+ __dev_queue_xmit+0x415/0x4f0
6916+ dev_queue_xmit_sk+0x13/0x20
6917+ ip_finish_output2+0x237/0x340
6918+ ip_finish_output+0x113/0x1d0
6919+ ip_output+0x66/0xc0
6920+ ip_local_out_sk+0x31/0x40
6921+ ip_send_skb+0x1a/0x50
6922+ udp_send_skb+0x16d/0x270
6923+ udp_sendmsg+0x2bf/0x980
6924+ inet_sendmsg+0x67/0xa0
6925+ sock_sendmsg+0x38/0x50
6926+ ___sys_sendmsg+0x14e/0x270
6927+ } hitcount: 76
6928+ { stacktrace:
6929+ netif_rx_internal+0xb2/0xd0
6930+ netif_rx+0x1c/0x60
6931+ loopback_xmit+0x6c/0xb0
6932+ dev_hard_start_xmit+0x219/0x3a0
6933+ __dev_queue_xmit+0x415/0x4f0
6934+ dev_queue_xmit_sk+0x13/0x20
6935+ ip_finish_output2+0x237/0x340
6936+ ip_finish_output+0x113/0x1d0
6937+ ip_output+0x66/0xc0
6938+ ip_local_out_sk+0x31/0x40
6939+ ip_send_skb+0x1a/0x50
6940+ udp_send_skb+0x16d/0x270
6941+ udp_sendmsg+0x2bf/0x980
6942+ inet_sendmsg+0x67/0xa0
6943+ sock_sendmsg+0x38/0x50
6944+ ___sys_sendmsg+0x269/0x270
6945+ } hitcount: 77
6946+ { stacktrace:
6947+ netif_rx_internal+0xb2/0xd0
6948+ netif_rx+0x1c/0x60
6949+ loopback_xmit+0x6c/0xb0
6950+ dev_hard_start_xmit+0x219/0x3a0
6951+ __dev_queue_xmit+0x415/0x4f0
6952+ dev_queue_xmit_sk+0x13/0x20
6953+ ip_finish_output2+0x237/0x340
6954+ ip_finish_output+0x113/0x1d0
6955+ ip_output+0x66/0xc0
6956+ ip_local_out_sk+0x31/0x40
6957+ ip_send_skb+0x1a/0x50
6958+ udp_send_skb+0x16d/0x270
6959+ udp_sendmsg+0x2bf/0x980
6960+ inet_sendmsg+0x67/0xa0
6961+ sock_sendmsg+0x38/0x50
6962+ SYSC_sendto+0xef/0x170
6963+ } hitcount: 88
6964+ { stacktrace:
6965+ _do_fork+0x18e/0x330
6966+ SyS_clone+0x19/0x20
6967+ entry_SYSCALL_64_fastpath+0x12/0x6a
6968+ } hitcount: 244
6969+
6970+ Totals:
6971+ Hits: 489
6972+ Entries: 7
6973+ Dropped: 0
6974+
6975+
6976+2.2 Inter-event hist triggers
6977+-----------------------------
6978+
6979+Inter-event hist triggers are hist triggers that combine values from
6980+one or more other events and create a histogram using that data. Data
6981+from an inter-event histogram can in turn become the source for
6982+further combined histograms, thus providing a chain of related
6983+histograms, which is important for some applications.
6984+
6985+The most important example of an inter-event quantity that can be used
6986+in this manner is latency, which is simply a difference in timestamps
6987+between two events. Although latency is the most important
6988+inter-event quantity, note that because the support is completely
6989+general across the trace event subsystem, any event field can be used
6990+in an inter-event quantity.
6991+
6992+An example of a histogram that combines data from other histograms
6993+into a useful chain would be a 'wakeupswitch latency' histogram that
6994+combines a 'wakeup latency' histogram and a 'switch latency'
6995+histogram.
6996+
6997+Normally, a hist trigger specification consists of a (possibly
6998+compound) key along with one or more numeric values, which are
6999+continually updated sums associated with that key. A histogram
7000+specification in this case consists of individual key and value
7001+specifications that refer to trace event fields associated with a
7002+single event type.
7003+
7004+The inter-event hist trigger extension allows fields from multiple
7005+events to be referenced and combined into a multi-event histogram
7006+specification. In support of this overall goal, a few enabling
7007+features have been added to the hist trigger support:
7008+
7009+ - In order to compute an inter-event quantity, a value from one
7010+ event needs to be saved and then referenced from another event. This
7011+ requires the introduction of support for histogram 'variables'.
7012+
7013+ - The computation of inter-event quantities and their combination
7014+ require some minimal amount of support for applying simple
7015+ expressions to variables (+ and -).
7016+
7017+ - A histogram consisting of inter-event quantities isn't logically a
7018+ histogram on either event (so having the 'hist' file for either
7019+ event host the histogram output doesn't really make sense). To
7020+ address the idea that the histogram is associated with a
7021+ combination of events, support is added allowing the creation of
7022+ 'synthetic' events that are events derived from other events.
7023+ These synthetic events are full-fledged events just like any other
7024+ and can be used as such, as for instance to create the
7025+ 'combination' histograms mentioned previously.
7026+
7027+ - A set of 'actions' can be associated with histogram entries -
7028+ these can be used to generate the previously mentioned synthetic
7029+ events, but can also be used for other purposes, such as for
7030+ example saving context when a 'max' latency has been hit.
7031+
7032+ - Trace events don't have a 'timestamp' associated with them, but
7033+ there is an implicit timestamp saved along with an event in the
7034+ underlying ftrace ring buffer. This timestamp is now exposed as a
7035+ synthetic field named 'common_timestamp' which can be used in
7036+ histograms as if it were any other event field; it isn't an actual
7037+ field in the trace format but rather is a synthesized value that
7038+ nonetheless can be used as if it were an actual field. By default
7039+ it is in units of nanoseconds; appending '.usecs' to a
7040+ common_timestamp field changes the units to microseconds.
7041+
7042+A note on inter-event timestamps: If common_timestamp is used in a
7043+histogram, the trace buffer is automatically switched over to using
7044+absolute timestamps and the "global" trace clock, in order to avoid
7045+bogus timestamp differences with other clocks that aren't coherent
7046+across CPUs. This can be overridden by specifying one of the other
7047+trace clocks instead, using the "clock=XXX" hist trigger attribute,
7048+where XXX is any of the clocks listed in the tracing/trace_clock
7049+pseudo-file.
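As a minimal sketch (the clock name below is only an example; any
clock listed in tracing/trace_clock can be substituted), the attribute
is given between colons like any other hist trigger attribute:

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=mono ...' >> \
          event/trigger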
7050+
7051+These features are described in more detail in the following sections.
7052+
7053+2.2.1 Histogram Variables
7054+-------------------------
7055+
7056+Variables are simply named locations used for saving and retrieving
7057+values between matching events. A 'matching' event is defined as an
7058+event that has a matching key - if a variable is saved for a histogram
7059+entry corresponding to that key, any subsequent event with a matching
7060+key can access that variable.
7061+
7062+A variable's value is normally available to any subsequent event until
7063+it is set to something else by a subsequent event. The one exception
7064+to that rule is that any variable used in an expression is essentially
7065+'read-once' - once it's used by an expression in a subsequent event,
7066+it's reset to its 'unset' state, which means it can't be used again
7067+unless it's set again. This ensures not only that an event doesn't
7068+use an uninitialized variable in a calculation, but that that variable
7069+is used only once and not for any unrelated subsequent match.
7070+
7071+The basic syntax for saving a variable is to simply prefix a unique
7072+variable name not corresponding to any keyword along with an '=' sign
7073+to any event field.
7074+
7075+Either keys or values can be saved and retrieved in this way. This
7076+creates a variable named 'ts0' for a histogram entry with the key
7077+'next_pid':
7078+
7079+ # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \
7080+ event/trigger
7081+
7082+The ts0 variable can be accessed by any subsequent event having the
7083+same pid as 'next_pid'.
7084+
7085+Variable references are formed by prepending the variable name with
7086+the '$' sign. Thus for example, the ts0 variable above would be
7087+referenced as '$ts0' in expressions.
7088+
7089+Because 'vals=' is used, the common_timestamp variable value above
7090+will also be summed as a normal histogram value would (though for a
7091+timestamp it makes little sense).
7092+
7093+The below shows that a key value can also be saved in the same way:
7094+
7095+ # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
7096+
7097+If a variable isn't a key variable or prefixed with 'vals=', the
7098+associated event field will be saved in a variable but won't be summed
7099+as a value:
7100+
7101+ # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger
7102+
7103+Multiple variables can be assigned at the same time. The below would
7104+result in both ts0 and b being created as variables, with both
7105+common_timestamp and field1 additionally being summed as values:
7106+
7107+ # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \
7108+ event/trigger
7109+
7110+Note that variable assignments can appear either preceding or
7111+following their use. The command below behaves identically to the
7112+command above:
7113+
7114+ # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \
7115+ event/trigger
7116+
7117+Any number of variables not bound to a 'vals=' prefix can also be
7118+assigned by simply separating them with colons. Below is the same
7119+thing but without the values being summed in the histogram:
7120+
7121+ # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger
7122+
7123+Variables set as above can be referenced and used in expressions on
7124+another event.
7125+
7126+For example, here's how a latency can be calculated:
7127+
7128+ # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger
7129+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger
7130+
7131+In the first line above, the event's timestamp is saved into the
7132+variable ts0. In the next line, ts0 is subtracted from the second
7133+event's timestamp to produce the latency, which is then assigned into
7134+yet another variable, 'wakeup_lat'. The hist trigger below in turn
7135+makes use of the wakeup_lat variable to compute a combined latency
7136+using the same key and variable from yet another event:
7137+
7138+ # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger
7139+
7140+2.2.2 Synthetic Events
7141+----------------------
7142+
7143+Synthetic events are user-defined events generated from hist trigger
7144+variables or fields associated with one or more other events. Their
7145+purpose is to provide a mechanism for displaying data spanning
7146+multiple events consistent with the existing and already familiar
7147+usage for normal events.
7148+
7149+To define a synthetic event, the user writes a simple specification
7150+consisting of the name of the new event along with one or more
7151+variables and their types, which can be any valid field type,
7152+separated by semicolons, to the tracing/synthetic_events file.
7153+
7154+For instance, the following creates a new event named 'wakeup_latency'
7155+with 3 fields: lat, pid, and prio. Each of those fields is simply a
7156+variable reference to a variable on another event:
7157+
7158+ # echo 'wakeup_latency \
7159+ u64 lat; \
7160+ pid_t pid; \
7161+ int prio' >> \
7162+ /sys/kernel/debug/tracing/synthetic_events
7163+
7164+Reading the tracing/synthetic_events file lists all the currently
7165+defined synthetic events, in this case the event defined above:
7166+
7167+ # cat /sys/kernel/debug/tracing/synthetic_events
7168+ wakeup_latency u64 lat; pid_t pid; int prio
7169+
7170+An existing synthetic event definition can be removed by prepending
7171+the command that defined it with a '!':
7172+
7173+ # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
7174+ /sys/kernel/debug/tracing/synthetic_events
7175+
7176+At this point, there isn't yet an actual 'wakeup_latency' event
7177+instantiated in the event subsystem - for this to happen, a 'hist
7178+trigger action' needs to be instantiated and bound to actual fields
7179+and variables defined on other events (see Section 2.2.3 below).
7180+
7181+Once that is done, an event instance is created, and a histogram can
7182+be defined using it:
7183+
7184+ # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
7185+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7186+
7187+The new event is created under the tracing/events/synthetic/ directory
7188+and looks and behaves just like any other event:
7189+
7190+ # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
7191+ enable filter format hist id trigger
7192+
7193+Like any other event, once a histogram is enabled for the event, the
7194+output can be displayed by reading the event's 'hist' file.
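For the wakeup_latency event defined above, for example, that would
be:

  # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist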
7195+
7196+2.2.3 Hist trigger 'actions'
7197+----------------------------
7198+
7199+A hist trigger 'action' is a function that's executed whenever a
7200+histogram entry is added or updated.
7201+
7202+The default 'action' if no special function is explicitly specified is
7203+as it always has been, to simply update the set of values associated
7204+with an entry. Some applications, however, may want to perform
7205+additional actions at that point, such as generate another event, or
7206+compare and save a maximum.
7207+
7208+The following additional actions are available. To specify an action
7209+for a given event, simply specify the action between colons in the
7210+hist trigger specification.
7211+
7212+ - onmatch(matching.event).<synthetic_event_name>(param list)
7213+
7214+ The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
7215+ trigger action is invoked whenever an event matches and the
7216+ histogram entry would be added or updated. It causes the named
7217+ synthetic event to be generated with the values given in the
7218+ 'param list'. The result is the generation of a synthetic event
7219+ that consists of the values contained in those variables at the
7220+ time the invoking event was hit.
7221+
7222+ The 'param list' consists of one or more parameters which may be
7223+ either variables or fields defined on either the 'matching.event'
7224+ or the target event. The variables or fields specified in the
7225+ param list may be either fully-qualified or unqualified. If a
7226+ variable is specified as unqualified, it must be unique between
7227+ the two events. A field name used as a param can be unqualified
7228+ if it refers to the target event, but must be fully qualified if
7229+ it refers to the matching event. A fully-qualified name is of the
7230+ form 'system.event_name.$var_name' or 'system.event_name.field'.
7231+
7232+ The 'matching.event' specification is simply the fully qualified
7233+ event name of the event that matches the target event for the
7234+ onmatch() functionality, in the form 'system.event_name'.
7235+
7236+ Finally, the number and type of variables/fields in the 'param
7237+ list' must match the number and types of the fields in the
7238+ synthetic event being generated.
7239+
7240+ As an example the below defines a simple synthetic event and uses
7241+ a variable defined on the sched_wakeup_new event as a parameter
7242+ when invoking the synthetic event. Here we define the synthetic
7243+ event:
7244+
7245+ # echo 'wakeup_new_test pid_t pid' >> \
7246+ /sys/kernel/debug/tracing/synthetic_events
7247+
7248+ # cat /sys/kernel/debug/tracing/synthetic_events
7249+ wakeup_new_test pid_t pid
7250+
7251+ The following hist trigger both defines the missing testpid
7252+ variable and specifies an onmatch() action that generates a
7253+ wakeup_new_test synthetic event whenever a sched_wakeup_new event
7254+ occurs, which because of the 'if comm == "cyclictest"' filter only
7255+ happens when the executable is cyclictest:
7256+
7257+ # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
7258+ wakeup_new_test($testpid) if comm=="cyclictest"' >> \
7259+ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
7260+
7261+ Creating and displaying a histogram based on those events is now
7262+ just a matter of using the fields and new synthetic event in the
7263+ tracing/events/synthetic directory, as usual:
7264+
7265+ # echo 'hist:keys=pid:sort=pid' >> \
7266+ /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
7267+
7268+ Running 'cyclictest' should cause wakeup_new events to generate
7269+ wakeup_new_test synthetic events which should result in histogram
7270+ output in the wakeup_new_test event's hist file:
7271+
7272+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
7273+
7274+ A more typical usage would be to use two events to calculate a
7275+ latency. The following example uses a set of hist triggers to
7276+ produce a 'wakeup_latency' histogram:
7277+
7278+ First, we define a 'wakeup_latency' synthetic event:
7279+
7280+ # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
7281+ /sys/kernel/debug/tracing/synthetic_events
7282+
7283+ Next, we specify that whenever we see a sched_waking event for a
7284+ cyclictest thread, save the timestamp in a 'ts0' variable:
7285+
7286+ # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
7287+ if comm=="cyclictest"' >> \
7288+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7289+
7290+ Then, when the corresponding thread is actually scheduled onto the
7291+ CPU by a sched_switch event, calculate the latency and use that
7292+ along with another variable and an event field to generate a
7293+ wakeup_latency synthetic event:
7294+
7295+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
7296+ onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
7297+ $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
7298+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7299+
7300+ We also need to create a histogram on the wakeup_latency synthetic
7301+ event in order to aggregate the generated synthetic event data:
7302+
7303+ # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
7304+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7305+
7306+ Finally, once we've run cyclictest to actually generate some
7307+ events, we can see the output by looking at the wakeup_latency
7308+ synthetic event's hist file:
7309+
7310+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
7311+
7312+ - onmax(var).save(field,...)
7313+
7314+ The 'onmax(var).save(field,...)' hist trigger action is invoked
7315+ whenever the value of 'var' associated with a histogram entry
7316+ exceeds the current maximum contained in that variable.
7317+
7318+ The end result is that the trace event fields specified as the
7319+ onmax.save() params will be saved if 'var' exceeds the current
7320+ maximum for that hist trigger entry. This allows context from the
7321+ event that exhibited the new maximum to be saved for later
7322+ reference. When the histogram is displayed, additional fields
7323+ displaying the saved values will be printed.
7324+
7325+ As an example the below defines a couple of hist triggers, one for
7326+ sched_waking and another for sched_switch, keyed on pid. Whenever
7327+ a sched_waking occurs, the timestamp is saved in the entry
7328+ corresponding to the current pid, and when the scheduler switches
7329+ back to that pid, the timestamp difference is calculated. If the
7330+ resulting latency, stored in wakeup_lat, exceeds the current
7331+ maximum latency, the values specified in the save() fields are
7332+ recorded:
7333+
7334+ # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
7335+ if comm=="cyclictest"' >> \
7336+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7337+
7338+ # echo 'hist:keys=next_pid:\
7339+ wakeup_lat=common_timestamp.usecs-$ts0:\
7340+ onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
7341+ if next_comm=="cyclictest"' >> \
7342+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7343+
7344+ When the histogram is displayed, the max value and the saved
7345+ values corresponding to the max are displayed following the rest
7346+ of the fields:
7347+
7348+ # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
7349+ { next_pid: 2255 } hitcount: 239
7350+ common_timestamp-ts0: 0
7351+ max: 27
7352+ next_comm: cyclictest
7353+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
7354+
7355+ { next_pid: 2256 } hitcount: 2355
7356+ common_timestamp-ts0: 0
7357+ max: 49 next_comm: cyclictest
7358+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
7359+
7360+ Totals:
7361+ Hits: 12970
7362+ Entries: 2
7363+ Dropped: 0
7364diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/acglobal.h linux-4.14/drivers/acpi/acpica/acglobal.h
7365--- linux-4.14.orig/drivers/acpi/acpica/acglobal.h 2017-11-12 19:46:13.000000000 +0100
7366+++ linux-4.14/drivers/acpi/acpica/acglobal.h 2018-09-05 11:05:07.000000000 +0200
7367@@ -116,7 +116,7 @@
7368 * interrupt level
7369 */
7370 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7371-ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7372+ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7373 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7374
7375 /* Mutex for _OSI support */
7376diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwregs.c linux-4.14/drivers/acpi/acpica/hwregs.c
7377--- linux-4.14.orig/drivers/acpi/acpica/hwregs.c 2017-11-12 19:46:13.000000000 +0100
7378+++ linux-4.14/drivers/acpi/acpica/hwregs.c 2018-09-05 11:05:07.000000000 +0200
7379@@ -428,14 +428,14 @@
7380 ACPI_BITMASK_ALL_FIXED_STATUS,
7381 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7382
7383- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7384+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7385
7386 /* Clear the fixed events in PM1 A/B */
7387
7388 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7389 ACPI_BITMASK_ALL_FIXED_STATUS);
7390
7391- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7392+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7393
7394 if (ACPI_FAILURE(status)) {
7395 goto exit;
7396diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwxface.c linux-4.14/drivers/acpi/acpica/hwxface.c
7397--- linux-4.14.orig/drivers/acpi/acpica/hwxface.c 2017-11-12 19:46:13.000000000 +0100
7398+++ linux-4.14/drivers/acpi/acpica/hwxface.c 2018-09-05 11:05:07.000000000 +0200
7399@@ -373,7 +373,7 @@
7400 return_ACPI_STATUS(AE_BAD_PARAMETER);
7401 }
7402
7403- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7404+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7405
7406 /*
7407 * At this point, we know that the parent register is one of the
7408@@ -434,7 +434,7 @@
7409
7410 unlock_and_exit:
7411
7412- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7413+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7414 return_ACPI_STATUS(status);
7415 }
7416
7417diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/utmutex.c linux-4.14/drivers/acpi/acpica/utmutex.c
7418--- linux-4.14.orig/drivers/acpi/acpica/utmutex.c 2017-11-12 19:46:13.000000000 +0100
7419+++ linux-4.14/drivers/acpi/acpica/utmutex.c 2018-09-05 11:05:07.000000000 +0200
7420@@ -88,7 +88,7 @@
7421 return_ACPI_STATUS (status);
7422 }
7423
7424- status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7425+ status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7426 if (ACPI_FAILURE (status)) {
7427 return_ACPI_STATUS (status);
7428 }
7429@@ -145,7 +145,7 @@
7430 /* Delete the spinlocks */
7431
7432 acpi_os_delete_lock(acpi_gbl_gpe_lock);
7433- acpi_os_delete_lock(acpi_gbl_hardware_lock);
7434+ acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7435 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7436
7437 /* Delete the reader/writer lock */
7438diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ata/libata-sff.c linux-4.14/drivers/ata/libata-sff.c
7439--- linux-4.14.orig/drivers/ata/libata-sff.c 2017-11-12 19:46:13.000000000 +0100
7440+++ linux-4.14/drivers/ata/libata-sff.c 2018-09-05 11:05:07.000000000 +0200
7441@@ -679,9 +679,9 @@
7442 unsigned long flags;
7443 unsigned int consumed;
7444
7445- local_irq_save(flags);
7446+ local_irq_save_nort(flags);
7447 consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7448- local_irq_restore(flags);
7449+ local_irq_restore_nort(flags);
7450
7451 return consumed;
7452 }
7453diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/brd.c linux-4.14/drivers/block/brd.c
7454--- linux-4.14.orig/drivers/block/brd.c 2017-11-12 19:46:13.000000000 +0100
7455+++ linux-4.14/drivers/block/brd.c 2018-09-05 11:05:07.000000000 +0200
7456@@ -60,7 +60,6 @@
7457 /*
7458 * Look up and return a brd's page for a given sector.
7459 */
7460-static DEFINE_MUTEX(brd_mutex);
7461 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7462 {
7463 pgoff_t idx;
7464diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.c linux-4.14/drivers/block/zram/zcomp.c
7465--- linux-4.14.orig/drivers/block/zram/zcomp.c 2017-11-12 19:46:13.000000000 +0100
7466+++ linux-4.14/drivers/block/zram/zcomp.c 2018-09-05 11:05:07.000000000 +0200
7467@@ -116,12 +116,20 @@
7468
7469 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7470 {
7471- return *get_cpu_ptr(comp->stream);
7472+ struct zcomp_strm *zstrm;
7473+
7474+ zstrm = *get_local_ptr(comp->stream);
7475+ spin_lock(&zstrm->zcomp_lock);
7476+ return zstrm;
7477 }
7478
7479 void zcomp_stream_put(struct zcomp *comp)
7480 {
7481- put_cpu_ptr(comp->stream);
7482+ struct zcomp_strm *zstrm;
7483+
7484+ zstrm = *this_cpu_ptr(comp->stream);
7485+ spin_unlock(&zstrm->zcomp_lock);
7486+ put_local_ptr(zstrm);
7487 }
7488
7489 int zcomp_compress(struct zcomp_strm *zstrm,
7490@@ -171,6 +179,7 @@
7491 pr_err("Can't allocate a compression stream\n");
7492 return -ENOMEM;
7493 }
7494+ spin_lock_init(&zstrm->zcomp_lock);
7495 *per_cpu_ptr(comp->stream, cpu) = zstrm;
7496 return 0;
7497 }
7498diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.h linux-4.14/drivers/block/zram/zcomp.h
7499--- linux-4.14.orig/drivers/block/zram/zcomp.h 2017-11-12 19:46:13.000000000 +0100
7500+++ linux-4.14/drivers/block/zram/zcomp.h 2018-09-05 11:05:07.000000000 +0200
7501@@ -14,6 +14,7 @@
7502 /* compression/decompression buffer */
7503 void *buffer;
7504 struct crypto_comp *tfm;
7505+ spinlock_t zcomp_lock;
7506 };
7507
7508 /* dynamic per-device compression frontend */
7509diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.c linux-4.14/drivers/block/zram/zram_drv.c
7510--- linux-4.14.orig/drivers/block/zram/zram_drv.c 2017-11-12 19:46:13.000000000 +0100
7511+++ linux-4.14/drivers/block/zram/zram_drv.c 2018-09-05 11:05:07.000000000 +0200
7512@@ -756,6 +756,30 @@
7513 static DEVICE_ATTR_RO(mm_stat);
7514 static DEVICE_ATTR_RO(debug_stat);
7515
7516+#ifdef CONFIG_PREEMPT_RT_BASE
7517+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7518+{
7519+ size_t index;
7520+
7521+ for (index = 0; index < num_pages; index++)
7522+ spin_lock_init(&zram->table[index].lock);
7523+}
7524+
7525+static void zram_slot_lock(struct zram *zram, u32 index)
7526+{
7527+ spin_lock(&zram->table[index].lock);
7528+ __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7529+}
7530+
7531+static void zram_slot_unlock(struct zram *zram, u32 index)
7532+{
7533+ __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7534+ spin_unlock(&zram->table[index].lock);
7535+}
7536+
7537+#else
7538+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7539+
7540 static void zram_slot_lock(struct zram *zram, u32 index)
7541 {
7542 bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7543@@ -765,6 +789,7 @@
7544 {
7545 bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7546 }
7547+#endif
7548
7549 static void zram_meta_free(struct zram *zram, u64 disksize)
7550 {
7551@@ -794,6 +819,7 @@
7552 return false;
7553 }
7554
7555+ zram_meta_init_table_locks(zram, num_pages);
7556 return true;
7557 }
7558
7559@@ -845,6 +871,7 @@
7560 unsigned long handle;
7561 unsigned int size;
7562 void *src, *dst;
7563+ struct zcomp_strm *zstrm;
7564
7565 if (zram_wb_enabled(zram)) {
7566 zram_slot_lock(zram, index);
7567@@ -879,6 +906,7 @@
7568
7569 size = zram_get_obj_size(zram, index);
7570
7571+ zstrm = zcomp_stream_get(zram->comp);
7572 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7573 if (size == PAGE_SIZE) {
7574 dst = kmap_atomic(page);
7575@@ -886,14 +914,13 @@
7576 kunmap_atomic(dst);
7577 ret = 0;
7578 } else {
7579- struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7580
7581 dst = kmap_atomic(page);
7582 ret = zcomp_decompress(zstrm, src, size, dst);
7583 kunmap_atomic(dst);
7584- zcomp_stream_put(zram->comp);
7585 }
7586 zs_unmap_object(zram->mem_pool, handle);
7587+ zcomp_stream_put(zram->comp);
7588 zram_slot_unlock(zram, index);
7589
7590 /* Should NEVER happen. Return bio error if it does. */
7591diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.h linux-4.14/drivers/block/zram/zram_drv.h
7592--- linux-4.14.orig/drivers/block/zram/zram_drv.h 2017-11-12 19:46:13.000000000 +0100
7593+++ linux-4.14/drivers/block/zram/zram_drv.h 2018-09-05 11:05:07.000000000 +0200
7594@@ -77,6 +77,9 @@
7595 unsigned long element;
7596 };
7597 unsigned long value;
7598+#ifdef CONFIG_PREEMPT_RT_BASE
7599+ spinlock_t lock;
7600+#endif
7601 };
7602
7603 struct zram_stats {
7604diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/random.c linux-4.14/drivers/char/random.c
7605--- linux-4.14.orig/drivers/char/random.c 2018-09-05 11:03:20.000000000 +0200
7606+++ linux-4.14/drivers/char/random.c 2018-09-05 11:05:07.000000000 +0200
7607@@ -265,6 +265,7 @@
7608 #include <linux/syscalls.h>
7609 #include <linux/completion.h>
7610 #include <linux/uuid.h>
7611+#include <linux/locallock.h>
7612 #include <crypto/chacha20.h>
7613
7614 #include <asm/processor.h>
7615@@ -856,7 +857,7 @@
7616 invalidate_batched_entropy();
7617 crng_init = 1;
7618 wake_up_interruptible(&crng_init_wait);
7619- pr_notice("random: fast init done\n");
7620+ /* pr_notice("random: fast init done\n"); */
7621 }
7622 return 1;
7623 }
7624@@ -941,17 +942,21 @@
7625 crng_init = 2;
7626 process_random_ready_list();
7627 wake_up_interruptible(&crng_init_wait);
7628- pr_notice("random: crng init done\n");
7629+ /* pr_notice("random: crng init done\n"); */
7630 if (unseeded_warning.missed) {
7631+#if 0
7632 pr_notice("random: %d get_random_xx warning(s) missed "
7633 "due to ratelimiting\n",
7634 unseeded_warning.missed);
7635+#endif
7636 unseeded_warning.missed = 0;
7637 }
7638 if (urandom_warning.missed) {
7639+#if 0
7640 pr_notice("random: %d urandom warning(s) missed "
7641 "due to ratelimiting\n",
7642 urandom_warning.missed);
7643+#endif
7644 urandom_warning.missed = 0;
7645 }
7646 }
7647@@ -1122,8 +1127,6 @@
7648 } sample;
7649 long delta, delta2, delta3;
7650
7651- preempt_disable();
7652-
7653 sample.jiffies = jiffies;
7654 sample.cycles = random_get_entropy();
7655 sample.num = num;
7656@@ -1164,7 +1167,6 @@
7657 */
7658 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7659 }
7660- preempt_enable();
7661 }
7662
7663 void add_input_randomness(unsigned int type, unsigned int code,
7664@@ -1221,28 +1223,27 @@
7665 return *ptr;
7666 }
7667
7668-void add_interrupt_randomness(int irq, int irq_flags)
7669+void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7670 {
7671 struct entropy_store *r;
7672 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
7673- struct pt_regs *regs = get_irq_regs();
7674 unsigned long now = jiffies;
7675 cycles_t cycles = random_get_entropy();
7676 __u32 c_high, j_high;
7677- __u64 ip;
7678 unsigned long seed;
7679 int credit = 0;
7680
7681 if (cycles == 0)
7682- cycles = get_reg(fast_pool, regs);
7683+ cycles = get_reg(fast_pool, NULL);
7684 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7685 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7686 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7687 fast_pool->pool[1] ^= now ^ c_high;
7688- ip = regs ? instruction_pointer(regs) : _RET_IP_;
7689+ if (!ip)
7690+ ip = _RET_IP_;
7691 fast_pool->pool[2] ^= ip;
7692 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7693- get_reg(fast_pool, regs);
7694+ get_reg(fast_pool, NULL);
7695
7696 fast_mix(fast_pool);
7697 add_interrupt_bench(cycles);
7698@@ -2200,6 +2201,7 @@
7699 * at any point prior.
7700 */
7701 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7702+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7703 u64 get_random_u64(void)
7704 {
7705 u64 ret;
7706@@ -2220,7 +2222,7 @@
7707 warn_unseeded_randomness(&previous);
7708
7709 use_lock = READ_ONCE(crng_init) < 2;
7710- batch = &get_cpu_var(batched_entropy_u64);
7711+ batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7712 if (use_lock)
7713 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7714 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7715@@ -2230,12 +2232,13 @@
7716 ret = batch->entropy_u64[batch->position++];
7717 if (use_lock)
7718 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7719- put_cpu_var(batched_entropy_u64);
7720+ put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7721 return ret;
7722 }
7723 EXPORT_SYMBOL(get_random_u64);
7724
7725 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7726+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7727 u32 get_random_u32(void)
7728 {
7729 u32 ret;
7730@@ -2250,7 +2253,7 @@
7731 warn_unseeded_randomness(&previous);
7732
7733 use_lock = READ_ONCE(crng_init) < 2;
7734- batch = &get_cpu_var(batched_entropy_u32);
7735+ batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7736 if (use_lock)
7737 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7738 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7739@@ -2260,7 +2263,7 @@
7740 ret = batch->entropy_u32[batch->position++];
7741 if (use_lock)
7742 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7743- put_cpu_var(batched_entropy_u32);
7744+ put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7745 return ret;
7746 }
7747 EXPORT_SYMBOL(get_random_u32);
7748diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/tpm/tpm_tis.c linux-4.14/drivers/char/tpm/tpm_tis.c
7749--- linux-4.14.orig/drivers/char/tpm/tpm_tis.c 2018-09-05 11:03:20.000000000 +0200
7750+++ linux-4.14/drivers/char/tpm/tpm_tis.c 2018-09-05 11:05:07.000000000 +0200
7751@@ -52,6 +52,31 @@
7752 return container_of(data, struct tpm_tis_tcg_phy, priv);
7753 }
7754
7755+#ifdef CONFIG_PREEMPT_RT_FULL
7756+/*
7757+ * Flushes previous write operations to chip so that a subsequent
7758+ * ioread*()s won't stall a cpu.
7759+ */
7760+static inline void tpm_tis_flush(void __iomem *iobase)
7761+{
7762+ ioread8(iobase + TPM_ACCESS(0));
7763+}
7764+#else
7765+#define tpm_tis_flush(iobase) do { } while (0)
7766+#endif
7767+
7768+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7769+{
7770+ iowrite8(b, iobase + addr);
7771+ tpm_tis_flush(iobase);
7772+}
7773+
7774+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7775+{
7776+ iowrite32(b, iobase + addr);
7777+ tpm_tis_flush(iobase);
7778+}
7779+
7780 static bool interrupts = true;
7781 module_param(interrupts, bool, 0444);
7782 MODULE_PARM_DESC(interrupts, "Enable interrupts");
7783@@ -149,7 +174,7 @@
7784 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7785
7786 while (len--)
7787- iowrite8(*value++, phy->iobase + addr);
7788+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
7789
7790 return 0;
7791 }
7792@@ -176,7 +201,7 @@
7793 {
7794 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7795
7796- iowrite32(value, phy->iobase + addr);
7797+ tpm_tis_iowrite32(value, phy->iobase, addr);
7798
7799 return 0;
7800 }
7801diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/tcb_clksrc.c linux-4.14/drivers/clocksource/tcb_clksrc.c
7802--- linux-4.14.orig/drivers/clocksource/tcb_clksrc.c 2017-11-12 19:46:13.000000000 +0100
7803+++ linux-4.14/drivers/clocksource/tcb_clksrc.c 2018-09-05 11:05:07.000000000 +0200
7804@@ -25,8 +25,7 @@
7805 * this 32 bit free-running counter. the second channel is not used.
7806 *
7807 * - The third channel may be used to provide a 16-bit clockevent
7808- * source, used in either periodic or oneshot mode. This runs
7809- * at 32 KiHZ, and can handle delays of up to two seconds.
7810+ * source, used in either periodic or oneshot mode.
7811 *
7812 * A boot clocksource and clockevent source are also currently needed,
7813 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7814@@ -126,6 +125,8 @@
7815 struct tc_clkevt_device {
7816 struct clock_event_device clkevt;
7817 struct clk *clk;
7818+ bool clk_enabled;
7819+ u32 freq;
7820 void __iomem *regs;
7821 };
7822
7823@@ -134,15 +135,26 @@
7824 return container_of(clkevt, struct tc_clkevt_device, clkevt);
7825 }
7826
7827-/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7828- * because using one of the divided clocks would usually mean the
7829- * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7830- *
7831- * A divided clock could be good for high resolution timers, since
7832- * 30.5 usec resolution can seem "low".
7833- */
7834 static u32 timer_clock;
7835
7836+static void tc_clk_disable(struct clock_event_device *d)
7837+{
7838+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7839+
7840+ clk_disable(tcd->clk);
7841+ tcd->clk_enabled = false;
7842+}
7843+
7844+static void tc_clk_enable(struct clock_event_device *d)
7845+{
7846+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7847+
7848+ if (tcd->clk_enabled)
7849+ return;
7850+ clk_enable(tcd->clk);
7851+ tcd->clk_enabled = true;
7852+}
7853+
7854 static int tc_shutdown(struct clock_event_device *d)
7855 {
7856 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7857@@ -150,8 +162,14 @@
7858
7859 writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7860 writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7861+ return 0;
7862+}
7863+
7864+static int tc_shutdown_clk_off(struct clock_event_device *d)
7865+{
7866+ tc_shutdown(d);
7867 if (!clockevent_state_detached(d))
7868- clk_disable(tcd->clk);
7869+ tc_clk_disable(d);
7870
7871 return 0;
7872 }
7873@@ -164,9 +182,9 @@
7874 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
7875 tc_shutdown(d);
7876
7877- clk_enable(tcd->clk);
7878+ tc_clk_enable(d);
7879
7880- /* slow clock, count up to RC, then irq and stop */
7881+ /* count up to RC, then irq and stop */
7882 writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
7883 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
7884 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7885@@ -186,12 +204,12 @@
7886 /* By not making the gentime core emulate periodic mode on top
7887 * of oneshot, we get lower overhead and improved accuracy.
7888 */
7889- clk_enable(tcd->clk);
7890+ tc_clk_enable(d);
7891
7892- /* slow clock, count up to RC, then irq and restart */
7893+ /* count up to RC, then irq and restart */
7894 writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
7895 regs + ATMEL_TC_REG(2, CMR));
7896- writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7897+ writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7898
7899 /* Enable clock and interrupts on RC compare */
7900 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7901@@ -218,9 +236,13 @@
7902 .features = CLOCK_EVT_FEAT_PERIODIC |
7903 CLOCK_EVT_FEAT_ONESHOT,
7904 /* Should be lower than at91rm9200's system timer */
7905+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7906 .rating = 125,
7907+#else
7908+ .rating = 200,
7909+#endif
7910 .set_next_event = tc_next_event,
7911- .set_state_shutdown = tc_shutdown,
7912+ .set_state_shutdown = tc_shutdown_clk_off,
7913 .set_state_periodic = tc_set_periodic,
7914 .set_state_oneshot = tc_set_oneshot,
7915 },
7916@@ -240,8 +262,9 @@
7917 return IRQ_NONE;
7918 }
7919
7920-static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
7921+static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
7922 {
7923+ unsigned divisor = atmel_tc_divisors[divisor_idx];
7924 int ret;
7925 struct clk *t2_clk = tc->clk[2];
7926 int irq = tc->irq[2];
7927@@ -262,7 +285,11 @@
7928 clkevt.regs = tc->regs;
7929 clkevt.clk = t2_clk;
7930
7931- timer_clock = clk32k_divisor_idx;
7932+ timer_clock = divisor_idx;
7933+ if (!divisor)
7934+ clkevt.freq = 32768;
7935+ else
7936+ clkevt.freq = clk_get_rate(t2_clk) / divisor;
7937
7938 clkevt.clkevt.cpumask = cpumask_of(0);
7939
7940@@ -273,7 +300,7 @@
7941 return ret;
7942 }
7943
7944- clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
7945+ clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
7946
7947 return ret;
7948 }
7949@@ -410,7 +437,11 @@
7950 goto err_disable_t1;
7951
7952 /* channel 2: periodic and oneshot timer support */
7953+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7954 ret = setup_clkevents(tc, clk32k_divisor_idx);
7955+#else
7956+ ret = setup_clkevents(tc, best_divisor_idx);
7957+#endif
7958 if (ret)
7959 goto err_unregister_clksrc;
7960
7961diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c linux-4.14/drivers/clocksource/timer-atmel-pit.c
7962--- linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c 2017-11-12 19:46:13.000000000 +0100
7963+++ linux-4.14/drivers/clocksource/timer-atmel-pit.c 2018-09-05 11:05:07.000000000 +0200
7964@@ -46,6 +46,7 @@
7965 u32 cycle;
7966 u32 cnt;
7967 unsigned int irq;
7968+ bool irq_requested;
7969 struct clk *mck;
7970 };
7971
7972@@ -96,15 +97,29 @@
7973
7974 /* disable irq, leaving the clocksource active */
7975 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
7976+ if (data->irq_requested) {
7977+ free_irq(data->irq, data);
7978+ data->irq_requested = false;
7979+ }
7980 return 0;
7981 }
7982
7983+static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
7984 /*
7985 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
7986 */
7987 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
7988 {
7989 struct pit_data *data = clkevt_to_pit_data(dev);
7990+ int ret;
7991+
7992+ ret = request_irq(data->irq, at91sam926x_pit_interrupt,
7993+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
7994+ "at91_tick", data);
7995+ if (ret)
7996+ panic(pr_fmt("Unable to setup IRQ\n"));
7997+
7998+ data->irq_requested = true;
7999
8000 /* update clocksource counter */
8001 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8002@@ -230,15 +245,6 @@
8003 return ret;
8004 }
8005
8006- /* Set up irq handler */
8007- ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8008- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8009- "at91_tick", data);
8010- if (ret) {
8011- pr_err("Unable to setup IRQ\n");
8012- return ret;
8013- }
8014-
8015 /* Set up and register clockevents */
8016 data->clkevt.name = "pit";
8017 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8018diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-st.c linux-4.14/drivers/clocksource/timer-atmel-st.c
8019--- linux-4.14.orig/drivers/clocksource/timer-atmel-st.c 2017-11-12 19:46:13.000000000 +0100
8020+++ linux-4.14/drivers/clocksource/timer-atmel-st.c 2018-09-05 11:05:07.000000000 +0200
8021@@ -115,18 +115,29 @@
8022 last_crtr = read_CRTR();
8023 }
8024
8025+static int atmel_st_irq;
8026+
8027 static int clkevt32k_shutdown(struct clock_event_device *evt)
8028 {
8029 clkdev32k_disable_and_flush_irq();
8030 irqmask = 0;
8031 regmap_write(regmap_st, AT91_ST_IER, irqmask);
8032+ free_irq(atmel_st_irq, regmap_st);
8033 return 0;
8034 }
8035
8036 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8037 {
8038+ int ret;
8039+
8040 clkdev32k_disable_and_flush_irq();
8041
8042+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8043+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8044+ "at91_tick", regmap_st);
8045+ if (ret)
8046+ panic(pr_fmt("Unable to setup IRQ\n"));
8047+
8048 /*
8049 * ALM for oneshot irqs, set by next_event()
8050 * before 32 seconds have passed.
8051@@ -139,8 +150,16 @@
8052
8053 static int clkevt32k_set_periodic(struct clock_event_device *dev)
8054 {
8055+ int ret;
8056+
8057 clkdev32k_disable_and_flush_irq();
8058
8059+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8060+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8061+ "at91_tick", regmap_st);
8062+ if (ret)
8063+ panic(pr_fmt("Unable to setup IRQ\n"));
8064+
8065 /* PIT for periodic irqs; fixed rate of 1/HZ */
8066 irqmask = AT91_ST_PITS;
8067 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8068@@ -198,7 +217,7 @@
8069 {
8070 struct clk *sclk;
8071 unsigned int sclk_rate, val;
8072- int irq, ret;
8073+ int ret;
8074
8075 regmap_st = syscon_node_to_regmap(node);
8076 if (IS_ERR(regmap_st)) {
8077@@ -212,21 +231,12 @@
8078 regmap_read(regmap_st, AT91_ST_SR, &val);
8079
8080 /* Get the interrupts property */
8081- irq = irq_of_parse_and_map(node, 0);
8082- if (!irq) {
8083+ atmel_st_irq = irq_of_parse_and_map(node, 0);
8084+ if (!atmel_st_irq) {
8085 pr_err("Unable to get IRQ from DT\n");
8086 return -EINVAL;
8087 }
8088
8089- /* Make IRQs happen for the system timer */
8090- ret = request_irq(irq, at91rm9200_timer_interrupt,
8091- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8092- "at91_tick", regmap_st);
8093- if (ret) {
8094- pr_err("Unable to setup IRQ\n");
8095- return ret;
8096- }
8097-
8098 sclk = of_clk_get(node, 0);
8099 if (IS_ERR(sclk)) {
8100 pr_err("Unable to get slow clock\n");
8101diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/connector/cn_proc.c linux-4.14/drivers/connector/cn_proc.c
8102--- linux-4.14.orig/drivers/connector/cn_proc.c 2017-11-12 19:46:13.000000000 +0100
8103+++ linux-4.14/drivers/connector/cn_proc.c 2018-09-05 11:05:07.000000000 +0200
8104@@ -32,6 +32,7 @@
8105 #include <linux/pid_namespace.h>
8106
8107 #include <linux/cn_proc.h>
8108+#include <linux/locallock.h>
8109
8110 /*
8111 * Size of a cn_msg followed by a proc_event structure. Since the
8112@@ -54,10 +55,11 @@
8113
8114 /* proc_event_counts is used as the sequence number of the netlink message */
8115 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8116+static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8117
8118 static inline void send_msg(struct cn_msg *msg)
8119 {
8120- preempt_disable();
8121+ local_lock(send_msg_lock);
8122
8123 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8124 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8125@@ -70,7 +72,7 @@
8126 */
8127 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8128
8129- preempt_enable();
8130+ local_unlock(send_msg_lock);
8131 }
8132
8133 void proc_fork_connector(struct task_struct *task)
8134diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/cpufreq/Kconfig.x86 linux-4.14/drivers/cpufreq/Kconfig.x86
8135--- linux-4.14.orig/drivers/cpufreq/Kconfig.x86 2017-11-12 19:46:13.000000000 +0100
8136+++ linux-4.14/drivers/cpufreq/Kconfig.x86 2018-09-05 11:05:07.000000000 +0200
8137@@ -125,7 +125,7 @@
1a6e0f06
JK
8138
8139 config X86_POWERNOW_K8
8140 tristate "AMD Opteron/Athlon64 PowerNow!"
8141- depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8142+ depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8143 help
8144 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8145 Support for K10 and newer processors is now in acpi-cpufreq.
8146diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c
8147--- linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c 2017-11-12 19:46:13.000000000 +0100
8148+++ linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c 2018-09-05 11:05:07.000000000 +0200
8149@@ -33,11 +33,8 @@
8150 {
8151 tl->fence_context = context;
8152 tl->common = parent;
8153-#ifdef CONFIG_DEBUG_SPINLOCK
8154- __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8155-#else
8156 spin_lock_init(&tl->lock);
8157-#endif
8158+ lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8159 init_request_active(&tl->last_request, NULL);
8160 INIT_LIST_HEAD(&tl->requests);
8161 i915_syncmap_init(&tl->sync);
8162diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c linux-4.14/drivers/gpu/drm/i915/i915_irq.c
8163--- linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c 2018-09-05 11:03:21.000000000 +0200
8164+++ linux-4.14/drivers/gpu/drm/i915/i915_irq.c 2018-09-05 11:05:07.000000000 +0200
8165@@ -867,6 +867,7 @@
1a6e0f06
JK
8166 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8167
8168 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8169+ preempt_disable_rt();
8170
8171 /* Get optional system timestamp before query. */
8172 if (stime)
8173@@ -918,6 +919,7 @@
8174 *etime = ktime_get();
8175
8176 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8177+ preempt_enable_rt();
8178
8179 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8180
8181diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c linux-4.14/drivers/gpu/drm/i915/intel_sprite.c
8182--- linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c 2018-09-05 11:03:21.000000000 +0200
8183+++ linux-4.14/drivers/gpu/drm/i915/intel_sprite.c 2018-09-05 11:05:07.000000000 +0200
8184@@ -36,6 +36,7 @@
8185 #include <drm/drm_rect.h>
8186 #include <drm/drm_atomic.h>
8187 #include <drm/drm_plane_helper.h>
8188+#include <linux/locallock.h>
8189 #include "intel_drv.h"
8190 #include "intel_frontbuffer.h"
8191 #include <drm/i915_drm.h>
8192@@ -67,7 +68,7 @@
8193 }
8194
8195 #define VBLANK_EVASION_TIME_US 100
8196-
8197+static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8198 /**
8199 * intel_pipe_update_start() - start update of a set of display registers
8200 * @crtc: the crtc of which the registers are going to be updated
8201@@ -102,7 +103,7 @@
8202 VBLANK_EVASION_TIME_US);
8203 max = vblank_start - 1;
8204
8205- local_irq_disable();
8206+ local_lock_irq(pipe_update_lock);
8207
8208 if (min <= 0 || max <= 0)
8209 return;
8210@@ -132,11 +133,11 @@
8211 break;
8212 }
8213
8214- local_irq_enable();
8215+ local_unlock_irq(pipe_update_lock);
8216
8217 timeout = schedule_timeout(timeout);
8218
8219- local_irq_disable();
8220+ local_lock_irq(pipe_update_lock);
8221 }
8222
8223 finish_wait(wq, &wait);
8224@@ -201,7 +202,7 @@
8225 crtc->base.state->event = NULL;
8226 }
8227
8228- local_irq_enable();
8229+ local_unlock_irq(pipe_update_lock);
8230
8231 if (intel_vgpu_active(dev_priv))
8232 return;
8233diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c linux-4.14/drivers/gpu/drm/radeon/radeon_display.c
8234--- linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c 2017-11-12 19:46:13.000000000 +0100
8235+++ linux-4.14/drivers/gpu/drm/radeon/radeon_display.c 2018-09-05 11:05:07.000000000 +0200
8236@@ -1839,6 +1839,7 @@
8237 struct radeon_device *rdev = dev->dev_private;
8238
8239 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8240+ preempt_disable_rt();
8241
8242 /* Get optional system timestamp before query. */
8243 if (stime)
8244@@ -1931,6 +1932,7 @@
8245 *etime = ktime_get();
8246
8247 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8248+ preempt_enable_rt();
8249
8250 /* Decode into vertical and horizontal scanout position. */
8251 *vpos = position & 0x1fff;
8252diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/hv/vmbus_drv.c linux-4.14/drivers/hv/vmbus_drv.c
8253--- linux-4.14.orig/drivers/hv/vmbus_drv.c 2018-09-05 11:03:21.000000000 +0200
8254+++ linux-4.14/drivers/hv/vmbus_drv.c 2018-09-05 11:05:37.000000000 +0200
8255@@ -39,6 +39,7 @@
8256 #include <asm/hyperv.h>
8257 #include <asm/hypervisor.h>
8258 #include <asm/mshyperv.h>
8259+#include <asm/irq_regs.h>
8260 #include <linux/notifier.h>
8261 #include <linux/ptrace.h>
8262 #include <linux/screen_info.h>
8263@@ -966,6 +967,8 @@
8264 void *page_addr = hv_cpu->synic_event_page;
8265 struct hv_message *msg;
8266 union hv_synic_event_flags *event;
8267+ struct pt_regs *regs = get_irq_regs();
8268+ u64 ip = regs ? instruction_pointer(regs) : 0;
8269 bool handled = false;
8270
8271 if (unlikely(page_addr == NULL))
8272@@ -1009,7 +1012,7 @@
8273 tasklet_schedule(&hv_cpu->msg_dpc);
8274 }
8275
8276- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8277+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
8278 }
8279
8280
8281diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/alim15x3.c linux-4.14/drivers/ide/alim15x3.c
8282--- linux-4.14.orig/drivers/ide/alim15x3.c 2017-11-12 19:46:13.000000000 +0100
8283+++ linux-4.14/drivers/ide/alim15x3.c 2018-09-05 11:05:07.000000000 +0200
8284@@ -234,7 +234,7 @@
8285
8286 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8287
8288- local_irq_save(flags);
8289+ local_irq_save_nort(flags);
8290
8291 if (m5229_revision < 0xC2) {
8292 /*
8293@@ -325,7 +325,7 @@
8294 }
8295 pci_dev_put(north);
8296 pci_dev_put(isa_dev);
8297- local_irq_restore(flags);
8298+ local_irq_restore_nort(flags);
8299 return 0;
8300 }
8301
8302diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/hpt366.c linux-4.14/drivers/ide/hpt366.c
8303--- linux-4.14.orig/drivers/ide/hpt366.c 2017-11-12 19:46:13.000000000 +0100
8304+++ linux-4.14/drivers/ide/hpt366.c 2018-09-05 11:05:07.000000000 +0200
8305@@ -1236,7 +1236,7 @@
8306
8307 dma_old = inb(base + 2);
8308
8309- local_irq_save(flags);
8310+ local_irq_save_nort(flags);
8311
8312 dma_new = dma_old;
8313 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8314@@ -1247,7 +1247,7 @@
8315 if (dma_new != dma_old)
8316 outb(dma_new, base + 2);
8317
8318- local_irq_restore(flags);
8319+ local_irq_restore_nort(flags);
8320
8321 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
8322 hwif->name, base, base + 7);
8323diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io.c linux-4.14/drivers/ide/ide-io.c
8324--- linux-4.14.orig/drivers/ide/ide-io.c 2017-11-12 19:46:13.000000000 +0100
8325+++ linux-4.14/drivers/ide/ide-io.c 2018-09-05 11:05:07.000000000 +0200
8326@@ -660,7 +660,7 @@
8327 /* disable_irq_nosync ?? */
8328 disable_irq(hwif->irq);
8329 /* local CPU only, as if we were handling an interrupt */
8330- local_irq_disable();
8331+ local_irq_disable_nort();
8332 if (hwif->polling) {
8333 startstop = handler(drive);
8334 } else if (drive_is_ready(drive)) {
8335diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-iops.c linux-4.14/drivers/ide/ide-iops.c
8336--- linux-4.14.orig/drivers/ide/ide-iops.c 2017-11-12 19:46:13.000000000 +0100
8337+++ linux-4.14/drivers/ide/ide-iops.c 2018-09-05 11:05:07.000000000 +0200
8338@@ -129,12 +129,12 @@
8339 if ((stat & ATA_BUSY) == 0)
8340 break;
8341
8342- local_irq_restore(flags);
8343+ local_irq_restore_nort(flags);
8344 *rstat = stat;
8345 return -EBUSY;
8346 }
8347 }
8348- local_irq_restore(flags);
8349+ local_irq_restore_nort(flags);
8350 }
8351 /*
8352 * Allow status to settle, then read it again.
8353diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io-std.c linux-4.14/drivers/ide/ide-io-std.c
8354--- linux-4.14.orig/drivers/ide/ide-io-std.c 2017-11-12 19:46:13.000000000 +0100
8355+++ linux-4.14/drivers/ide/ide-io-std.c 2018-09-05 11:05:07.000000000 +0200
8356@@ -175,7 +175,7 @@
8357 unsigned long uninitialized_var(flags);
8358
8359 if ((io_32bit & 2) && !mmio) {
8360- local_irq_save(flags);
8361+ local_irq_save_nort(flags);
8362 ata_vlb_sync(io_ports->nsect_addr);
8363 }
8364
8365@@ -186,7 +186,7 @@
8366 insl(data_addr, buf, words);
8367
8368 if ((io_32bit & 2) && !mmio)
8369- local_irq_restore(flags);
8370+ local_irq_restore_nort(flags);
8371
8372 if (((len + 1) & 3) < 2)
8373 return;
8374@@ -219,7 +219,7 @@
8375 unsigned long uninitialized_var(flags);
8376
8377 if ((io_32bit & 2) && !mmio) {
8378- local_irq_save(flags);
8379+ local_irq_save_nort(flags);
8380 ata_vlb_sync(io_ports->nsect_addr);
8381 }
8382
8383@@ -230,7 +230,7 @@
8384 outsl(data_addr, buf, words);
8385
8386 if ((io_32bit & 2) && !mmio)
8387- local_irq_restore(flags);
8388+ local_irq_restore_nort(flags);
8389
8390 if (((len + 1) & 3) < 2)
8391 return;
8392diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-probe.c linux-4.14/drivers/ide/ide-probe.c
8393--- linux-4.14.orig/drivers/ide/ide-probe.c 2017-11-12 19:46:13.000000000 +0100
8394+++ linux-4.14/drivers/ide/ide-probe.c 2018-09-05 11:05:07.000000000 +0200
8395@@ -196,10 +196,10 @@
8396 int bswap = 1;
8397
8398 /* local CPU only; some systems need this */
8399- local_irq_save(flags);
8400+ local_irq_save_nort(flags);
8401 /* read 512 bytes of id info */
8402 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8403- local_irq_restore(flags);
8404+ local_irq_restore_nort(flags);
8405
8406 drive->dev_flags |= IDE_DFLAG_ID_READ;
8407 #ifdef DEBUG
8408diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-taskfile.c linux-4.14/drivers/ide/ide-taskfile.c
8409--- linux-4.14.orig/drivers/ide/ide-taskfile.c 2017-11-12 19:46:13.000000000 +0100
8410+++ linux-4.14/drivers/ide/ide-taskfile.c 2018-09-05 11:05:07.000000000 +0200
8411@@ -251,7 +251,7 @@
8412
8413 page_is_high = PageHighMem(page);
8414 if (page_is_high)
8415- local_irq_save(flags);
8416+ local_irq_save_nort(flags);
8417
8418 buf = kmap_atomic(page) + offset;
8419
8420@@ -272,7 +272,7 @@
8421 kunmap_atomic(buf);
8422
8423 if (page_is_high)
8424- local_irq_restore(flags);
8425+ local_irq_restore_nort(flags);
8426
8427 len -= nr_bytes;
8428 }
8429@@ -415,7 +415,7 @@
8430 }
8431
8432 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8433- local_irq_disable();
8434+ local_irq_disable_nort();
8435
8436 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8437
8438diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c linux-4.14/drivers/infiniband/hw/hfi1/affinity.c
8439--- linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c 2018-09-05 11:03:22.000000000 +0200
8440+++ linux-4.14/drivers/infiniband/hw/hfi1/affinity.c 2018-09-05 11:05:07.000000000 +0200
8441@@ -575,7 +575,7 @@
8442 struct hfi1_affinity_node *entry;
8443 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8444 const struct cpumask *node_mask,
8445- *proc_mask = &current->cpus_allowed;
8446+ *proc_mask = current->cpus_ptr;
8447 struct hfi1_affinity_node_list *affinity = &node_affinity;
8448 struct cpu_mask_set *set = &affinity->proc;
8449
8450@@ -583,7 +583,7 @@
8451 * check whether process/context affinity has already
8452 * been set
8453 */
8454- if (cpumask_weight(proc_mask) == 1) {
8455+ if (current->nr_cpus_allowed == 1) {
8456 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8457 current->pid, current->comm,
8458 cpumask_pr_args(proc_mask));
8459@@ -594,7 +594,7 @@
8460 cpu = cpumask_first(proc_mask);
8461 cpumask_set_cpu(cpu, &set->used);
8462 goto done;
8463- } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8464+ } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8465 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8466 current->pid, current->comm,
8467 cpumask_pr_args(proc_mask));
8468diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c linux-4.14/drivers/infiniband/hw/hfi1/sdma.c
8469--- linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c 2017-11-12 19:46:13.000000000 +0100
8470+++ linux-4.14/drivers/infiniband/hw/hfi1/sdma.c 2018-09-05 11:05:07.000000000 +0200
8471@@ -856,14 +856,13 @@
8472 {
8473 struct sdma_rht_node *rht_node;
8474 struct sdma_engine *sde = NULL;
8475- const struct cpumask *current_mask = &current->cpus_allowed;
8476 unsigned long cpu_id;
8477
8478 /*
8479 * To ensure that always the same sdma engine(s) will be
8480 * selected make sure the process is pinned to this CPU only.
8481 */
8482- if (cpumask_weight(current_mask) != 1)
8483+ if (current->nr_cpus_allowed != 1)
8484 goto out;
8485
8486 cpu_id = smp_processor_id();
8487diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c
8488--- linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c 2018-09-05 11:03:22.000000000 +0200
8489+++ linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c 2018-09-05 11:05:07.000000000 +0200
8490@@ -1167,7 +1167,7 @@
8491 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8492 {
8493 struct qib_filedata *fd = fp->private_data;
8494- const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8495+ const unsigned int weight = current->nr_cpus_allowed;
8496 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8497 int local_cpu;
8498
8499@@ -1648,9 +1648,8 @@
8500 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8501 else {
8502 int unit;
8503- const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8504- const unsigned int weight =
8505- cpumask_weight(&current->cpus_allowed);
8506+ const unsigned int cpu = cpumask_first(current->cpus_ptr);
8507+ const unsigned int weight = current->nr_cpus_allowed;
8508
8509 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8510 if (!find_hca(cpu, &unit) && unit >= 0)
8511diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8512--- linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2018-09-05 11:03:22.000000000 +0200
8513+++ linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2018-09-05 11:05:07.000000000 +0200
8514@@ -898,7 +898,7 @@
8515
8516 ipoib_dbg_mcast(priv, "restarting multicast task\n");
8517
8518- local_irq_save(flags);
8519+ local_irq_save_nort(flags);
8520 netif_addr_lock(dev);
8521 spin_lock(&priv->lock);
8522
8523@@ -980,7 +980,7 @@
8524
8525 spin_unlock(&priv->lock);
8526 netif_addr_unlock(dev);
8527- local_irq_restore(flags);
8528+ local_irq_restore_nort(flags);
8529
8530 ipoib_mcast_remove_list(&remove_list);
8531
8532diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/input/gameport/gameport.c linux-4.14/drivers/input/gameport/gameport.c
8533--- linux-4.14.orig/drivers/input/gameport/gameport.c 2017-11-12 19:46:13.000000000 +0100
8534+++ linux-4.14/drivers/input/gameport/gameport.c 2018-09-05 11:05:07.000000000 +0200
8535@@ -91,13 +91,13 @@
8536 tx = ~0;
8537
8538 for (i = 0; i < 50; i++) {
8539- local_irq_save(flags);
8540+ local_irq_save_nort(flags);
8541 t1 = ktime_get_ns();
8542 for (t = 0; t < 50; t++)
8543 gameport_read(gameport);
8544 t2 = ktime_get_ns();
8545 t3 = ktime_get_ns();
8546- local_irq_restore(flags);
8547+ local_irq_restore_nort(flags);
8548 udelay(i * 10);
8549 t = (t2 - t1) - (t3 - t2);
8550 if (t < tx)
8551@@ -124,12 +124,12 @@
8552 tx = 1 << 30;
8553
8554 for(i = 0; i < 50; i++) {
8555- local_irq_save(flags);
8556+ local_irq_save_nort(flags);
8557 GET_TIME(t1);
8558 for (t = 0; t < 50; t++) gameport_read(gameport);
8559 GET_TIME(t2);
8560 GET_TIME(t3);
8561- local_irq_restore(flags);
8562+ local_irq_restore_nort(flags);
8563 udelay(i * 10);
8564 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8565 }
8566@@ -148,11 +148,11 @@
8567 tx = 1 << 30;
8568
8569 for(i = 0; i < 50; i++) {
8570- local_irq_save(flags);
8571+ local_irq_save_nort(flags);
8572 t1 = rdtsc();
8573 for (t = 0; t < 50; t++) gameport_read(gameport);
8574 t2 = rdtsc();
8575- local_irq_restore(flags);
8576+ local_irq_restore_nort(flags);
8577 udelay(i * 10);
8578 if (t2 - t1 < tx) tx = t2 - t1;
8579 }
8580diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu.c linux-4.14/drivers/iommu/amd_iommu.c
8581--- linux-4.14.orig/drivers/iommu/amd_iommu.c 2018-09-05 11:03:22.000000000 +0200
8582+++ linux-4.14/drivers/iommu/amd_iommu.c 2018-09-05 11:05:07.000000000 +0200
8583@@ -81,11 +81,12 @@
8584 */
8585 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
8586
8587-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8588+static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8589+static DEFINE_SPINLOCK(pd_bitmap_lock);
8590+static DEFINE_SPINLOCK(iommu_table_lock);
8591
8592 /* List of all available dev_data structures */
8593-static LIST_HEAD(dev_data_list);
8594-static DEFINE_SPINLOCK(dev_data_list_lock);
8595+static LLIST_HEAD(dev_data_list);
8596
8597 LIST_HEAD(ioapic_map);
8598 LIST_HEAD(hpet_map);
8599@@ -204,40 +205,33 @@
8600 static struct iommu_dev_data *alloc_dev_data(u16 devid)
8601 {
8602 struct iommu_dev_data *dev_data;
8603- unsigned long flags;
8604
8605 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8606 if (!dev_data)
8607 return NULL;
8608
8609 dev_data->devid = devid;
8610-
8611- spin_lock_irqsave(&dev_data_list_lock, flags);
8612- list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8613- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8614-
8615 ratelimit_default_init(&dev_data->rs);
8616
8617+ llist_add(&dev_data->dev_data_list, &dev_data_list);
8618 return dev_data;
8619 }
8620
8621 static struct iommu_dev_data *search_dev_data(u16 devid)
8622 {
8623 struct iommu_dev_data *dev_data;
8624- unsigned long flags;
8625+ struct llist_node *node;
8626
8627- spin_lock_irqsave(&dev_data_list_lock, flags);
8628- list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8629+ if (llist_empty(&dev_data_list))
8630+ return NULL;
8631+
8632+ node = dev_data_list.first;
8633+ llist_for_each_entry(dev_data, node, dev_data_list) {
8634 if (dev_data->devid == devid)
8635- goto out_unlock;
8636+ return dev_data;
8637 }
8638
8639- dev_data = NULL;
8640-
8641-out_unlock:
8642- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8643-
8644- return dev_data;
8645+ return NULL;
8646 }
8647
8648 static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8649@@ -1056,9 +1050,9 @@
8650 unsigned long flags;
8651 int ret;
8652
8653- spin_lock_irqsave(&iommu->lock, flags);
8654+ raw_spin_lock_irqsave(&iommu->lock, flags);
8655 ret = __iommu_queue_command_sync(iommu, cmd, sync);
8656- spin_unlock_irqrestore(&iommu->lock, flags);
8657+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8658
8659 return ret;
8660 }
8661@@ -1084,7 +1078,7 @@
8662
8663 build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8664
8665- spin_lock_irqsave(&iommu->lock, flags);
8666+ raw_spin_lock_irqsave(&iommu->lock, flags);
8667
8668 iommu->cmd_sem = 0;
8669
8670@@ -1095,7 +1089,7 @@
8671 ret = wait_on_sem(&iommu->cmd_sem);
8672
8673 out_unlock:
8674- spin_unlock_irqrestore(&iommu->lock, flags);
8675+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8676
8677 return ret;
8678 }
8679@@ -1604,29 +1598,26 @@
8680
8681 static u16 domain_id_alloc(void)
8682 {
8683- unsigned long flags;
8684 int id;
8685
8686- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8687+ spin_lock(&pd_bitmap_lock);
8688 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8689 BUG_ON(id == 0);
8690 if (id > 0 && id < MAX_DOMAIN_ID)
8691 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8692 else
8693 id = 0;
8694- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8695+ spin_unlock(&pd_bitmap_lock);
8696
8697 return id;
8698 }
8699
8700 static void domain_id_free(int id)
8701 {
8702- unsigned long flags;
8703-
8704- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8705+ spin_lock(&pd_bitmap_lock);
8706 if (id > 0 && id < MAX_DOMAIN_ID)
8707 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8708- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8709+ spin_unlock(&pd_bitmap_lock);
8710 }
8711
8712 #define DEFINE_FREE_PT_FN(LVL, FN) \
8713@@ -1946,10 +1937,10 @@
8714 int ret;
8715
8716 /*
8717- * Must be called with IRQs disabled. Warn here to detect early
8718- * when its not.
8719+ * Must be called with IRQs disabled on a non RT kernel. Warn here to
8720+ * detect early when its not.
8721 */
8722- WARN_ON(!irqs_disabled());
8723+ WARN_ON_NONRT(!irqs_disabled());
8724
8725 /* lock domain */
8726 spin_lock(&domain->lock);
8727@@ -2095,9 +2086,9 @@
8728 }
8729
8730 skip_ats_check:
8731- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8732+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8733 ret = __attach_device(dev_data, domain);
8734- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8735+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8736
8737 /*
8738 * We might boot into a crash-kernel here. The crashed kernel
8739@@ -2117,10 +2108,10 @@
8740 struct protection_domain *domain;
8741
8742 /*
8743- * Must be called with IRQs disabled. Warn here to detect early
8744- * when its not.
8745+ * Must be called with IRQs disabled on a non RT kernel. Warn here to
8746+ * detect early when its not.
8747 */
8748- WARN_ON(!irqs_disabled());
8749+ WARN_ON_NONRT(!irqs_disabled());
8750
8751 if (WARN_ON(!dev_data->domain))
8752 return;
8753@@ -2147,9 +2138,9 @@
8754 domain = dev_data->domain;
8755
8756 /* lock device table */
8757- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8758+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8759 __detach_device(dev_data);
8760- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8761+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8762
8763 if (!dev_is_pci(dev))
8764 return;
8765@@ -2813,7 +2804,7 @@
8766 struct iommu_dev_data *entry;
8767 unsigned long flags;
8768
8769- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8770+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8771
8772 while (!list_empty(&domain->dev_list)) {
8773 entry = list_first_entry(&domain->dev_list,
8774@@ -2821,7 +2812,7 @@
8775 __detach_device(entry);
8776 }
8777
8778- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8779+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8780 }
8781
8782 static void protection_domain_free(struct protection_domain *domain)
8783@@ -3588,14 +3579,62 @@
8784 amd_iommu_dev_table[devid].data[2] = dte;
8785 }
8786
8787-static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8788+static struct irq_remap_table *get_irq_table(u16 devid)
8789+{
8790+ struct irq_remap_table *table;
8791+
8792+ if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8793+ "%s: no iommu for devid %x\n", __func__, devid))
8794+ return NULL;
8795+
8796+ table = irq_lookup_table[devid];
8797+ if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8798+ return NULL;
8799+
8800+ return table;
8801+}
8802+
8803+static struct irq_remap_table *__alloc_irq_table(void)
8804+{
8805+ struct irq_remap_table *table;
8806+
8807+ table = kzalloc(sizeof(*table), GFP_KERNEL);
8808+ if (!table)
8809+ return NULL;
8810+
8811+ table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8812+ if (!table->table) {
8813+ kfree(table);
8814+ return NULL;
8815+ }
8816+ raw_spin_lock_init(&table->lock);
8817+
8818+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8819+ memset(table->table, 0,
8820+ MAX_IRQS_PER_TABLE * sizeof(u32));
8821+ else
8822+ memset(table->table, 0,
8823+ (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8824+ return table;
8825+}
8826+
8827+static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
8828+ struct irq_remap_table *table)
8829+{
8830+ irq_lookup_table[devid] = table;
8831+ set_dte_irq_entry(devid, table);
8832+ iommu_flush_dte(iommu, devid);
8833+}
8834+
8835+static struct irq_remap_table *alloc_irq_table(u16 devid)
8836 {
8837 struct irq_remap_table *table = NULL;
8838+ struct irq_remap_table *new_table = NULL;
8839 struct amd_iommu *iommu;
8840 unsigned long flags;
8841 u16 alias;
8842
8843- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8844+ spin_lock_irqsave(&iommu_table_lock, flags);
8845
8846 iommu = amd_iommu_rlookup_table[devid];
8847 if (!iommu)
8848@@ -3608,60 +3647,45 @@
8849 alias = amd_iommu_alias_table[devid];
8850 table = irq_lookup_table[alias];
8851 if (table) {
8852- irq_lookup_table[devid] = table;
8853- set_dte_irq_entry(devid, table);
8854- iommu_flush_dte(iommu, devid);
8855- goto out;
8856+ set_remap_table_entry(iommu, devid, table);
8857+ goto out_wait;
8858 }
8859+ spin_unlock_irqrestore(&iommu_table_lock, flags);
8860
8861 /* Nothing there yet, allocate new irq remapping table */
8862- table = kzalloc(sizeof(*table), GFP_ATOMIC);
8863- if (!table)
8864- goto out_unlock;
8865-
8866- /* Initialize table spin-lock */
8867- spin_lock_init(&table->lock);
8868+ new_table = __alloc_irq_table();
8869+ if (!new_table)
8870+ return NULL;
8871
8872- if (ioapic)
8873- /* Keep the first 32 indexes free for IOAPIC interrupts */
8874- table->min_index = 32;
8875+ spin_lock_irqsave(&iommu_table_lock, flags);
8876
8877- table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
8878- if (!table->table) {
8879- kfree(table);
8880- table = NULL;
8881+ table = irq_lookup_table[devid];
8882+ if (table)
8883 goto out_unlock;
8884- }
8885-
8886- if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8887- memset(table->table, 0,
8888- MAX_IRQS_PER_TABLE * sizeof(u32));
8889- else
8890- memset(table->table, 0,
8891- (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8892-
8893- if (ioapic) {
8894- int i;
8895
8896- for (i = 0; i < 32; ++i)
8897- iommu->irte_ops->set_allocated(table, i);
8898+ table = irq_lookup_table[alias];
8899+ if (table) {
8900+ set_remap_table_entry(iommu, devid, table);
8901+ goto out_wait;
8902 }
8903
8904- irq_lookup_table[devid] = table;
8905- set_dte_irq_entry(devid, table);
8906- iommu_flush_dte(iommu, devid);
8907- if (devid != alias) {
8908- irq_lookup_table[alias] = table;
8909- set_dte_irq_entry(alias, table);
8910- iommu_flush_dte(iommu, alias);
8911- }
8912+ table = new_table;
8913+ new_table = NULL;
8914
8915-out:
8916+ set_remap_table_entry(iommu, devid, table);
8917+ if (devid != alias)
8918+ set_remap_table_entry(iommu, alias, table);
8919+
8920+out_wait:
8921 iommu_completion_wait(iommu);
8922
8923 out_unlock:
8924- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8925+ spin_unlock_irqrestore(&iommu_table_lock, flags);
8926
8927+ if (new_table) {
8928+ kmem_cache_free(amd_iommu_irq_cache, new_table->table);
8929+ kfree(new_table);
8930+ }
8931 return table;
8932 }
8933
8934@@ -3675,11 +3699,11 @@
8935 if (!iommu)
8936 return -ENODEV;
8937
8938- table = get_irq_table(devid, false);
8939+ table = alloc_irq_table(devid);
8940 if (!table)
8941 return -ENODEV;
8942
8943- spin_lock_irqsave(&table->lock, flags);
8944+ raw_spin_lock_irqsave(&table->lock, flags);
8945
8946 /* Scan table for free entries */
8947 for (c = 0, index = table->min_index;
8948@@ -3702,7 +3726,7 @@
8949 index = -ENOSPC;
8950
8951 out:
8952- spin_unlock_irqrestore(&table->lock, flags);
8953+ raw_spin_unlock_irqrestore(&table->lock, flags);
8954
8955 return index;
8956 }
8957@@ -3719,11 +3743,11 @@
8958 if (iommu == NULL)
8959 return -EINVAL;
8960
8961- table = get_irq_table(devid, false);
8962+ table = get_irq_table(devid);
8963 if (!table)
8964 return -ENOMEM;
8965
8966- spin_lock_irqsave(&table->lock, flags);
8967+ raw_spin_lock_irqsave(&table->lock, flags);
8968
8969 entry = (struct irte_ga *)table->table;
8970 entry = &entry[index];
8971@@ -3734,7 +3758,7 @@
8972 if (data)
8973 data->ref = entry;
8974
8975- spin_unlock_irqrestore(&table->lock, flags);
8976+ raw_spin_unlock_irqrestore(&table->lock, flags);
8977
8978 iommu_flush_irt(iommu, devid);
8979 iommu_completion_wait(iommu);
8980@@ -3752,13 +3776,13 @@
8981 if (iommu == NULL)
8982 return -EINVAL;
8983
8984- table = get_irq_table(devid, false);
8985+ table = get_irq_table(devid);
8986 if (!table)
8987 return -ENOMEM;
8988
8989- spin_lock_irqsave(&table->lock, flags);
8990+ raw_spin_lock_irqsave(&table->lock, flags);
8991 table->table[index] = irte->val;
8992- spin_unlock_irqrestore(&table->lock, flags);
8993+ raw_spin_unlock_irqrestore(&table->lock, flags);
8994
8995 iommu_flush_irt(iommu, devid);
8996 iommu_completion_wait(iommu);
8997@@ -3776,13 +3800,13 @@
8998 if (iommu == NULL)
8999 return;
9000
9001- table = get_irq_table(devid, false);
9002+ table = get_irq_table(devid);
9003 if (!table)
9004 return;
9005
9006- spin_lock_irqsave(&table->lock, flags);
9007+ raw_spin_lock_irqsave(&table->lock, flags);
9008 iommu->irte_ops->clear_allocated(table, index);
9009- spin_unlock_irqrestore(&table->lock, flags);
9010+ raw_spin_unlock_irqrestore(&table->lock, flags);
9011
9012 iommu_flush_irt(iommu, devid);
9013 iommu_completion_wait(iommu);
9014@@ -3863,10 +3887,8 @@
9015 u8 vector, u32 dest_apicid)
9016 {
9017 struct irte_ga *irte = (struct irte_ga *) entry;
9018- struct iommu_dev_data *dev_data = search_dev_data(devid);
9019
9020- if (!dev_data || !dev_data->use_vapic ||
9021- !irte->lo.fields_remap.guest_mode) {
9022+ if (!irte->lo.fields_remap.guest_mode) {
9023 irte->hi.fields.vector = vector;
9024 irte->lo.fields_remap.destination = dest_apicid;
9025 modify_irte_ga(devid, index, irte, NULL);
9026@@ -4072,7 +4094,7 @@
9027 struct amd_ir_data *data = NULL;
9028 struct irq_cfg *cfg;
9029 int i, ret, devid;
9030- int index = -1;
9031+ int index;
9032
9033 if (!info)
9034 return -EINVAL;
9035@@ -4096,10 +4118,26 @@
9036 return ret;
9037
9038 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9039- if (get_irq_table(devid, true))
9040+ struct irq_remap_table *table;
9041+ struct amd_iommu *iommu;
9042+
9043+ table = alloc_irq_table(devid);
9044+ if (table) {
9045+ if (!table->min_index) {
9046+ /*
9047+ * Keep the first 32 indexes free for IOAPIC
9048+ * interrupts.
9049+ */
9050+ table->min_index = 32;
9051+ iommu = amd_iommu_rlookup_table[devid];
9052+ for (i = 0; i < 32; ++i)
9053+ iommu->irte_ops->set_allocated(table, i);
9054+ }
9055+ WARN_ON(table->min_index != 32);
9056 index = info->ioapic_pin;
9057- else
9058- ret = -ENOMEM;
9059+ } else {
9060+ index = -ENOMEM;
9061+ }
9062 } else {
9063 index = alloc_irq_index(devid, nr_irqs);
9064 }
9065@@ -4343,7 +4381,7 @@
9066 {
9067 unsigned long flags;
9068 struct amd_iommu *iommu;
9069- struct irq_remap_table *irt;
9070+ struct irq_remap_table *table;
9071 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9072 int devid = ir_data->irq_2_irte.devid;
9073 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9074@@ -4357,11 +4395,11 @@
9075 if (!iommu)
9076 return -ENODEV;
9077
9078- irt = get_irq_table(devid, false);
9079- if (!irt)
9080+ table = get_irq_table(devid);
9081+ if (!table)
9082 return -ENODEV;
9083
9084- spin_lock_irqsave(&irt->lock, flags);
9085+ raw_spin_lock_irqsave(&table->lock, flags);
9086
9087 if (ref->lo.fields_vapic.guest_mode) {
9088 if (cpu >= 0)
9089@@ -4370,7 +4408,7 @@
9090 barrier();
9091 }
9092
9093- spin_unlock_irqrestore(&irt->lock, flags);
9094+ raw_spin_unlock_irqrestore(&table->lock, flags);
9095
9096 iommu_flush_irt(iommu, devid);
9097 iommu_completion_wait(iommu);
9098diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_init.c linux-4.14/drivers/iommu/amd_iommu_init.c
9099--- linux-4.14.orig/drivers/iommu/amd_iommu_init.c 2017-11-12 19:46:13.000000000 +0100
9100+++ linux-4.14/drivers/iommu/amd_iommu_init.c 2018-09-05 11:05:07.000000000 +0200
9101@@ -1474,7 +1474,7 @@
9102 {
9103 int ret;
9104
9105- spin_lock_init(&iommu->lock);
9106+ raw_spin_lock_init(&iommu->lock);
9107
9108 /* Add IOMMU to internal data structures */
9109 list_add_tail(&iommu->list, &amd_iommu_list);
9110diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_types.h linux-4.14/drivers/iommu/amd_iommu_types.h
9111--- linux-4.14.orig/drivers/iommu/amd_iommu_types.h 2017-11-12 19:46:13.000000000 +0100
9112+++ linux-4.14/drivers/iommu/amd_iommu_types.h 2018-09-05 11:05:07.000000000 +0200
9113@@ -406,7 +406,7 @@
9114 #define IRQ_TABLE_ALIGNMENT 128
9115
9116 struct irq_remap_table {
9117- spinlock_t lock;
9118+ raw_spinlock_t lock;
9119 unsigned min_index;
9120 u32 *table;
9121 };
9122@@ -488,7 +488,7 @@
9123 int index;
9124
9125 /* locks the accesses to the hardware */
9126- spinlock_t lock;
9127+ raw_spinlock_t lock;
9128
9129 /* Pointer to PCI device of this IOMMU */
9130 struct pci_dev *dev;
9131@@ -625,7 +625,7 @@
9132 */
9133 struct iommu_dev_data {
9134 struct list_head list; /* For domain->dev_list */
9135- struct list_head dev_data_list; /* For global dev_data_list */
9136+ struct llist_node dev_data_list; /* For global dev_data_list */
9137 struct protection_domain *domain; /* Domain the device is bound to */
9138 u16 devid; /* PCI Device ID */
9139 u16 alias; /* Alias Device ID */
9140diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/iova.c linux-4.14/drivers/iommu/iova.c
9141--- linux-4.14.orig/drivers/iommu/iova.c 2017-11-12 19:46:13.000000000 +0100
9142+++ linux-4.14/drivers/iommu/iova.c 2018-09-05 11:05:07.000000000 +0200
9143@@ -570,7 +570,7 @@
9144 unsigned long pfn, unsigned long pages,
9145 unsigned long data)
9146 {
9147- struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9148+ struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9149 unsigned long flags;
9150 unsigned idx;
9151
9152@@ -600,8 +600,6 @@
9153 if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9154 mod_timer(&iovad->fq_timer,
9155 jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9156-
9157- put_cpu_ptr(iovad->fq);
9158 }
9159 EXPORT_SYMBOL_GPL(queue_iova);
9160
9161diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/leds/trigger/Kconfig linux-4.14/drivers/leds/trigger/Kconfig
9162--- linux-4.14.orig/drivers/leds/trigger/Kconfig 2017-11-12 19:46:13.000000000 +0100
9163+++ linux-4.14/drivers/leds/trigger/Kconfig 2018-09-05 11:05:07.000000000 +0200
9164@@ -69,7 +69,7 @@
9165
9166 config LEDS_TRIGGER_CPU
9167 bool "LED CPU Trigger"
9168- depends on LEDS_TRIGGERS
9169+ depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9170 help
9171 This allows LEDs to be controlled by active CPUs. This shows
9172 the active CPUs across an array of LEDs so you can see which
9173diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/bcache/Kconfig linux-4.14/drivers/md/bcache/Kconfig
9174--- linux-4.14.orig/drivers/md/bcache/Kconfig 2017-11-12 19:46:13.000000000 +0100
9175+++ linux-4.14/drivers/md/bcache/Kconfig 2018-09-05 11:05:07.000000000 +0200
9176@@ -1,6 +1,7 @@
9177
9178 config BCACHE
9179 tristate "Block device as cache"
9180+ depends on !PREEMPT_RT_FULL
9181 ---help---
9182 Allows a block device to be used as cache for other devices; uses
9183 a btree for indexing and the layout is optimized for SSDs.
9184diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/dm-rq.c linux-4.14/drivers/md/dm-rq.c
9185--- linux-4.14.orig/drivers/md/dm-rq.c 2017-11-12 19:46:13.000000000 +0100
9186+++ linux-4.14/drivers/md/dm-rq.c 2018-09-05 11:05:07.000000000 +0200
9187@@ -671,7 +671,7 @@
9188 /* Establish tio->ti before queuing work (map_tio_request) */
9189 tio->ti = ti;
9190 kthread_queue_work(&md->kworker, &tio->work);
9191- BUG_ON(!irqs_disabled());
9192+ BUG_ON_NONRT(!irqs_disabled());
9193 }
9194 }
9195
9196diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.c linux-4.14/drivers/md/raid5.c
9197--- linux-4.14.orig/drivers/md/raid5.c 2018-09-05 11:03:22.000000000 +0200
9198+++ linux-4.14/drivers/md/raid5.c 2018-09-05 11:05:07.000000000 +0200
9199@@ -410,7 +410,7 @@
9200 md_wakeup_thread(conf->mddev->thread);
9201 return;
9202 slow_path:
9203- local_irq_save(flags);
9204+ local_irq_save_nort(flags);
9205 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9206 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9207 INIT_LIST_HEAD(&list);
9208@@ -419,7 +419,7 @@
9209 spin_unlock(&conf->device_lock);
9210 release_inactive_stripe_list(conf, &list, hash);
9211 }
9212- local_irq_restore(flags);
9213+ local_irq_restore_nort(flags);
9214 }
9215
9216 static inline void remove_hash(struct stripe_head *sh)
9217@@ -2067,8 +2067,9 @@
9218 struct raid5_percpu *percpu;
9219 unsigned long cpu;
9220
9221- cpu = get_cpu();
9222+ cpu = get_cpu_light();
9223 percpu = per_cpu_ptr(conf->percpu, cpu);
9224+ spin_lock(&percpu->lock);
9225 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9226 ops_run_biofill(sh);
9227 overlap_clear++;
9228@@ -2127,7 +2128,8 @@
9229 if (test_and_clear_bit(R5_Overlap, &dev->flags))
9230 wake_up(&sh->raid_conf->wait_for_overlap);
9231 }
9232- put_cpu();
9233+ spin_unlock(&percpu->lock);
9234+ put_cpu_light();
9235 }
9236
9237 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9238@@ -6775,6 +6777,7 @@
9239 __func__, cpu);
9240 return -ENOMEM;
9241 }
9242+ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9243 return 0;
9244 }
9245
9246@@ -6785,7 +6788,6 @@
9247 conf->percpu = alloc_percpu(struct raid5_percpu);
9248 if (!conf->percpu)
9249 return -ENOMEM;
9250-
9251 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9252 if (!err) {
9253 conf->scribble_disks = max(conf->raid_disks,
9254diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.h linux-4.14/drivers/md/raid5.h
9255--- linux-4.14.orig/drivers/md/raid5.h 2017-11-12 19:46:13.000000000 +0100
9256+++ linux-4.14/drivers/md/raid5.h 2018-09-05 11:05:07.000000000 +0200
9257@@ -624,6 +624,7 @@
9258 int recovery_disabled;
9259 /* per cpu variables */
9260 struct raid5_percpu {
9261+ spinlock_t lock; /* Protection for -RT */
9262 struct page *spare_page; /* Used when checking P/Q in raid6 */
9263 struct flex_array *scribble; /* space for constructing buffer
9264 * lists and performing address
9265diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mfd/atmel-smc.c linux-4.14/drivers/mfd/atmel-smc.c
9266--- linux-4.14.orig/drivers/mfd/atmel-smc.c 2017-11-12 19:46:13.000000000 +0100
9267+++ linux-4.14/drivers/mfd/atmel-smc.c 2018-09-05 11:05:07.000000000 +0200
9268@@ -12,6 +12,7 @@
9269 */
9270
9271 #include <linux/mfd/syscon/atmel-smc.h>
9272+#include <linux/string.h>
9273
9274 /**
9275 * atmel_smc_cs_conf_init - initialize a SMC CS conf
9276diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/misc/Kconfig linux-4.14/drivers/misc/Kconfig
9277--- linux-4.14.orig/drivers/misc/Kconfig 2017-11-12 19:46:13.000000000 +0100
9278+++ linux-4.14/drivers/misc/Kconfig 2018-09-05 11:05:07.000000000 +0200
9279@@ -54,6 +54,7 @@
9280 config ATMEL_TCLIB
9281 bool "Atmel AT32/AT91 Timer/Counter Library"
9282 depends on (AVR32 || ARCH_AT91)
9283+ default y if PREEMPT_RT_FULL
9284 help
9285 Select this if you want a library to allocate the Timer/Counter
9286 blocks found on many Atmel processors. This facilitates using
9287@@ -69,8 +70,7 @@
9288 are combined to make a single 32-bit timer.
9289
9290 When GENERIC_CLOCKEVENTS is defined, the third timer channel
9291- may be used as a clock event device supporting oneshot mode
9292- (delays of up to two seconds) based on the 32 KiHz clock.
9293+ may be used as a clock event device supporting oneshot mode.
9294
9295 config ATMEL_TCB_CLKSRC_BLOCK
9296 int
9297@@ -84,6 +84,15 @@
9298 TC can be used for other purposes, such as PWM generation and
9299 interval timing.
9300
9301+config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9302+ bool "TC Block use 32 KiHz clock"
9303+ depends on ATMEL_TCB_CLKSRC
9304+ default y if !PREEMPT_RT_FULL
9305+ help
9306+ Select this to use 32 KiHz base clock rate as TC block clock
9307+ source for clock events.
9308+
9309+
9310 config DUMMY_IRQ
9311 tristate "Dummy IRQ handler"
9312 default n
9313diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mmc/host/mmci.c linux-4.14/drivers/mmc/host/mmci.c
9314--- linux-4.14.orig/drivers/mmc/host/mmci.c 2017-11-12 19:46:13.000000000 +0100
9315+++ linux-4.14/drivers/mmc/host/mmci.c 2018-09-05 11:05:07.000000000 +0200
9316@@ -1200,15 +1200,12 @@
9317 struct sg_mapping_iter *sg_miter = &host->sg_miter;
9318 struct variant_data *variant = host->variant;
9319 void __iomem *base = host->base;
9320- unsigned long flags;
9321 u32 status;
9322
9323 status = readl(base + MMCISTATUS);
9324
9325 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9326
9327- local_irq_save(flags);
9328-
9329 do {
9330 unsigned int remain, len;
9331 char *buffer;
9332@@ -1248,8 +1245,6 @@
9333
9334 sg_miter_stop(sg_miter);
9335
9336- local_irq_restore(flags);
9337-
9338 /*
9339 * If we have less than the fifo 'half-full' threshold to transfer,
9340 * trigger a PIO interrupt as soon as any data is available.
9341diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c linux-4.14/drivers/net/ethernet/3com/3c59x.c
9342--- linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c 2017-11-12 19:46:13.000000000 +0100
9343+++ linux-4.14/drivers/net/ethernet/3com/3c59x.c 2018-09-05 11:05:07.000000000 +0200
9344@@ -842,9 +842,9 @@
9345 {
9346 struct vortex_private *vp = netdev_priv(dev);
9347 unsigned long flags;
9348- local_irq_save(flags);
9349+ local_irq_save_nort(flags);
9350 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9351- local_irq_restore(flags);
9352+ local_irq_restore_nort(flags);
9353 }
9354 #endif
9355
9356@@ -1908,12 +1908,12 @@
9357 * Block interrupts because vortex_interrupt does a bare spin_lock()
9358 */
9359 unsigned long flags;
9360- local_irq_save(flags);
9361+ local_irq_save_nort(flags);
9362 if (vp->full_bus_master_tx)
9363 boomerang_interrupt(dev->irq, dev);
9364 else
9365 vortex_interrupt(dev->irq, dev);
9366- local_irq_restore(flags);
9367+ local_irq_restore_nort(flags);
9368 }
9369 }
9370
9371diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c linux-4.14/drivers/net/ethernet/marvell/mvpp2.c
9372--- linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c 2018-09-05 11:03:22.000000000 +0200
9373+++ linux-4.14/drivers/net/ethernet/marvell/mvpp2.c 2018-09-05 11:05:07.000000000 +0200
9374@@ -830,9 +830,8 @@
9375 /* Per-CPU port control */
9376 struct mvpp2_port_pcpu {
9377 struct hrtimer tx_done_timer;
9378+ struct net_device *dev;
9379 bool timer_scheduled;
9380- /* Tasklet for egress finalization */
9381- struct tasklet_struct tx_done_tasklet;
9382 };
9383
9384 struct mvpp2_queue_vector {
9385@@ -5954,46 +5953,34 @@
9386 }
9387 }
9388
9389-static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9390-{
9391- ktime_t interval;
9392-
9393- if (!port_pcpu->timer_scheduled) {
9394- port_pcpu->timer_scheduled = true;
9395- interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9396- hrtimer_start(&port_pcpu->tx_done_timer, interval,
9397- HRTIMER_MODE_REL_PINNED);
9398- }
9399-}
9400-
9401-static void mvpp2_tx_proc_cb(unsigned long data)
9402+static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9403 {
9404- struct net_device *dev = (struct net_device *)data;
9405- struct mvpp2_port *port = netdev_priv(dev);
9406- struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9407+ struct net_device *dev;
9408+ struct mvpp2_port *port;
9409+ struct mvpp2_port_pcpu *port_pcpu;
9410 unsigned int tx_todo, cause;
9411
9412+ port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9413+ dev = port_pcpu->dev;
9414+
9415 if (!netif_running(dev))
9416- return;
9417+ return HRTIMER_NORESTART;
9418+
9419 port_pcpu->timer_scheduled = false;
9420+ port = netdev_priv(dev);
9421
9422 /* Process all the Tx queues */
9423 cause = (1 << port->ntxqs) - 1;
9424 tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9425
9426 /* Set the timer in case not all the packets were processed */
9427- if (tx_todo)
9428- mvpp2_timer_set(port_pcpu);
9429-}
9430-
9431-static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9432-{
9433- struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9434- struct mvpp2_port_pcpu,
9435- tx_done_timer);
9436-
9437- tasklet_schedule(&port_pcpu->tx_done_tasklet);
9438+ if (tx_todo && !port_pcpu->timer_scheduled) {
9439+ port_pcpu->timer_scheduled = true;
9440+ hrtimer_forward_now(&port_pcpu->tx_done_timer,
9441+ MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9442
9443+ return HRTIMER_RESTART;
9444+ }
9445 return HRTIMER_NORESTART;
9446 }
9447
9448@@ -6482,7 +6469,12 @@
9449 txq_pcpu->count > 0) {
9450 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9451
9452- mvpp2_timer_set(port_pcpu);
9453+ if (!port_pcpu->timer_scheduled) {
9454+ port_pcpu->timer_scheduled = true;
9455+ hrtimer_start(&port_pcpu->tx_done_timer,
9456+ MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9457+ HRTIMER_MODE_REL_PINNED_SOFT);
9458+ }
9459 }
9460
9461 return NETDEV_TX_OK;
9462@@ -6871,7 +6863,6 @@
9463
9464 hrtimer_cancel(&port_pcpu->tx_done_timer);
9465 port_pcpu->timer_scheduled = false;
9466- tasklet_kill(&port_pcpu->tx_done_tasklet);
9467 }
9468 }
9469 mvpp2_cleanup_rxqs(port);
9470@@ -7644,13 +7635,10 @@
9471 port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9472
9473 hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9474- HRTIMER_MODE_REL_PINNED);
9475+ HRTIMER_MODE_REL_PINNED_SOFT);
9476 port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9477 port_pcpu->timer_scheduled = false;
9478-
9479- tasklet_init(&port_pcpu->tx_done_tasklet,
9480- mvpp2_tx_proc_cb,
9481- (unsigned long)dev);
9482+ port_pcpu->dev = dev;
9483 }
9484 }
9485
9486diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9487--- linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-11-12 19:46:13.000000000 +0100
9488+++ linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2018-09-05 11:05:07.000000000 +0200
9489@@ -697,7 +697,7 @@
9490 while (!ctx->done.done && msecs--)
9491 udelay(1000);
9492 } else {
9493- wait_event_interruptible(ctx->done.wait,
9494+ swait_event_interruptible(ctx->done.wait,
9495 ctx->done.done);
9496 }
9497 break;
9498diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c linux-4.14/drivers/net/wireless/mac80211_hwsim.c
9499--- linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c 2018-09-05 11:03:22.000000000 +0200
9500+++ linux-4.14/drivers/net/wireless/mac80211_hwsim.c 2018-09-05 11:05:07.000000000 +0200
9501@@ -537,7 +537,7 @@
9502 unsigned int rx_filter;
9503 bool started, idle, scanning;
9504 struct mutex mutex;
9505- struct tasklet_hrtimer beacon_timer;
9506+ struct hrtimer beacon_timer;
9507 enum ps_mode {
9508 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9509 } ps;
9510@@ -1423,7 +1423,7 @@
9511 {
9512 struct mac80211_hwsim_data *data = hw->priv;
9513 data->started = false;
9514- tasklet_hrtimer_cancel(&data->beacon_timer);
9515+ hrtimer_cancel(&data->beacon_timer);
9516 wiphy_debug(hw->wiphy, "%s\n", __func__);
9517 }
9518
9519@@ -1546,14 +1546,12 @@
9520 mac80211_hwsim_beacon(struct hrtimer *timer)
9521 {
9522 struct mac80211_hwsim_data *data =
9523- container_of(timer, struct mac80211_hwsim_data,
9524- beacon_timer.timer);
9525+ container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9526 struct ieee80211_hw *hw = data->hw;
9527 u64 bcn_int = data->beacon_int;
9528- ktime_t next_bcn;
9529
9530 if (!data->started)
9531- goto out;
9532+ return HRTIMER_NORESTART;
9533
9534 ieee80211_iterate_active_interfaces_atomic(
9535 hw, IEEE80211_IFACE_ITER_NORMAL,
9536@@ -1565,11 +1563,9 @@
9537 data->bcn_delta = 0;
9538 }
9539
9540- next_bcn = ktime_add(hrtimer_get_expires(timer),
9541- ns_to_ktime(bcn_int * 1000));
9542- tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9543-out:
9544- return HRTIMER_NORESTART;
9545+ hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9546+ ns_to_ktime(bcn_int * NSEC_PER_USEC));
9547+ return HRTIMER_RESTART;
9548 }
9549
9550 static const char * const hwsim_chanwidths[] = {
9551@@ -1643,15 +1639,15 @@
9552 mutex_unlock(&data->mutex);
9553
9554 if (!data->started || !data->beacon_int)
9555- tasklet_hrtimer_cancel(&data->beacon_timer);
9556- else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9557+ hrtimer_cancel(&data->beacon_timer);
9558+ else if (!hrtimer_is_queued(&data->beacon_timer)) {
9559 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9560 u32 bcn_int = data->beacon_int;
9561 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9562
9563- tasklet_hrtimer_start(&data->beacon_timer,
9564- ns_to_ktime(until_tbtt * 1000),
9565- HRTIMER_MODE_REL);
9566+ hrtimer_start(&data->beacon_timer,
9567+ ns_to_ktime(until_tbtt * 1000),
9568+ HRTIMER_MODE_REL_SOFT);
9569 }
9570
9571 return 0;
9572@@ -1714,7 +1710,7 @@
9573 info->enable_beacon, info->beacon_int);
9574 vp->bcn_en = info->enable_beacon;
9575 if (data->started &&
9576- !hrtimer_is_queued(&data->beacon_timer.timer) &&
9577+ !hrtimer_is_queued(&data->beacon_timer) &&
9578 info->enable_beacon) {
9579 u64 tsf, until_tbtt;
9580 u32 bcn_int;
9581@@ -1722,9 +1718,9 @@
9582 tsf = mac80211_hwsim_get_tsf(hw, vif);
9583 bcn_int = data->beacon_int;
9584 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9585- tasklet_hrtimer_start(&data->beacon_timer,
9586- ns_to_ktime(until_tbtt * 1000),
9587- HRTIMER_MODE_REL);
9588+ hrtimer_start(&data->beacon_timer,
9589+ ns_to_ktime(until_tbtt * 1000),
9590+ HRTIMER_MODE_REL_SOFT);
9591 } else if (!info->enable_beacon) {
9592 unsigned int count = 0;
9593 ieee80211_iterate_active_interfaces_atomic(
9594@@ -1733,7 +1729,7 @@
9595 wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u",
9596 count);
9597 if (count == 0) {
9598- tasklet_hrtimer_cancel(&data->beacon_timer);
9599+ hrtimer_cancel(&data->beacon_timer);
9600 data->beacon_int = 0;
9601 }
9602 }
9603@@ -2725,9 +2721,9 @@
9604 data->debugfs,
9605 data, &hwsim_simulate_radar);
9606
9607- tasklet_hrtimer_init(&data->beacon_timer,
9608- mac80211_hwsim_beacon,
9609- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9610+ hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9611+ HRTIMER_MODE_ABS_SOFT);
9612+ data->beacon_timer.function = mac80211_hwsim_beacon;
9613
9614 spin_lock_bh(&hwsim_radio_lock);
9615 list_add_tail(&data->list, &hwsim_radios);
9616diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/pci/switch/switchtec.c linux-4.14/drivers/pci/switch/switchtec.c
9617--- linux-4.14.orig/drivers/pci/switch/switchtec.c 2017-11-12 19:46:13.000000000 +0100
9618+++ linux-4.14/drivers/pci/switch/switchtec.c 2018-09-05 11:05:07.000000000 +0200
9619@@ -306,10 +306,11 @@
9620
9621 enum mrpc_state state;
9622
9623- struct completion comp;
9624+ wait_queue_head_t cmd_comp;
9625 struct kref kref;
9626 struct list_head list;
9627
9628+ bool cmd_done;
9629 u32 cmd;
9630 u32 status;
9631 u32 return_code;
9632@@ -331,7 +332,7 @@
9633 stuser->stdev = stdev;
9634 kref_init(&stuser->kref);
9635 INIT_LIST_HEAD(&stuser->list);
9636- init_completion(&stuser->comp);
9637+ init_waitqueue_head(&stuser->cmd_comp);
9638 stuser->event_cnt = atomic_read(&stdev->event_cnt);
9639
9640 dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9641@@ -414,7 +415,7 @@
9642 kref_get(&stuser->kref);
9643 stuser->read_len = sizeof(stuser->data);
9644 stuser_set_state(stuser, MRPC_QUEUED);
9645- init_completion(&stuser->comp);
9646+ stuser->cmd_done = false;
9647 list_add_tail(&stuser->list, &stdev->mrpc_queue);
9648
9649 mrpc_cmd_submit(stdev);
9650@@ -451,7 +452,8 @@
9651 stuser->read_len);
9652
9653 out:
9654- complete_all(&stuser->comp);
9655+ stuser->cmd_done = true;
9656+ wake_up_interruptible(&stuser->cmd_comp);
9657 list_del_init(&stuser->list);
9658 stuser_put(stuser);
9659 stdev->mrpc_busy = 0;
9660@@ -721,10 +723,11 @@
9661 mutex_unlock(&stdev->mrpc_mutex);
9662
9663 if (filp->f_flags & O_NONBLOCK) {
9664- if (!try_wait_for_completion(&stuser->comp))
9665+ if (!READ_ONCE(stuser->cmd_done))
9666 return -EAGAIN;
9667 } else {
9668- rc = wait_for_completion_interruptible(&stuser->comp);
9669+ rc = wait_event_interruptible(stuser->cmd_comp,
9670+ stuser->cmd_done);
9671 if (rc < 0)
9672 return rc;
9673 }
9674@@ -772,7 +775,7 @@
9675 struct switchtec_dev *stdev = stuser->stdev;
9676 int ret = 0;
9677
9678- poll_wait(filp, &stuser->comp.wait, wait);
9679+ poll_wait(filp, &stuser->cmd_comp, wait);
9680 poll_wait(filp, &stdev->event_wq, wait);
9681
9682 if (lock_mutex_and_test_alive(stdev))
9683@@ -780,7 +783,7 @@
9684
9685 mutex_unlock(&stdev->mrpc_mutex);
9686
9687- if (try_wait_for_completion(&stuser->comp))
9688+ if (READ_ONCE(stuser->cmd_done))
9689 ret |= POLLIN | POLLRDNORM;
9690
9691 if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
9692@@ -1255,7 +1258,8 @@
9693
9694 /* Wake up and kill any users waiting on an MRPC request */
9695 list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
9696- complete_all(&stuser->comp);
9697+ stuser->cmd_done = true;
9698+ wake_up_interruptible(&stuser->cmd_comp);
9699 list_del_init(&stuser->list);
9700 stuser_put(stuser);
9701 }
9702diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe.c linux-4.14/drivers/scsi/fcoe/fcoe.c
9703--- linux-4.14.orig/drivers/scsi/fcoe/fcoe.c 2017-11-12 19:46:13.000000000 +0100
9704+++ linux-4.14/drivers/scsi/fcoe/fcoe.c 2018-09-05 11:05:07.000000000 +0200
9705@@ -1464,11 +1464,11 @@
9706 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
9707 {
9708 struct fcoe_percpu_s *fps;
9709- int rc;
9710+ int rc, cpu = get_cpu_light();
9711
9712- fps = &get_cpu_var(fcoe_percpu);
9713+ fps = &per_cpu(fcoe_percpu, cpu);
9714 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
9715- put_cpu_var(fcoe_percpu);
9716+ put_cpu_light();
9717
9718 return rc;
9719 }
9720@@ -1655,11 +1655,11 @@
9721 return 0;
9722 }
9723
9724- stats = per_cpu_ptr(lport->stats, get_cpu());
9725+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
9726 stats->InvalidCRCCount++;
9727 if (stats->InvalidCRCCount < 5)
9728 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
9729- put_cpu();
9730+ put_cpu_light();
9731 return -EINVAL;
9732 }
9733
9734@@ -1702,7 +1702,7 @@
9735 */
9736 hp = (struct fcoe_hdr *) skb_network_header(skb);
9737
9738- stats = per_cpu_ptr(lport->stats, get_cpu());
9739+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
9740 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
9741 if (stats->ErrorFrames < 5)
9742 printk(KERN_WARNING "fcoe: FCoE version "
9743@@ -1734,13 +1734,13 @@
9744 goto drop;
9745
9746 if (!fcoe_filter_frames(lport, fp)) {
9747- put_cpu();
9748+ put_cpu_light();
9749 fc_exch_recv(lport, fp);
9750 return;
9751 }
9752 drop:
9753 stats->ErrorFrames++;
9754- put_cpu();
9755+ put_cpu_light();
9756 kfree_skb(skb);
9757 }
9758
9759diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c
9760--- linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2017-11-12 19:46:13.000000000 +0100
9761+++ linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c 2018-09-05 11:05:07.000000000 +0200
9762@@ -835,7 +835,7 @@
9763
9764 INIT_LIST_HEAD(&del_list);
9765
9766- stats = per_cpu_ptr(fip->lp->stats, get_cpu());
9767+ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
9768
9769 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
9770 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
9771@@ -871,7 +871,7 @@
9772 sel_time = fcf->time;
9773 }
9774 }
9775- put_cpu();
9776+ put_cpu_light();
9777
9778 list_for_each_entry_safe(fcf, next, &del_list, list) {
9779 /* Removes fcf from current list */
9780diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libfc/fc_exch.c linux-4.14/drivers/scsi/libfc/fc_exch.c
9781--- linux-4.14.orig/drivers/scsi/libfc/fc_exch.c 2017-11-12 19:46:13.000000000 +0100
9782+++ linux-4.14/drivers/scsi/libfc/fc_exch.c 2018-09-05 11:05:07.000000000 +0200
9783@@ -833,10 +833,10 @@
9784 }
9785 memset(ep, 0, sizeof(*ep));
9786
9787- cpu = get_cpu();
9788+ cpu = get_cpu_light();
9789 pool = per_cpu_ptr(mp->pool, cpu);
9790 spin_lock_bh(&pool->lock);
9791- put_cpu();
9792+ put_cpu_light();
9793
9794 /* peek cache of free slot */
9795 if (pool->left != FC_XID_UNKNOWN) {
9796diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libsas/sas_ata.c linux-4.14/drivers/scsi/libsas/sas_ata.c
9797--- linux-4.14.orig/drivers/scsi/libsas/sas_ata.c 2017-11-12 19:46:13.000000000 +0100
9798+++ linux-4.14/drivers/scsi/libsas/sas_ata.c 2018-09-05 11:05:07.000000000 +0200
9799@@ -190,7 +190,7 @@
9800 /* TODO: audit callers to ensure they are ready for qc_issue to
9801 * unconditionally re-enable interrupts
9802 */
9803- local_irq_save(flags);
9804+ local_irq_save_nort(flags);
9805 spin_unlock(ap->lock);
9806
9807 /* If the device fell off, no sense in issuing commands */
9808@@ -252,7 +252,7 @@
9809
9810 out:
9811 spin_lock(ap->lock);
9812- local_irq_restore(flags);
9813+ local_irq_restore_nort(flags);
9814 return ret;
9815 }
9816
9817diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h linux-4.14/drivers/scsi/qla2xxx/qla_inline.h
9818--- linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h 2018-09-05 11:03:22.000000000 +0200
9819+++ linux-4.14/drivers/scsi/qla2xxx/qla_inline.h 2018-09-05 11:05:07.000000000 +0200
9820@@ -59,12 +59,12 @@
9821 {
9822 unsigned long flags;
9823 struct qla_hw_data *ha = rsp->hw;
9824- local_irq_save(flags);
9825+ local_irq_save_nort(flags);
9826 if (IS_P3P_TYPE(ha))
9827 qla82xx_poll(0, rsp);
9828 else
9829 ha->isp_ops->intr_handler(0, rsp);
9830- local_irq_restore(flags);
9831+ local_irq_restore_nort(flags);
9832 }
9833
9834 static inline uint8_t *
9835diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/staging/greybus/audio_manager.c linux-4.14/drivers/staging/greybus/audio_manager.c
9836--- linux-4.14.orig/drivers/staging/greybus/audio_manager.c 2017-11-12 19:46:13.000000000 +0100
9837+++ linux-4.14/drivers/staging/greybus/audio_manager.c 2018-09-05 11:05:07.000000000 +0200
9838@@ -10,7 +10,7 @@
9839 #include <linux/sysfs.h>
9840 #include <linux/module.h>
9841 #include <linux/init.h>
9842-#include <linux/rwlock.h>
9843+#include <linux/spinlock.h>
9844 #include <linux/idr.h>
9845
9846 #include "audio_manager.h"
9847diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_tmr.c linux-4.14/drivers/target/target_core_tmr.c
9848--- linux-4.14.orig/drivers/target/target_core_tmr.c 2018-09-05 11:03:22.000000000 +0200
9849+++ linux-4.14/drivers/target/target_core_tmr.c 2018-09-05 11:05:07.000000000 +0200
9850@@ -114,8 +114,6 @@
9851 {
9852 struct se_session *sess = se_cmd->se_sess;
9853
9854- assert_spin_locked(&sess->sess_cmd_lock);
9855- WARN_ON_ONCE(!irqs_disabled());
9856 /*
9857 * If command already reached CMD_T_COMPLETE state within
9858 * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
9859diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_transport.c linux-4.14/drivers/target/target_core_transport.c
9860--- linux-4.14.orig/drivers/target/target_core_transport.c 2018-09-05 11:03:22.000000000 +0200
9861+++ linux-4.14/drivers/target/target_core_transport.c 2018-09-05 11:05:07.000000000 +0200
9862@@ -2966,9 +2966,6 @@
9863 __acquires(&cmd->t_state_lock)
9864 {
9865
9866- assert_spin_locked(&cmd->t_state_lock);
9867- WARN_ON_ONCE(!irqs_disabled());
9868-
9869 if (fabric_stop)
9870 cmd->transport_state |= CMD_T_FABRIC_STOP;
9871
9872@@ -3238,9 +3235,6 @@
9873 {
9874 int ret;
9875
9876- assert_spin_locked(&cmd->t_state_lock);
9877- WARN_ON_ONCE(!irqs_disabled());
9878-
9879 if (!(cmd->transport_state & CMD_T_ABORTED))
9880 return 0;
9881 /*
9882diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c
9883--- linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c 2017-11-12 19:46:13.000000000 +0100
9884+++ linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c 2018-09-05 11:05:07.000000000 +0200
9885@@ -29,6 +29,7 @@
9886 #include <linux/pm.h>
9887 #include <linux/thermal.h>
9888 #include <linux/debugfs.h>
9889+#include <linux/swork.h>
9890 #include <asm/cpu_device_id.h>
9891 #include <asm/mce.h>
9892
9893@@ -329,7 +330,7 @@
9894 schedule_delayed_work_on(cpu, work, ms);
9895 }
9896
9897-static int pkg_thermal_notify(u64 msr_val)
9898+static void pkg_thermal_notify_work(struct swork_event *event)
9899 {
9900 int cpu = smp_processor_id();
9901 struct pkg_device *pkgdev;
9902@@ -348,8 +349,46 @@
9903 }
9904
9905 spin_unlock_irqrestore(&pkg_temp_lock, flags);
9906+}
9907+
9908+#ifdef CONFIG_PREEMPT_RT_FULL
9909+static struct swork_event notify_work;
9910+
9911+static int pkg_thermal_notify_work_init(void)
9912+{
9913+ int err;
9914+
9915+ err = swork_get();
9916+ if (err)
9917+ return err;
9918+
9919+ INIT_SWORK(&notify_work, pkg_thermal_notify_work);
9920+ return 0;
9921+}
9922+
9923+static void pkg_thermal_notify_work_cleanup(void)
9924+{
9925+ swork_put();
9926+}
9927+
9928+static int pkg_thermal_notify(u64 msr_val)
9929+{
9930+ swork_queue(&notify_work);
9931+ return 0;
9932+}
9933+
9934+#else /* !CONFIG_PREEMPT_RT_FULL */
9935+
9936+static int pkg_thermal_notify_work_init(void) { return 0; }
9937+
9938+static void pkg_thermal_notify_work_cleanup(void) { }
9939+
9940+static int pkg_thermal_notify(u64 msr_val)
9941+{
9942+ pkg_thermal_notify_work(NULL);
9943 return 0;
9944 }
9945+#endif /* CONFIG_PREEMPT_RT_FULL */
9946
9947 static int pkg_temp_thermal_device_add(unsigned int cpu)
9948 {
9949@@ -515,10 +554,15 @@
9950 if (!x86_match_cpu(pkg_temp_thermal_ids))
9951 return -ENODEV;
9952
9953+ if (pkg_thermal_notify_work_init())
9954+ return -ENODEV;
9955+
9956 max_packages = topology_max_packages();
9957 packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
9958- if (!packages)
9959- return -ENOMEM;
9960+ if (!packages) {
9961+ ret = -ENOMEM;
9962+ goto err;
9963+ }
9964
9965 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
9966 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
9967@@ -536,6 +580,7 @@
9968 return 0;
9969
9970 err:
9971+ pkg_thermal_notify_work_cleanup();
9972 kfree(packages);
9973 return ret;
9974 }
9975@@ -549,6 +594,7 @@
9976 cpuhp_remove_state(pkg_thermal_hp_state);
9977 debugfs_remove_recursive(debugfs);
9978 kfree(packages);
9979+ pkg_thermal_notify_work_cleanup();
9980 }
9981 module_exit(pkg_temp_thermal_exit)
9982
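The hunks above defer the package-thermal MSR notification from hard interrupt context to the swork kernel thread that this patch set introduces; the !PREEMPT_RT_FULL branch keeps the original direct call. A minimal, illustrative consumer of that swork pattern is sketched below. It is not part of the patch; it assumes only the primitives visible in the hunks (swork_get(), INIT_SWORK(), swork_queue(), swork_put(), available via <linux/swork.h> once this patch is applied), and the "demo_*" names are made up.

/* Illustrative sketch only -- not part of kernel-rt.patch. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/swork.h>	/* provided by this patch set on -rt */

static struct swork_event demo_event;

/* Runs in the preemptible swork thread, so it may take sleeping locks. */
static void demo_handler(struct swork_event *event)
{
	pr_info("demo: deferred work running in thread context\n");
}

static int __init demo_init(void)
{
	int err;

	err = swork_get();		/* take a reference on the swork thread */
	if (err)
		return err;

	INIT_SWORK(&demo_event, demo_handler);
	swork_queue(&demo_event);	/* legal from contexts that must not sleep */
	return 0;
}

static void __exit demo_exit(void)
{
	swork_put();			/* drop the swork thread reference */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");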
9983diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_core.c linux-4.14/drivers/tty/serial/8250/8250_core.c
9984--- linux-4.14.orig/drivers/tty/serial/8250/8250_core.c 2017-11-12 19:46:13.000000000 +0100
9985+++ linux-4.14/drivers/tty/serial/8250/8250_core.c 2018-09-05 11:05:07.000000000 +0200
9986@@ -58,7 +58,16 @@
9987
9988 static unsigned int skip_txen_test; /* force skip of txen test at init time */
9989
9990-#define PASS_LIMIT 512
9991+/*
9992+ * On -rt we can have more delays, and legitimately
9993+ * so - so don't drop work spuriously and spam the
9994+ * syslog:
9995+ */
9996+#ifdef CONFIG_PREEMPT_RT_FULL
9997+# define PASS_LIMIT 1000000
9998+#else
9999+# define PASS_LIMIT 512
10000+#endif
10001
10002 #include <asm/serial.h>
10003 /*
10004diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_port.c linux-4.14/drivers/tty/serial/8250/8250_port.c
10005--- linux-4.14.orig/drivers/tty/serial/8250/8250_port.c 2018-09-05 11:03:22.000000000 +0200
10006+++ linux-4.14/drivers/tty/serial/8250/8250_port.c 2018-09-05 11:05:07.000000000 +0200
10007@@ -35,6 +35,7 @@
10008 #include <linux/nmi.h>
10009 #include <linux/mutex.h>
10010 #include <linux/slab.h>
10011+#include <linux/kdb.h>
10012 #include <linux/uaccess.h>
10013 #include <linux/pm_runtime.h>
10014 #include <linux/ktime.h>
10015@@ -3224,9 +3225,9 @@
10016
10017 serial8250_rpm_get(up);
10018
10019- if (port->sysrq)
10020+ if (port->sysrq || oops_in_progress)
10021 locked = 0;
10022- else if (oops_in_progress)
10023+ else if (in_kdb_printk())
10024 locked = spin_trylock_irqsave(&port->lock, flags);
10025 else
10026 spin_lock_irqsave(&port->lock, flags);
10027diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/amba-pl011.c linux-4.14/drivers/tty/serial/amba-pl011.c
10028--- linux-4.14.orig/drivers/tty/serial/amba-pl011.c 2018-09-05 11:03:22.000000000 +0200
10029+++ linux-4.14/drivers/tty/serial/amba-pl011.c 2018-09-05 11:05:07.000000000 +0200
10030@@ -2236,13 +2236,19 @@
10031
10032 clk_enable(uap->clk);
10033
10034- local_irq_save(flags);
10035+ /*
10036+ * local_irq_save(flags);
10037+ *
10038+ * This local_irq_save() is nonsense. If we come in via sysrq
10039+ * handling then interrupts are already disabled. Aside from
10040+ * that the port.sysrq check is racy on SMP regardless.
10041+ */
10042 if (uap->port.sysrq)
10043 locked = 0;
10044 else if (oops_in_progress)
10045- locked = spin_trylock(&uap->port.lock);
10046+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
10047 else
10048- spin_lock(&uap->port.lock);
10049+ spin_lock_irqsave(&uap->port.lock, flags);
10050
10051 /*
10052 * First save the CR then disable the interrupts
10053@@ -2268,8 +2274,7 @@
10054 pl011_write(old_cr, uap, REG_CR);
10055
10056 if (locked)
10057- spin_unlock(&uap->port.lock);
10058- local_irq_restore(flags);
10059+ spin_unlock_irqrestore(&uap->port.lock, flags);
10060
10061 clk_disable(uap->clk);
10062 }
10063diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/omap-serial.c linux-4.14/drivers/tty/serial/omap-serial.c
10064--- linux-4.14.orig/drivers/tty/serial/omap-serial.c 2018-09-05 11:03:22.000000000 +0200
10065+++ linux-4.14/drivers/tty/serial/omap-serial.c 2018-09-05 11:05:07.000000000 +0200
10066@@ -1311,13 +1311,10 @@
10067
10068 pm_runtime_get_sync(up->dev);
10069
10070- local_irq_save(flags);
10071- if (up->port.sysrq)
10072- locked = 0;
10073- else if (oops_in_progress)
10074- locked = spin_trylock(&up->port.lock);
10075+ if (up->port.sysrq || oops_in_progress)
10076+ locked = spin_trylock_irqsave(&up->port.lock, flags);
10077 else
10078- spin_lock(&up->port.lock);
10079+ spin_lock_irqsave(&up->port.lock, flags);
10080
10081 /*
10082 * First save the IER then disable the interrupts
10083@@ -1346,8 +1343,7 @@
10084 pm_runtime_mark_last_busy(up->dev);
10085 pm_runtime_put_autosuspend(up->dev);
10086 if (locked)
10087- spin_unlock(&up->port.lock);
10088- local_irq_restore(flags);
10089+ spin_unlock_irqrestore(&up->port.lock, flags);
10090 }
10091
10092 static int __init
10093diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/core/hcd.c linux-4.14/drivers/usb/core/hcd.c
10094--- linux-4.14.orig/drivers/usb/core/hcd.c 2018-09-05 11:03:22.000000000 +0200
10095+++ linux-4.14/drivers/usb/core/hcd.c 2018-09-05 11:05:07.000000000 +0200
10096@@ -1775,9 +1775,9 @@
10097 * and no one may trigger the above deadlock situation when
10098 * running complete() in tasklet.
10099 */
10100- local_irq_save(flags);
10101+ local_irq_save_nort(flags);
10102 urb->complete(urb);
10103- local_irq_restore(flags);
10104+ local_irq_restore_nort(flags);
10105
10106 usb_anchor_resume_wakeups(anchor);
10107 atomic_dec(&urb->use_count);
10108diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_fs.c linux-4.14/drivers/usb/gadget/function/f_fs.c
10109--- linux-4.14.orig/drivers/usb/gadget/function/f_fs.c 2018-09-05 11:03:22.000000000 +0200
10110+++ linux-4.14/drivers/usb/gadget/function/f_fs.c 2018-09-05 11:05:07.000000000 +0200
10111@@ -1623,7 +1623,7 @@
10112 pr_info("%s(): freeing\n", __func__);
10113 ffs_data_clear(ffs);
10114 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10115- waitqueue_active(&ffs->ep0req_completion.wait) ||
10116+ swait_active(&ffs->ep0req_completion.wait) ||
10117 waitqueue_active(&ffs->wait));
10118 destroy_workqueue(ffs->io_completion_wq);
10119 kfree(ffs->dev_name);
10120diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c linux-4.14/drivers/usb/gadget/function/f_ncm.c
10121--- linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c 2017-11-12 19:46:13.000000000 +0100
10122+++ linux-4.14/drivers/usb/gadget/function/f_ncm.c 2018-09-05 11:05:07.000000000 +0200
10123@@ -77,9 +77,7 @@
10124 struct sk_buff *skb_tx_ndp;
10125 u16 ndp_dgram_count;
10126 bool timer_force_tx;
10127- struct tasklet_struct tx_tasklet;
10128 struct hrtimer task_timer;
10129-
10130 bool timer_stopping;
10131 };
10132
10133@@ -1108,7 +1106,7 @@
10134
10135 /* Delay the timer. */
10136 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10137- HRTIMER_MODE_REL);
10138+ HRTIMER_MODE_REL_SOFT);
10139
10140 /* Add the datagram position entries */
10141 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10142@@ -1152,17 +1150,15 @@
10143 }
10144
10145 /*
10146- * This transmits the NTB if there are frames waiting.
10147+ * The transmit should only be run if no skb data has been sent
10148+ * for a certain duration.
10149 */
10150-static void ncm_tx_tasklet(unsigned long data)
10151+static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10152 {
10153- struct f_ncm *ncm = (void *)data;
10154-
10155- if (ncm->timer_stopping)
10156- return;
10157+ struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10158
10159 /* Only send if data is available. */
10160- if (ncm->skb_tx_data) {
10161+ if (!ncm->timer_stopping && ncm->skb_tx_data) {
10162 ncm->timer_force_tx = true;
10163
10164 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10165@@ -1175,16 +1171,6 @@
10166
10167 ncm->timer_force_tx = false;
10168 }
10169-}
10170-
10171-/*
10172- * The transmit should only be run if no skb data has been sent
10173- * for a certain duration.
10174- */
10175-static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10176-{
10177- struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10178- tasklet_schedule(&ncm->tx_tasklet);
10179 return HRTIMER_NORESTART;
10180 }
10181
10182@@ -1517,8 +1503,7 @@
10183 ncm->port.open = ncm_open;
10184 ncm->port.close = ncm_close;
10185
10186- tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10187- hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10188+ hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10189 ncm->task_timer.function = ncm_tx_timeout;
10190
10191 DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10192@@ -1627,7 +1612,6 @@
10193 DBG(c->cdev, "ncm unbind\n");
10194
10195 hrtimer_cancel(&ncm->task_timer);
10196- tasklet_kill(&ncm->tx_tasklet);
10197
10198 ncm_string_defs[0].id = 0;
10199 usb_free_all_descriptors(f);
10200diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/legacy/inode.c linux-4.14/drivers/usb/gadget/legacy/inode.c
10201--- linux-4.14.orig/drivers/usb/gadget/legacy/inode.c 2017-11-12 19:46:13.000000000 +0100
10202+++ linux-4.14/drivers/usb/gadget/legacy/inode.c 2018-09-05 11:05:07.000000000 +0200
10203@@ -347,7 +347,7 @@
10204 spin_unlock_irq (&epdata->dev->lock);
10205
10206 if (likely (value == 0)) {
10207- value = wait_event_interruptible (done.wait, done.done);
10208+ value = swait_event_interruptible (done.wait, done.done);
10209 if (value != 0) {
10210 spin_lock_irq (&epdata->dev->lock);
10211 if (likely (epdata->ep != NULL)) {
10212@@ -356,7 +356,7 @@
10213 usb_ep_dequeue (epdata->ep, epdata->req);
10214 spin_unlock_irq (&epdata->dev->lock);
10215
10216- wait_event (done.wait, done.done);
10217+ swait_event (done.wait, done.done);
10218 if (epdata->status == -ECONNRESET)
10219 epdata->status = -EINTR;
10220 } else {
10221diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/aio.c linux-4.14/fs/aio.c
10222--- linux-4.14.orig/fs/aio.c 2018-09-05 11:03:22.000000000 +0200
10223+++ linux-4.14/fs/aio.c 2018-09-05 11:05:07.000000000 +0200
10224@@ -40,6 +40,7 @@
10225 #include <linux/ramfs.h>
10226 #include <linux/percpu-refcount.h>
10227 #include <linux/mount.h>
10228+#include <linux/swork.h>
10229
10230 #include <asm/kmap_types.h>
10231 #include <linux/uaccess.h>
10232@@ -117,6 +118,7 @@
10233
10234 struct rcu_head free_rcu;
10235 struct work_struct free_work; /* see free_ioctx() */
10236+ struct swork_event free_swork; /* see free_ioctx() */
10237
10238 /*
10239 * signals when all in-flight requests are done
10240@@ -259,6 +261,7 @@
10241 .mount = aio_mount,
10242 .kill_sb = kill_anon_super,
10243 };
10244+ BUG_ON(swork_get());
10245 aio_mnt = kern_mount(&aio_fs);
10246 if (IS_ERR(aio_mnt))
10247 panic("Failed to create aio fs mount.");
10248@@ -633,9 +636,9 @@
10249 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10250 * now it's safe to cancel any that need to be.
10251 */
10252-static void free_ioctx_users(struct percpu_ref *ref)
10253+static void free_ioctx_users_work(struct swork_event *sev)
10254 {
10255- struct kioctx *ctx = container_of(ref, struct kioctx, users);
10256+ struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10257 struct aio_kiocb *req;
10258
10259 spin_lock_irq(&ctx->ctx_lock);
10260@@ -653,6 +656,14 @@
10261 percpu_ref_put(&ctx->reqs);
10262 }
10263
10264+static void free_ioctx_users(struct percpu_ref *ref)
10265+{
10266+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
10267+
10268+ INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10269+ swork_queue(&ctx->free_swork);
10270+}
10271+
10272 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10273 {
10274 unsigned i, new_nr;
10275diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/autofs_i.h linux-4.14/fs/autofs4/autofs_i.h
10276--- linux-4.14.orig/fs/autofs4/autofs_i.h 2017-11-12 19:46:13.000000000 +0100
10277+++ linux-4.14/fs/autofs4/autofs_i.h 2018-09-05 11:05:07.000000000 +0200
10278@@ -20,6 +20,7 @@
10279 #include <linux/sched.h>
10280 #include <linux/mount.h>
10281 #include <linux/namei.h>
10282+#include <linux/delay.h>
10283 #include <linux/uaccess.h>
10284 #include <linux/mutex.h>
10285 #include <linux/spinlock.h>
10286diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/expire.c linux-4.14/fs/autofs4/expire.c
10287--- linux-4.14.orig/fs/autofs4/expire.c 2017-11-12 19:46:13.000000000 +0100
10288+++ linux-4.14/fs/autofs4/expire.c 2018-09-05 11:05:07.000000000 +0200
10289@@ -148,7 +148,7 @@
10290 parent = p->d_parent;
10291 if (!spin_trylock(&parent->d_lock)) {
10292 spin_unlock(&p->d_lock);
10293- cpu_relax();
10294+ cpu_chill();
10295 goto relock;
10296 }
10297 spin_unlock(&p->d_lock);
10298diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/buffer.c linux-4.14/fs/buffer.c
10299--- linux-4.14.orig/fs/buffer.c 2018-09-05 11:03:22.000000000 +0200
10300+++ linux-4.14/fs/buffer.c 2018-09-05 11:05:07.000000000 +0200
10301@@ -302,8 +302,7 @@
10302 * decide that the page is now completely done.
10303 */
10304 first = page_buffers(page);
10305- local_irq_save(flags);
10306- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10307+ flags = bh_uptodate_lock_irqsave(first);
10308 clear_buffer_async_read(bh);
10309 unlock_buffer(bh);
10310 tmp = bh;
10311@@ -316,8 +315,7 @@
10312 }
10313 tmp = tmp->b_this_page;
10314 } while (tmp != bh);
10315- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10316- local_irq_restore(flags);
10317+ bh_uptodate_unlock_irqrestore(first, flags);
10318
10319 /*
10320 * If none of the buffers had errors and they are all
10321@@ -329,9 +327,7 @@
10322 return;
10323
10324 still_busy:
10325- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10326- local_irq_restore(flags);
10327- return;
10328+ bh_uptodate_unlock_irqrestore(first, flags);
10329 }
10330
10331 /*
10332@@ -358,8 +354,7 @@
10333 }
10334
10335 first = page_buffers(page);
10336- local_irq_save(flags);
10337- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10338+ flags = bh_uptodate_lock_irqsave(first);
10339
10340 clear_buffer_async_write(bh);
10341 unlock_buffer(bh);
10342@@ -371,15 +366,12 @@
10343 }
10344 tmp = tmp->b_this_page;
10345 }
10346- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10347- local_irq_restore(flags);
10348+ bh_uptodate_unlock_irqrestore(first, flags);
10349 end_page_writeback(page);
10350 return;
10351
10352 still_busy:
10353- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10354- local_irq_restore(flags);
10355- return;
10356+ bh_uptodate_unlock_irqrestore(first, flags);
10357 }
10358 EXPORT_SYMBOL(end_buffer_async_write);
10359
10360@@ -3417,6 +3409,7 @@
10361 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10362 if (ret) {
10363 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10364+ buffer_head_init_locks(ret);
10365 preempt_disable();
10366 __this_cpu_inc(bh_accounting.nr);
10367 recalc_bh_state();
10368diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/cifs/readdir.c linux-4.14/fs/cifs/readdir.c
10369--- linux-4.14.orig/fs/cifs/readdir.c 2017-11-12 19:46:13.000000000 +0100
10370+++ linux-4.14/fs/cifs/readdir.c 2018-09-05 11:05:07.000000000 +0200
10371@@ -80,7 +80,7 @@
10372 struct inode *inode;
10373 struct super_block *sb = parent->d_sb;
10374 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10375- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10376+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10377
10378 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10379
10380diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/dcache.c linux-4.14/fs/dcache.c
10381--- linux-4.14.orig/fs/dcache.c 2018-09-05 11:03:29.000000000 +0200
10382+++ linux-4.14/fs/dcache.c 2018-09-05 11:05:07.000000000 +0200
10383@@ -19,6 +19,7 @@
10384 #include <linux/mm.h>
10385 #include <linux/fs.h>
10386 #include <linux/fsnotify.h>
10387+#include <linux/delay.h>
10388 #include <linux/slab.h>
10389 #include <linux/init.h>
10390 #include <linux/hash.h>
10391@@ -793,6 +794,8 @@
10392 */
10393 void dput(struct dentry *dentry)
10394 {
10395+ struct dentry *parent;
10396+
10397 if (unlikely(!dentry))
10398 return;
10399
10400@@ -829,9 +832,18 @@
10401 return;
10402
10403 kill_it:
10404- dentry = dentry_kill(dentry);
10405- if (dentry) {
10406- cond_resched();
10407+ parent = dentry_kill(dentry);
10408+ if (parent) {
10409+ int r;
10410+
10411+ if (parent == dentry) {
10412+ /* the task with the highest priority won't schedule */
10413+ r = cond_resched();
10414+ if (!r)
10415+ cpu_chill();
10416+ } else {
10417+ dentry = parent;
10418+ }
10419 goto repeat;
10420 }
10421 }
10422@@ -2394,7 +2406,7 @@
10423 if (dentry->d_lockref.count == 1) {
10424 if (!spin_trylock(&inode->i_lock)) {
10425 spin_unlock(&dentry->d_lock);
10426- cpu_relax();
10427+ cpu_chill();
10428 goto again;
10429 }
10430 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10431@@ -2439,9 +2451,10 @@
10432 static inline unsigned start_dir_add(struct inode *dir)
10433 {
10434
10435+ preempt_disable_rt();
10436 for (;;) {
10437- unsigned n = dir->i_dir_seq;
10438- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10439+ unsigned n = dir->__i_dir_seq;
10440+ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10441 return n;
10442 cpu_relax();
10443 }
10444@@ -2449,26 +2462,30 @@
10445
10446 static inline void end_dir_add(struct inode *dir, unsigned n)
10447 {
10448- smp_store_release(&dir->i_dir_seq, n + 2);
10449+ smp_store_release(&dir->__i_dir_seq, n + 2);
10450+ preempt_enable_rt();
10451 }
10452
10453 static void d_wait_lookup(struct dentry *dentry)
10454 {
10455- if (d_in_lookup(dentry)) {
10456- DECLARE_WAITQUEUE(wait, current);
10457- add_wait_queue(dentry->d_wait, &wait);
10458- do {
10459- set_current_state(TASK_UNINTERRUPTIBLE);
10460- spin_unlock(&dentry->d_lock);
10461- schedule();
10462- spin_lock(&dentry->d_lock);
10463- } while (d_in_lookup(dentry));
10464- }
10465+ struct swait_queue __wait;
10466+
10467+ if (!d_in_lookup(dentry))
10468+ return;
10469+
10470+ INIT_LIST_HEAD(&__wait.task_list);
10471+ do {
10472+ prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10473+ spin_unlock(&dentry->d_lock);
10474+ schedule();
10475+ spin_lock(&dentry->d_lock);
10476+ } while (d_in_lookup(dentry));
10477+ finish_swait(dentry->d_wait, &__wait);
10478 }
10479
10480 struct dentry *d_alloc_parallel(struct dentry *parent,
10481 const struct qstr *name,
10482- wait_queue_head_t *wq)
10483+ struct swait_queue_head *wq)
10484 {
10485 unsigned int hash = name->hash;
10486 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10487@@ -2482,7 +2499,7 @@
10488
10489 retry:
10490 rcu_read_lock();
10491- seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10492+ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10493 r_seq = read_seqbegin(&rename_lock);
10494 dentry = __d_lookup_rcu(parent, name, &d_seq);
10495 if (unlikely(dentry)) {
10496@@ -2510,7 +2527,7 @@
10497 }
10498
10499 hlist_bl_lock(b);
10500- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10501+ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10502 hlist_bl_unlock(b);
10503 rcu_read_unlock();
10504 goto retry;
10505@@ -2583,7 +2600,7 @@
10506 hlist_bl_lock(b);
10507 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10508 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10509- wake_up_all(dentry->d_wait);
10510+ swake_up_all(dentry->d_wait);
10511 dentry->d_wait = NULL;
10512 hlist_bl_unlock(b);
10513 INIT_HLIST_NODE(&dentry->d_u.d_alias);
10514@@ -3619,6 +3636,8 @@
10515
10516 static void __init dcache_init_early(void)
10517 {
10518+ unsigned int loop;
10519+
10520 /* If hashes are distributed across NUMA nodes, defer
10521 * hash allocation until vmalloc space is available.
10522 */
10523@@ -3635,10 +3654,14 @@
10524 &d_hash_mask,
10525 0,
10526 0);
10527+
10528+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10529+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10530 }
10531
10532 static void __init dcache_init(void)
10533 {
10534+ unsigned int loop;
10535 /*
10536 * A constructor could be added for stable state like the lists,
10537 * but it is probably not worth it because of the cache nature
10538@@ -3661,6 +3684,10 @@
10539 &d_hash_mask,
10540 0,
10541 0);
10542+
10543+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10544+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10545+
10546 }
10547
10548 /* SLAB cache for __getname() consumers */
10549diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/eventpoll.c linux-4.14/fs/eventpoll.c
10550--- linux-4.14.orig/fs/eventpoll.c 2017-11-12 19:46:13.000000000 +0100
10551+++ linux-4.14/fs/eventpoll.c 2018-09-05 11:05:07.000000000 +0200
10552@@ -587,12 +587,12 @@
10553 */
10554 static void ep_poll_safewake(wait_queue_head_t *wq)
10555 {
10556- int this_cpu = get_cpu();
10557+ int this_cpu = get_cpu_light();
10558
10559 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10560 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10561
10562- put_cpu();
10563+ put_cpu_light();
10564 }
10565
10566 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10567diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/exec.c linux-4.14/fs/exec.c
10568--- linux-4.14.orig/fs/exec.c 2018-09-05 11:03:29.000000000 +0200
10569+++ linux-4.14/fs/exec.c 2018-09-05 11:05:07.000000000 +0200
10570@@ -1025,12 +1025,14 @@
10571 }
10572 }
10573 task_lock(tsk);
10574+ preempt_disable_rt();
10575 active_mm = tsk->active_mm;
10576 tsk->mm = mm;
10577 tsk->active_mm = mm;
10578 activate_mm(active_mm, mm);
10579 tsk->mm->vmacache_seqnum = 0;
10580 vmacache_flush(tsk);
10581+ preempt_enable_rt();
10582 task_unlock(tsk);
10583 if (old_mm) {
10584 up_read(&old_mm->mmap_sem);
10585diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ext4/page-io.c linux-4.14/fs/ext4/page-io.c
10586--- linux-4.14.orig/fs/ext4/page-io.c 2017-11-12 19:46:13.000000000 +0100
10587+++ linux-4.14/fs/ext4/page-io.c 2018-09-05 11:05:07.000000000 +0200
10588@@ -95,8 +95,7 @@
10589 * We check all buffers in the page under BH_Uptodate_Lock
10590 * to avoid races with other end io clearing async_write flags
10591 */
10592- local_irq_save(flags);
10593- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10594+ flags = bh_uptodate_lock_irqsave(head);
10595 do {
10596 if (bh_offset(bh) < bio_start ||
10597 bh_offset(bh) + bh->b_size > bio_end) {
10598@@ -108,8 +107,7 @@
10599 if (bio->bi_status)
10600 buffer_io_error(bh);
10601 } while ((bh = bh->b_this_page) != head);
10602- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10603- local_irq_restore(flags);
10604+ bh_uptodate_unlock_irqrestore(head, flags);
10605 if (!under_io) {
10606 #ifdef CONFIG_EXT4_FS_ENCRYPTION
10607 if (data_page)
10608diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/fuse/dir.c linux-4.14/fs/fuse/dir.c
10609--- linux-4.14.orig/fs/fuse/dir.c 2018-09-05 11:03:22.000000000 +0200
10610+++ linux-4.14/fs/fuse/dir.c 2018-09-05 11:05:07.000000000 +0200
10611@@ -1187,7 +1187,7 @@
10612 struct inode *dir = d_inode(parent);
10613 struct fuse_conn *fc;
10614 struct inode *inode;
10615- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10616+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10617
10618 if (!o->nodeid) {
10619 /*
10620diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/inode.c linux-4.14/fs/inode.c
10621--- linux-4.14.orig/fs/inode.c 2018-09-05 11:03:29.000000000 +0200
10622+++ linux-4.14/fs/inode.c 2018-09-05 11:05:07.000000000 +0200
10623@@ -154,7 +154,7 @@
10624 inode->i_bdev = NULL;
10625 inode->i_cdev = NULL;
10626 inode->i_link = NULL;
10627- inode->i_dir_seq = 0;
10628+ inode->__i_dir_seq = 0;
10629 inode->i_rdev = 0;
10630 inode->dirtied_when = 0;
10631
10632diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/libfs.c linux-4.14/fs/libfs.c
10633--- linux-4.14.orig/fs/libfs.c 2017-11-12 19:46:13.000000000 +0100
10634+++ linux-4.14/fs/libfs.c 2018-09-05 11:05:07.000000000 +0200
10635@@ -90,7 +90,7 @@
10636 struct list_head *from,
10637 int count)
10638 {
10639- unsigned *seq = &parent->d_inode->i_dir_seq, n;
10640+ unsigned *seq = &parent->d_inode->__i_dir_seq, n;
10641 struct dentry *res;
10642 struct list_head *p;
10643 bool skipped;
10644@@ -123,8 +123,9 @@
10645 static void move_cursor(struct dentry *cursor, struct list_head *after)
10646 {
10647 struct dentry *parent = cursor->d_parent;
10648- unsigned n, *seq = &parent->d_inode->i_dir_seq;
10649+ unsigned n, *seq = &parent->d_inode->__i_dir_seq;
10650 spin_lock(&parent->d_lock);
10651+ preempt_disable_rt();
10652 for (;;) {
10653 n = *seq;
10654 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
10655@@ -137,6 +138,7 @@
10656 else
10657 list_add_tail(&cursor->d_child, &parent->d_subdirs);
10658 smp_store_release(seq, n + 2);
10659+ preempt_enable_rt();
10660 spin_unlock(&parent->d_lock);
10661 }
10662
10663diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/locks.c linux-4.14/fs/locks.c
10664--- linux-4.14.orig/fs/locks.c 2017-11-12 19:46:13.000000000 +0100
10665+++ linux-4.14/fs/locks.c 2018-09-05 11:05:07.000000000 +0200
10666@@ -945,7 +945,7 @@
10667 return -ENOMEM;
10668 }
10669
10670- percpu_down_read_preempt_disable(&file_rwsem);
10671+ percpu_down_read(&file_rwsem);
10672 spin_lock(&ctx->flc_lock);
10673 if (request->fl_flags & FL_ACCESS)
10674 goto find_conflict;
10675@@ -986,7 +986,7 @@
10676
10677 out:
10678 spin_unlock(&ctx->flc_lock);
10679- percpu_up_read_preempt_enable(&file_rwsem);
10680+ percpu_up_read(&file_rwsem);
10681 if (new_fl)
10682 locks_free_lock(new_fl);
10683 locks_dispose_list(&dispose);
10684@@ -1023,7 +1023,7 @@
10685 new_fl2 = locks_alloc_lock();
10686 }
10687
10688- percpu_down_read_preempt_disable(&file_rwsem);
10689+ percpu_down_read(&file_rwsem);
10690 spin_lock(&ctx->flc_lock);
10691 /*
10692 * New lock request. Walk all POSIX locks and look for conflicts. If
10693@@ -1195,7 +1195,7 @@
10694 }
10695 out:
10696 spin_unlock(&ctx->flc_lock);
10697- percpu_up_read_preempt_enable(&file_rwsem);
10698+ percpu_up_read(&file_rwsem);
10699 /*
10700 * Free any unused locks.
10701 */
10702@@ -1470,7 +1470,7 @@
10703 return error;
10704 }
10705
10706- percpu_down_read_preempt_disable(&file_rwsem);
10707+ percpu_down_read(&file_rwsem);
10708 spin_lock(&ctx->flc_lock);
10709
10710 time_out_leases(inode, &dispose);
10711@@ -1522,13 +1522,13 @@
10712 locks_insert_block(fl, new_fl);
10713 trace_break_lease_block(inode, new_fl);
10714 spin_unlock(&ctx->flc_lock);
10715- percpu_up_read_preempt_enable(&file_rwsem);
10716+ percpu_up_read(&file_rwsem);
10717
10718 locks_dispose_list(&dispose);
10719 error = wait_event_interruptible_timeout(new_fl->fl_wait,
10720 !new_fl->fl_next, break_time);
10721
10722- percpu_down_read_preempt_disable(&file_rwsem);
10723+ percpu_down_read(&file_rwsem);
10724 spin_lock(&ctx->flc_lock);
10725 trace_break_lease_unblock(inode, new_fl);
10726 locks_delete_block(new_fl);
10727@@ -1545,7 +1545,7 @@
10728 }
10729 out:
10730 spin_unlock(&ctx->flc_lock);
10731- percpu_up_read_preempt_enable(&file_rwsem);
10732+ percpu_up_read(&file_rwsem);
10733 locks_dispose_list(&dispose);
10734 locks_free_lock(new_fl);
10735 return error;
10736@@ -1619,7 +1619,7 @@
10737
10738 ctx = smp_load_acquire(&inode->i_flctx);
10739 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
10740- percpu_down_read_preempt_disable(&file_rwsem);
10741+ percpu_down_read(&file_rwsem);
10742 spin_lock(&ctx->flc_lock);
10743 time_out_leases(inode, &dispose);
10744 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10745@@ -1629,7 +1629,7 @@
10746 break;
10747 }
10748 spin_unlock(&ctx->flc_lock);
10749- percpu_up_read_preempt_enable(&file_rwsem);
10750+ percpu_up_read(&file_rwsem);
10751
10752 locks_dispose_list(&dispose);
10753 }
10754@@ -1704,7 +1704,7 @@
10755 return -EINVAL;
10756 }
10757
10758- percpu_down_read_preempt_disable(&file_rwsem);
10759+ percpu_down_read(&file_rwsem);
10760 spin_lock(&ctx->flc_lock);
10761 time_out_leases(inode, &dispose);
10762 error = check_conflicting_open(dentry, arg, lease->fl_flags);
10763@@ -1775,7 +1775,7 @@
10764 lease->fl_lmops->lm_setup(lease, priv);
10765 out:
10766 spin_unlock(&ctx->flc_lock);
10767- percpu_up_read_preempt_enable(&file_rwsem);
10768+ percpu_up_read(&file_rwsem);
10769 locks_dispose_list(&dispose);
10770 if (is_deleg)
10771 inode_unlock(inode);
10772@@ -1798,7 +1798,7 @@
10773 return error;
10774 }
10775
10776- percpu_down_read_preempt_disable(&file_rwsem);
10777+ percpu_down_read(&file_rwsem);
10778 spin_lock(&ctx->flc_lock);
10779 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10780 if (fl->fl_file == filp &&
10781@@ -1811,7 +1811,7 @@
10782 if (victim)
10783 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
10784 spin_unlock(&ctx->flc_lock);
10785- percpu_up_read_preempt_enable(&file_rwsem);
10786+ percpu_up_read(&file_rwsem);
10787 locks_dispose_list(&dispose);
10788 return error;
10789 }
10790@@ -2535,13 +2535,13 @@
10791 if (list_empty(&ctx->flc_lease))
10792 return;
10793
10794- percpu_down_read_preempt_disable(&file_rwsem);
10795+ percpu_down_read(&file_rwsem);
10796 spin_lock(&ctx->flc_lock);
10797 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
10798 if (filp == fl->fl_file)
10799 lease_modify(fl, F_UNLCK, &dispose);
10800 spin_unlock(&ctx->flc_lock);
10801- percpu_up_read_preempt_enable(&file_rwsem);
10802+ percpu_up_read(&file_rwsem);
10803
10804 locks_dispose_list(&dispose);
10805 }
10806diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namei.c linux-4.14/fs/namei.c
10807--- linux-4.14.orig/fs/namei.c 2018-09-05 11:03:22.000000000 +0200
10808+++ linux-4.14/fs/namei.c 2018-09-05 11:05:07.000000000 +0200
10809@@ -1627,7 +1627,7 @@
10810 {
10811 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
10812 struct inode *inode = dir->d_inode;
10813- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10814+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10815
10816 inode_lock_shared(inode);
10817 /* Don't go there if it's already dead */
10818@@ -3100,7 +3100,7 @@
10819 struct dentry *dentry;
10820 int error, create_error = 0;
10821 umode_t mode = op->mode;
10822- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10823+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10824
10825 if (unlikely(IS_DEADDIR(dir_inode)))
10826 return -ENOENT;
10827diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namespace.c linux-4.14/fs/namespace.c
10828--- linux-4.14.orig/fs/namespace.c 2018-09-05 11:03:29.000000000 +0200
10829+++ linux-4.14/fs/namespace.c 2018-09-05 11:05:07.000000000 +0200
10830@@ -14,6 +14,7 @@
10831 #include <linux/mnt_namespace.h>
10832 #include <linux/user_namespace.h>
10833 #include <linux/namei.h>
10834+#include <linux/delay.h>
10835 #include <linux/security.h>
10836 #include <linux/cred.h>
10837 #include <linux/idr.h>
10838@@ -353,8 +354,11 @@
10839 * incremented count after it has set MNT_WRITE_HOLD.
10840 */
10841 smp_mb();
10842- while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
10843- cpu_relax();
10844+ while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
10845+ preempt_enable();
10846+ cpu_chill();
10847+ preempt_disable();
10848+ }
10849 /*
10850 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
10851 * be set to match its requirements. So we must not load that until
10852diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/delegation.c linux-4.14/fs/nfs/delegation.c
10853--- linux-4.14.orig/fs/nfs/delegation.c 2017-11-12 19:46:13.000000000 +0100
10854+++ linux-4.14/fs/nfs/delegation.c 2018-09-05 11:05:07.000000000 +0200
10855@@ -150,11 +150,11 @@
10856 sp = state->owner;
10857 /* Block nfs4_proc_unlck */
10858 mutex_lock(&sp->so_delegreturn_mutex);
10859- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10860+ seq = read_seqbegin(&sp->so_reclaim_seqlock);
10861 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
10862 if (!err)
10863 err = nfs_delegation_claim_locks(ctx, state, stateid);
10864- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10865+ if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
10866 err = -EAGAIN;
10867 mutex_unlock(&sp->so_delegreturn_mutex);
10868 put_nfs_open_context(ctx);
10869diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/dir.c linux-4.14/fs/nfs/dir.c
10870--- linux-4.14.orig/fs/nfs/dir.c 2018-09-05 11:03:22.000000000 +0200
10871+++ linux-4.14/fs/nfs/dir.c 2018-09-05 11:05:07.000000000 +0200
10872@@ -452,7 +452,7 @@
10873 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
10874 {
10875 struct qstr filename = QSTR_INIT(entry->name, entry->len);
10876- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10877+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10878 struct dentry *dentry;
10879 struct dentry *alias;
10880 struct inode *dir = d_inode(parent);
10881@@ -1443,7 +1443,7 @@
10882 struct file *file, unsigned open_flags,
10883 umode_t mode, int *opened)
10884 {
10885- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10886+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10887 struct nfs_open_context *ctx;
10888 struct dentry *res;
10889 struct iattr attr = { .ia_valid = ATTR_OPEN };
10890@@ -1763,7 +1763,11 @@
10891
10892 trace_nfs_rmdir_enter(dir, dentry);
10893 if (d_really_is_positive(dentry)) {
10894+#ifdef CONFIG_PREEMPT_RT_BASE
10895+ down(&NFS_I(d_inode(dentry))->rmdir_sem);
10896+#else
10897 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10898+#endif
10899 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10900 /* Ensure the VFS deletes this inode */
10901 switch (error) {
10902@@ -1773,7 +1777,11 @@
10903 case -ENOENT:
10904 nfs_dentry_handle_enoent(dentry);
10905 }
10906+#ifdef CONFIG_PREEMPT_RT_BASE
10907+ up(&NFS_I(d_inode(dentry))->rmdir_sem);
10908+#else
10909 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10910+#endif
10911 } else
10912 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10913 trace_nfs_rmdir_exit(dir, dentry, error);
10914diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/inode.c linux-4.14/fs/nfs/inode.c
10915--- linux-4.14.orig/fs/nfs/inode.c 2017-11-12 19:46:13.000000000 +0100
10916+++ linux-4.14/fs/nfs/inode.c 2018-09-05 11:05:07.000000000 +0200
10917@@ -2014,7 +2014,11 @@
10918 atomic_long_set(&nfsi->nrequests, 0);
10919 atomic_long_set(&nfsi->commit_info.ncommit, 0);
10920 atomic_set(&nfsi->commit_info.rpcs_out, 0);
10921+#ifdef CONFIG_PREEMPT_RT_BASE
10922+ sema_init(&nfsi->rmdir_sem, 1);
10923+#else
10924 init_rwsem(&nfsi->rmdir_sem);
10925+#endif
10926 mutex_init(&nfsi->commit_mutex);
10927 nfs4_init_once(nfsi);
10928 }
10929diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4_fs.h linux-4.14/fs/nfs/nfs4_fs.h
10930--- linux-4.14.orig/fs/nfs/nfs4_fs.h 2018-09-05 11:03:22.000000000 +0200
10931+++ linux-4.14/fs/nfs/nfs4_fs.h 2018-09-05 11:05:07.000000000 +0200
10932@@ -112,7 +112,7 @@
10933 unsigned long so_flags;
10934 struct list_head so_states;
10935 struct nfs_seqid_counter so_seqid;
10936- seqcount_t so_reclaim_seqcount;
10937+ seqlock_t so_reclaim_seqlock;
10938 struct mutex so_delegreturn_mutex;
10939 };
10940
10941diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4proc.c linux-4.14/fs/nfs/nfs4proc.c
10942--- linux-4.14.orig/fs/nfs/nfs4proc.c 2018-09-05 11:03:22.000000000 +0200
10943+++ linux-4.14/fs/nfs/nfs4proc.c 2018-09-05 11:05:07.000000000 +0200
10944@@ -2689,7 +2689,7 @@
10945 unsigned int seq;
10946 int ret;
10947
10948- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10949+ seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10950
10951 ret = _nfs4_proc_open(opendata);
10952 if (ret != 0)
10953@@ -2727,7 +2727,7 @@
10954
10955 if (d_inode(dentry) == state->inode) {
10956 nfs_inode_attach_open_context(ctx);
10957- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10958+ if (read_seqretry(&sp->so_reclaim_seqlock, seq))
10959 nfs4_schedule_stateid_recovery(server, state);
10960 }
10961 out:
10962diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4state.c linux-4.14/fs/nfs/nfs4state.c
10963--- linux-4.14.orig/fs/nfs/nfs4state.c 2018-09-05 11:03:22.000000000 +0200
10964+++ linux-4.14/fs/nfs/nfs4state.c 2018-09-05 11:05:07.000000000 +0200
10965@@ -494,7 +494,7 @@
10966 nfs4_init_seqid_counter(&sp->so_seqid);
10967 atomic_set(&sp->so_count, 1);
10968 INIT_LIST_HEAD(&sp->so_lru);
10969- seqcount_init(&sp->so_reclaim_seqcount);
10970+ seqlock_init(&sp->so_reclaim_seqlock);
10971 mutex_init(&sp->so_delegreturn_mutex);
10972 return sp;
10973 }
10974@@ -1519,8 +1519,12 @@
10975 * recovering after a network partition or a reboot from a
10976 * server that doesn't support a grace period.
10977 */
10978+#ifdef CONFIG_PREEMPT_RT_FULL
10979+ write_seqlock(&sp->so_reclaim_seqlock);
10980+#else
10981+ write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10982+#endif
10983 spin_lock(&sp->so_lock);
10984- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
10985 restart:
10986 list_for_each_entry(state, &sp->so_states, open_states) {
10987 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
10988@@ -1589,14 +1593,20 @@
10989 spin_lock(&sp->so_lock);
10990 goto restart;
10991 }
10992- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
10993 spin_unlock(&sp->so_lock);
10994+#ifdef CONFIG_PREEMPT_RT_FULL
10995+ write_sequnlock(&sp->so_reclaim_seqlock);
10996+#else
10997+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
10998+#endif
10999 return 0;
11000 out_err:
11001 nfs4_put_open_state(state);
11002- spin_lock(&sp->so_lock);
11003- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11004- spin_unlock(&sp->so_lock);
11005+#ifdef CONFIG_PREEMPT_RT_FULL
11006+ write_sequnlock(&sp->so_reclaim_seqlock);
11007+#else
11008+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11009+#endif
11010 return status;
11011 }
11012
11013diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/unlink.c linux-4.14/fs/nfs/unlink.c
11014--- linux-4.14.orig/fs/nfs/unlink.c 2017-11-12 19:46:13.000000000 +0100
11015+++ linux-4.14/fs/nfs/unlink.c 2018-09-05 11:05:07.000000000 +0200
11016@@ -13,7 +13,7 @@
11017 #include <linux/sunrpc/clnt.h>
11018 #include <linux/nfs_fs.h>
11019 #include <linux/sched.h>
11020-#include <linux/wait.h>
11021+#include <linux/swait.h>
11022 #include <linux/namei.h>
11023 #include <linux/fsnotify.h>
11024
11025@@ -52,6 +52,29 @@
11026 rpc_restart_call_prepare(task);
11027 }
11028
11029+#ifdef CONFIG_PREEMPT_RT_BASE
11030+static void nfs_down_anon(struct semaphore *sema)
11031+{
11032+ down(sema);
11033+}
11034+
11035+static void nfs_up_anon(struct semaphore *sema)
11036+{
11037+ up(sema);
11038+}
11039+
11040+#else
11041+static void nfs_down_anon(struct rw_semaphore *rwsem)
11042+{
11043+ down_read_non_owner(rwsem);
11044+}
11045+
11046+static void nfs_up_anon(struct rw_semaphore *rwsem)
11047+{
11048+ up_read_non_owner(rwsem);
11049+}
11050+#endif
11051+
11052 /**
11053 * nfs_async_unlink_release - Release the sillydelete data.
11054 * @task: rpc_task of the sillydelete
11055@@ -65,7 +88,7 @@
11056 struct dentry *dentry = data->dentry;
11057 struct super_block *sb = dentry->d_sb;
11058
11059- up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11060+ nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11061 d_lookup_done(dentry);
11062 nfs_free_unlinkdata(data);
11063 dput(dentry);
11064@@ -118,10 +141,10 @@
11065 struct inode *dir = d_inode(dentry->d_parent);
11066 struct dentry *alias;
11067
11068- down_read_non_owner(&NFS_I(dir)->rmdir_sem);
11069+ nfs_down_anon(&NFS_I(dir)->rmdir_sem);
11070 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
11071 if (IS_ERR(alias)) {
11072- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11073+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11074 return 0;
11075 }
11076 if (!d_in_lookup(alias)) {
11077@@ -143,7 +166,7 @@
11078 ret = 0;
11079 spin_unlock(&alias->d_lock);
11080 dput(alias);
11081- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11082+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11083 /*
11084 * If we'd displaced old cached devname, free it. At that
11085 * point dentry is definitely not a root, so we won't need
11086@@ -183,7 +206,7 @@
11087 goto out_free_name;
11088 }
11089 data->res.dir_attr = &data->dir_attr;
11090- init_waitqueue_head(&data->wq);
11091+ init_swait_queue_head(&data->wq);
11092
11093 status = -EBUSY;
11094 spin_lock(&dentry->d_lock);
11095diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ntfs/aops.c linux-4.14/fs/ntfs/aops.c
11096--- linux-4.14.orig/fs/ntfs/aops.c 2017-11-12 19:46:13.000000000 +0100
11097+++ linux-4.14/fs/ntfs/aops.c 2018-09-05 11:05:07.000000000 +0200
11098@@ -93,13 +93,13 @@
11099 ofs = 0;
11100 if (file_ofs < init_size)
11101 ofs = init_size - file_ofs;
11102- local_irq_save(flags);
11103+ local_irq_save_nort(flags);
11104 kaddr = kmap_atomic(page);
11105 memset(kaddr + bh_offset(bh) + ofs, 0,
11106 bh->b_size - ofs);
11107 flush_dcache_page(page);
11108 kunmap_atomic(kaddr);
11109- local_irq_restore(flags);
11110+ local_irq_restore_nort(flags);
11111 }
11112 } else {
11113 clear_buffer_uptodate(bh);
11114@@ -108,8 +108,7 @@
11115 "0x%llx.", (unsigned long long)bh->b_blocknr);
11116 }
11117 first = page_buffers(page);
11118- local_irq_save(flags);
11119- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11120+ flags = bh_uptodate_lock_irqsave(first);
11121 clear_buffer_async_read(bh);
11122 unlock_buffer(bh);
11123 tmp = bh;
11124@@ -124,8 +123,7 @@
11125 }
11126 tmp = tmp->b_this_page;
11127 } while (tmp != bh);
11128- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11129- local_irq_restore(flags);
11130+ bh_uptodate_unlock_irqrestore(first, flags);
11131 /*
11132 * If none of the buffers had errors then we can set the page uptodate,
11133 * but we first have to perform the post read mst fixups, if the
11134@@ -146,13 +144,13 @@
11135 recs = PAGE_SIZE / rec_size;
11136 /* Should have been verified before we got here... */
11137 BUG_ON(!recs);
11138- local_irq_save(flags);
11139+ local_irq_save_nort(flags);
11140 kaddr = kmap_atomic(page);
11141 for (i = 0; i < recs; i++)
11142 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11143 i * rec_size), rec_size);
11144 kunmap_atomic(kaddr);
11145- local_irq_restore(flags);
11146+ local_irq_restore_nort(flags);
11147 flush_dcache_page(page);
11148 if (likely(page_uptodate && !PageError(page)))
11149 SetPageUptodate(page);
11150@@ -160,9 +158,7 @@
11151 unlock_page(page);
11152 return;
11153 still_busy:
11154- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11155- local_irq_restore(flags);
11156- return;
11157+ bh_uptodate_unlock_irqrestore(first, flags);
11158 }
11159
11160 /**
11161diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/array.c linux-4.14/fs/proc/array.c
11162--- linux-4.14.orig/fs/proc/array.c 2018-09-05 11:03:22.000000000 +0200
11163+++ linux-4.14/fs/proc/array.c 2018-09-05 11:05:07.000000000 +0200
11164@@ -386,9 +386,9 @@
11165 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
11166 {
11167 seq_printf(m, "Cpus_allowed:\t%*pb\n",
11168- cpumask_pr_args(&task->cpus_allowed));
11169+ cpumask_pr_args(task->cpus_ptr));
11170 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
11171- cpumask_pr_args(&task->cpus_allowed));
11172+ cpumask_pr_args(task->cpus_ptr));
11173 }
11174
11175 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
11176diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/base.c linux-4.14/fs/proc/base.c
11177--- linux-4.14.orig/fs/proc/base.c 2018-09-05 11:03:28.000000000 +0200
11178+++ linux-4.14/fs/proc/base.c 2018-09-05 11:05:07.000000000 +0200
11179@@ -1886,7 +1886,7 @@
11180
11181 child = d_hash_and_lookup(dir, &qname);
11182 if (!child) {
11183- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11184+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11185 child = d_alloc_parallel(dir, &qname, &wq);
11186 if (IS_ERR(child))
11187 goto end_instantiate;
11188diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/proc_sysctl.c linux-4.14/fs/proc/proc_sysctl.c
11189--- linux-4.14.orig/fs/proc/proc_sysctl.c 2018-09-05 11:03:22.000000000 +0200
11190+++ linux-4.14/fs/proc/proc_sysctl.c 2018-09-05 11:05:07.000000000 +0200
11191@@ -679,7 +679,7 @@
11192
11193 child = d_lookup(dir, &qname);
11194 if (!child) {
11195- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11196+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11197 child = d_alloc_parallel(dir, &qname, &wq);
11198 if (IS_ERR(child))
11199 return false;
11200diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/timerfd.c linux-4.14/fs/timerfd.c
11201--- linux-4.14.orig/fs/timerfd.c 2017-11-12 19:46:13.000000000 +0100
11202+++ linux-4.14/fs/timerfd.c 2018-09-05 11:05:07.000000000 +0200
11203@@ -471,7 +471,10 @@
11204 break;
11205 }
11206 spin_unlock_irq(&ctx->wqh.lock);
11207- cpu_relax();
11208+ if (isalarm(ctx))
11209+ hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11210+ else
11211+ hrtimer_wait_for_timer(&ctx->t.tmr);
11212 }
11213
11214 /*
11215diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/xfs/xfs_aops.c linux-4.14/fs/xfs/xfs_aops.c
11216--- linux-4.14.orig/fs/xfs/xfs_aops.c 2018-09-05 11:03:22.000000000 +0200
11217+++ linux-4.14/fs/xfs/xfs_aops.c 2018-09-05 11:05:07.000000000 +0200
11218@@ -120,8 +120,7 @@
11219 ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
11220 ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
11221
11222- local_irq_save(flags);
11223- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11224+ flags = bh_uptodate_lock_irqsave(head);
11225 do {
11226 if (off >= bvec->bv_offset &&
11227 off < bvec->bv_offset + bvec->bv_len) {
11228@@ -143,8 +142,7 @@
11229 }
11230 off += bh->b_size;
11231 } while ((bh = bh->b_this_page) != head);
11232- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11233- local_irq_restore(flags);
11234+ bh_uptodate_unlock_irqrestore(head, flags);
11235
11236 if (!busy)
11237 end_page_writeback(bvec->bv_page);
11238diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/acpi/platform/aclinux.h linux-4.14/include/acpi/platform/aclinux.h
11239--- linux-4.14.orig/include/acpi/platform/aclinux.h 2017-11-12 19:46:13.000000000 +0100
11240+++ linux-4.14/include/acpi/platform/aclinux.h 2018-09-05 11:05:07.000000000 +0200
11241@@ -134,6 +134,7 @@
11242
11243 #define acpi_cache_t struct kmem_cache
11244 #define acpi_spinlock spinlock_t *
11245+#define acpi_raw_spinlock raw_spinlock_t *
11246 #define acpi_cpu_flags unsigned long
11247
11248 /* Use native linux version of acpi_os_allocate_zeroed */
11249@@ -152,6 +153,20 @@
11250 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11251 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11252
11253+#define acpi_os_create_raw_lock(__handle) \
11254+({ \
11255+ raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
11256+ \
11257+ if (lock) { \
11258+ *(__handle) = lock; \
11259+ raw_spin_lock_init(*(__handle)); \
11260+ } \
11261+ lock ? AE_OK : AE_NO_MEMORY; \
11262+ })
11263+
11264+#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
11265+
11266+
11267 /*
11268 * OSL interfaces used by debugger/disassembler
11269 */
11270diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/asm-generic/bug.h linux-4.14/include/asm-generic/bug.h
11271--- linux-4.14.orig/include/asm-generic/bug.h 2018-09-05 11:03:22.000000000 +0200
11272+++ linux-4.14/include/asm-generic/bug.h 2018-09-05 11:05:07.000000000 +0200
11273@@ -234,6 +234,20 @@
11274 # define WARN_ON_SMP(x) ({0;})
11275 #endif
11276
11277+#ifdef CONFIG_PREEMPT_RT_BASE
11278+# define BUG_ON_RT(c) BUG_ON(c)
11279+# define BUG_ON_NONRT(c) do { } while (0)
11280+# define WARN_ON_RT(condition) WARN_ON(condition)
11281+# define WARN_ON_NONRT(condition) do { } while (0)
11282+# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11283+#else
11284+# define BUG_ON_RT(c) do { } while (0)
11285+# define BUG_ON_NONRT(c) BUG_ON(c)
11286+# define WARN_ON_RT(condition) do { } while (0)
11287+# define WARN_ON_NONRT(condition) WARN_ON(condition)
11288+# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11289+#endif
11290+
11291 #endif /* __ASSEMBLY__ */
11292
11293 #endif
11294diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blkdev.h linux-4.14/include/linux/blkdev.h
11295--- linux-4.14.orig/include/linux/blkdev.h 2018-09-05 11:03:22.000000000 +0200
11296+++ linux-4.14/include/linux/blkdev.h 2018-09-05 11:05:07.000000000 +0200
11297@@ -27,6 +27,7 @@
11298 #include <linux/percpu-refcount.h>
11299 #include <linux/scatterlist.h>
11300 #include <linux/blkzoned.h>
11301+#include <linux/swork.h>
11302
11303 struct module;
11304 struct scsi_ioctl_command;
11305@@ -134,6 +135,9 @@
11306 */
11307 struct request {
11308 struct list_head queuelist;
11309+#ifdef CONFIG_PREEMPT_RT_FULL
11310+ struct work_struct work;
11311+#endif
11312 union {
11313 struct __call_single_data csd;
11314 u64 fifo_time;
11315@@ -596,6 +600,7 @@
11316 #endif
11317 struct rcu_head rcu_head;
11318 wait_queue_head_t mq_freeze_wq;
11319+ struct swork_event mq_pcpu_wake;
11320 struct percpu_ref q_usage_counter;
11321 struct list_head all_q_node;
11322
11323diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blk-mq.h linux-4.14/include/linux/blk-mq.h
11324--- linux-4.14.orig/include/linux/blk-mq.h 2017-11-12 19:46:13.000000000 +0100
11325+++ linux-4.14/include/linux/blk-mq.h 2018-09-05 11:05:07.000000000 +0200
11326@@ -226,7 +226,7 @@
11327 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
11328 }
11329
11330-
11331+void __blk_mq_complete_request_remote_work(struct work_struct *work);
11332 int blk_mq_request_started(struct request *rq);
11333 void blk_mq_start_request(struct request *rq);
11334 void blk_mq_end_request(struct request *rq, blk_status_t error);
11335diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/bottom_half.h linux-4.14/include/linux/bottom_half.h
11336--- linux-4.14.orig/include/linux/bottom_half.h 2017-11-12 19:46:13.000000000 +0100
11337+++ linux-4.14/include/linux/bottom_half.h 2018-09-05 11:05:07.000000000 +0200
11338@@ -4,6 +4,39 @@
11339
11340 #include <linux/preempt.h>
11341
11342+#ifdef CONFIG_PREEMPT_RT_FULL
11343+
11344+extern void __local_bh_disable(void);
11345+extern void _local_bh_enable(void);
11346+extern void __local_bh_enable(void);
11347+
11348+static inline void local_bh_disable(void)
11349+{
11350+ __local_bh_disable();
11351+}
11352+
11353+static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11354+{
11355+ __local_bh_disable();
11356+}
11357+
11358+static inline void local_bh_enable(void)
11359+{
11360+ __local_bh_enable();
11361+}
11362+
11363+static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11364+{
11365+ __local_bh_enable();
11366+}
11367+
11368+static inline void local_bh_enable_ip(unsigned long ip)
11369+{
11370+ __local_bh_enable();
11371+}
11372+
11373+#else
11374+
11375 #ifdef CONFIG_TRACE_IRQFLAGS
11376 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11377 #else
11378@@ -31,5 +64,6 @@
11379 {
11380 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11381 }
11382+#endif
11383
11384 #endif /* _LINUX_BH_H */
11385diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/buffer_head.h linux-4.14/include/linux/buffer_head.h
11386--- linux-4.14.orig/include/linux/buffer_head.h 2017-11-12 19:46:13.000000000 +0100
11387+++ linux-4.14/include/linux/buffer_head.h 2018-09-05 11:05:07.000000000 +0200
11388@@ -76,8 +76,50 @@
11389 struct address_space *b_assoc_map; /* mapping this buffer is
11390 associated with */
11391 atomic_t b_count; /* users using this buffer_head */
11392+#ifdef CONFIG_PREEMPT_RT_BASE
11393+ spinlock_t b_uptodate_lock;
11394+#if IS_ENABLED(CONFIG_JBD2)
11395+ spinlock_t b_state_lock;
11396+ spinlock_t b_journal_head_lock;
11397+#endif
11398+#endif
11399 };
11400
11401+static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11402+{
11403+ unsigned long flags;
11404+
11405+#ifndef CONFIG_PREEMPT_RT_BASE
11406+ local_irq_save(flags);
11407+ bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11408+#else
11409+ spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11410+#endif
11411+ return flags;
11412+}
11413+
11414+static inline void
11415+bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11416+{
11417+#ifndef CONFIG_PREEMPT_RT_BASE
11418+ bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11419+ local_irq_restore(flags);
11420+#else
11421+ spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11422+#endif
11423+}
11424+
11425+static inline void buffer_head_init_locks(struct buffer_head *bh)
11426+{
11427+#ifdef CONFIG_PREEMPT_RT_BASE
11428+ spin_lock_init(&bh->b_uptodate_lock);
11429+#if IS_ENABLED(CONFIG_JBD2)
11430+ spin_lock_init(&bh->b_state_lock);
11431+ spin_lock_init(&bh->b_journal_head_lock);
11432+#endif
11433+#endif
11434+}
11435+
11436 /*
11437 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11438 * and buffer_foo() functions.
11439diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cgroup-defs.h linux-4.14/include/linux/cgroup-defs.h
11440--- linux-4.14.orig/include/linux/cgroup-defs.h 2018-09-05 11:03:22.000000000 +0200
11441+++ linux-4.14/include/linux/cgroup-defs.h 2018-09-05 11:05:07.000000000 +0200
11442@@ -19,6 +19,7 @@
11443 #include <linux/percpu-rwsem.h>
11444 #include <linux/workqueue.h>
11445 #include <linux/bpf-cgroup.h>
11446+#include <linux/swork.h>
11447
11448 #ifdef CONFIG_CGROUPS
11449
11450@@ -152,6 +153,7 @@
11451 /* percpu_ref killing and RCU release */
11452 struct rcu_head rcu_head;
11453 struct work_struct destroy_work;
11454+ struct swork_event destroy_swork;
11455
11456 /*
11457 * PI: the parent css. Placed here for cache proximity to following
11458diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/completion.h linux-4.14/include/linux/completion.h
11459--- linux-4.14.orig/include/linux/completion.h 2017-11-12 19:46:13.000000000 +0100
11460+++ linux-4.14/include/linux/completion.h 2018-09-05 11:05:07.000000000 +0200
11461@@ -9,7 +9,7 @@
11462 * See kernel/sched/completion.c for details.
11463 */
11464
11465-#include <linux/wait.h>
11466+#include <linux/swait.h>
11467 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11468 #include <linux/lockdep.h>
11469 #endif
11470@@ -28,7 +28,7 @@
11471 */
11472 struct completion {
11473 unsigned int done;
11474- wait_queue_head_t wait;
11475+ struct swait_queue_head wait;
11476 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11477 struct lockdep_map_cross map;
11478 #endif
11479@@ -67,11 +67,11 @@
11480
11481 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11482 #define COMPLETION_INITIALIZER(work) \
11483- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11484+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11485 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
11486 #else
11487 #define COMPLETION_INITIALIZER(work) \
11488- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11489+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11490 #endif
11491
11492 #define COMPLETION_INITIALIZER_ONSTACK(work) \
11493@@ -117,7 +117,7 @@
11494 static inline void __init_completion(struct completion *x)
11495 {
11496 x->done = 0;
11497- init_waitqueue_head(&x->wait);
11498+ init_swait_queue_head(&x->wait);
11499 }
11500
11501 /**
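The completion.h changes only swap the internal waitqueue for a simple waitqueue (swait); the caller-facing API is unchanged, as in this sketch (names are illustrative):

#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);

static int example_waiter(void *unused)
{
        wait_for_completion(&example_done);     /* now blocks on a simple waitqueue */
        return 0;
}

static void example_signal(void)
{
        complete(&example_done);                /* wakes one waiter */
}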
11502diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cpu.h linux-4.14/include/linux/cpu.h
11503--- linux-4.14.orig/include/linux/cpu.h 2018-09-05 11:03:22.000000000 +0200
11504+++ linux-4.14/include/linux/cpu.h 2018-09-05 11:05:07.000000000 +0200
11505@@ -120,6 +120,8 @@
11506 extern void cpu_hotplug_enable(void);
11507 void clear_tasks_mm_cpumask(int cpu);
11508 int cpu_down(unsigned int cpu);
11509+extern void pin_current_cpu(void);
11510+extern void unpin_current_cpu(void);
11511
11512 #else /* CONFIG_HOTPLUG_CPU */
11513
11514@@ -130,6 +132,9 @@
11515 static inline void lockdep_assert_cpus_held(void) { }
11516 static inline void cpu_hotplug_disable(void) { }
11517 static inline void cpu_hotplug_enable(void) { }
11518+static inline void pin_current_cpu(void) { }
11519+static inline void unpin_current_cpu(void) { }
11520+
11521 #endif /* !CONFIG_HOTPLUG_CPU */
11522
11523 /* Wrappers which go away once all code is converted */
11524diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/dcache.h linux-4.14/include/linux/dcache.h
11525--- linux-4.14.orig/include/linux/dcache.h 2018-09-05 11:03:22.000000000 +0200
11526+++ linux-4.14/include/linux/dcache.h 2018-09-05 11:05:07.000000000 +0200
11527@@ -107,7 +107,7 @@
11528
11529 union {
11530 struct list_head d_lru; /* LRU list */
11531- wait_queue_head_t *d_wait; /* in-lookup ones only */
11532+ struct swait_queue_head *d_wait; /* in-lookup ones only */
11533 };
11534 struct list_head d_child; /* child of parent list */
11535 struct list_head d_subdirs; /* our children */
11536@@ -238,7 +238,7 @@
11537 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
11538 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
11539 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
11540- wait_queue_head_t *);
11541+ struct swait_queue_head *);
11542 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
11543 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
11544 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
11545diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/delay.h linux-4.14/include/linux/delay.h
11546--- linux-4.14.orig/include/linux/delay.h 2017-11-12 19:46:13.000000000 +0100
11547+++ linux-4.14/include/linux/delay.h 2018-09-05 11:05:07.000000000 +0200
11548@@ -64,4 +64,10 @@
11549 msleep(seconds * 1000);
11550 }
11551
11552+#ifdef CONFIG_PREEMPT_RT_FULL
11553+extern void cpu_chill(void);
11554+#else
11555+# define cpu_chill() cpu_relax()
11556+#endif
11557+
11558 #endif /* defined(_LINUX_DELAY_H) */
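cpu_chill() above gives RT a way to back off without busy-spinning; on !RT it degrades to cpu_relax(). A hedged sketch of the intended call-site pattern, where example_try_grab() is a hypothetical stand-in for a real trylock or state test:

#include <linux/delay.h>

static bool example_try_grab(void)
{
        return true;    /* stand-in for a real trylock/test */
}

static void example_wait_for_resource(void)
{
        while (!example_try_grab())
                cpu_chill();    /* brief sleep on RT, cpu_relax() otherwise */
}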
11559diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/fs.h linux-4.14/include/linux/fs.h
11560--- linux-4.14.orig/include/linux/fs.h 2018-09-05 11:03:29.000000000 +0200
11561+++ linux-4.14/include/linux/fs.h 2018-09-05 11:05:07.000000000 +0200
11562@@ -655,7 +655,7 @@
11563 struct block_device *i_bdev;
11564 struct cdev *i_cdev;
11565 char *i_link;
11566- unsigned i_dir_seq;
11567+ unsigned __i_dir_seq;
11568 };
11569
11570 __u32 i_generation;
11571diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/highmem.h linux-4.14/include/linux/highmem.h
11572--- linux-4.14.orig/include/linux/highmem.h 2017-11-12 19:46:13.000000000 +0100
11573+++ linux-4.14/include/linux/highmem.h 2018-09-05 11:05:07.000000000 +0200
11574@@ -8,6 +8,7 @@
11575 #include <linux/mm.h>
11576 #include <linux/uaccess.h>
11577 #include <linux/hardirq.h>
11578+#include <linux/sched.h>
11579
11580 #include <asm/cacheflush.h>
11581
11582@@ -66,7 +67,7 @@
11583
11584 static inline void *kmap_atomic(struct page *page)
11585 {
11586- preempt_disable();
11587+ preempt_disable_nort();
11588 pagefault_disable();
11589 return page_address(page);
11590 }
11591@@ -75,7 +76,7 @@
11592 static inline void __kunmap_atomic(void *addr)
11593 {
11594 pagefault_enable();
11595- preempt_enable();
11596+ preempt_enable_nort();
11597 }
11598
11599 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
11600@@ -87,32 +88,51 @@
11601
11602 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
11603
11604+#ifndef CONFIG_PREEMPT_RT_FULL
11605 DECLARE_PER_CPU(int, __kmap_atomic_idx);
11606+#endif
11607
11608 static inline int kmap_atomic_idx_push(void)
11609 {
11610+#ifndef CONFIG_PREEMPT_RT_FULL
11611 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
11612
11613-#ifdef CONFIG_DEBUG_HIGHMEM
11614+# ifdef CONFIG_DEBUG_HIGHMEM
11615 WARN_ON_ONCE(in_irq() && !irqs_disabled());
11616 BUG_ON(idx >= KM_TYPE_NR);
11617-#endif
11618+# endif
11619 return idx;
11620+#else
11621+ current->kmap_idx++;
11622+ BUG_ON(current->kmap_idx > KM_TYPE_NR);
11623+ return current->kmap_idx - 1;
11624+#endif
11625 }
11626
11627 static inline int kmap_atomic_idx(void)
11628 {
11629+#ifndef CONFIG_PREEMPT_RT_FULL
11630 return __this_cpu_read(__kmap_atomic_idx) - 1;
11631+#else
11632+ return current->kmap_idx - 1;
11633+#endif
11634 }
11635
11636 static inline void kmap_atomic_idx_pop(void)
11637 {
11638-#ifdef CONFIG_DEBUG_HIGHMEM
11639+#ifndef CONFIG_PREEMPT_RT_FULL
11640+# ifdef CONFIG_DEBUG_HIGHMEM
11641 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
11642
11643 BUG_ON(idx < 0);
11644-#else
11645+# else
11646 __this_cpu_dec(__kmap_atomic_idx);
11647+# endif
11648+#else
11649+ current->kmap_idx--;
11650+# ifdef CONFIG_DEBUG_HIGHMEM
11651+ BUG_ON(current->kmap_idx < 0);
11652+# endif
11653 #endif
11654 }
11655
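With the highmem hunks above, kmap_atomic() sections stay preemptible on RT and the atomic-kmap stack index moves from per-CPU to per-task; the caller-side pattern does not change. A small sketch:

#include <linux/highmem.h>
#include <linux/string.h>

static void example_zero_page(struct page *page)
{
        void *addr = kmap_atomic(page);

        memset(addr, 0, PAGE_SIZE);
        kunmap_atomic(addr);
}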
11656diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/hrtimer.h linux-4.14/include/linux/hrtimer.h
11657--- linux-4.14.orig/include/linux/hrtimer.h 2017-11-12 19:46:13.000000000 +0100
11658+++ linux-4.14/include/linux/hrtimer.h 2018-09-05 11:05:07.000000000 +0200
11659@@ -22,19 +22,42 @@
11660 #include <linux/percpu.h>
11661 #include <linux/timer.h>
11662 #include <linux/timerqueue.h>
11663+#include <linux/wait.h>
11664
11665 struct hrtimer_clock_base;
11666 struct hrtimer_cpu_base;
11667
11668 /*
11669 * Mode arguments of xxx_hrtimer functions:
11670+ *
11671+ * HRTIMER_MODE_ABS - Time value is absolute
11672+ * HRTIMER_MODE_REL - Time value is relative to now
11673+ * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
11674+ * when starting the timer)
11675+ * HRTIMER_MODE_SOFT - Timer callback function will be executed in
11676+ * soft irq context
11677 */
11678 enum hrtimer_mode {
11679- HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */
11680- HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */
11681- HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
11682- HRTIMER_MODE_ABS_PINNED = 0x02,
11683- HRTIMER_MODE_REL_PINNED = 0x03,
11684+ HRTIMER_MODE_ABS = 0x00,
11685+ HRTIMER_MODE_REL = 0x01,
11686+ HRTIMER_MODE_PINNED = 0x02,
11687+ HRTIMER_MODE_SOFT = 0x04,
11688+ HRTIMER_MODE_HARD = 0x08,
11689+
11690+ HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
11691+ HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
11692+
11693+ HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
11694+ HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
11695+
11696+ HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
11697+ HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
11698+
11699+ HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
11700+ HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
11701+
11702+ HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
11703+ HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
11704 };
11705
11706 /*
11707@@ -87,6 +110,7 @@
11708 * @base: pointer to the timer base (per cpu and per clock)
11709 * @state: state information (See bit values above)
11710 * @is_rel: Set if the timer was armed relative
11711+ * @is_soft: Set if hrtimer will be expired in soft interrupt context.
11712 *
11713 * The hrtimer structure must be initialized by hrtimer_init()
11714 */
11715@@ -97,6 +121,7 @@
11716 struct hrtimer_clock_base *base;
11717 u8 state;
11718 u8 is_rel;
11719+ u8 is_soft;
11720 };
11721
11722 /**
11723@@ -112,9 +137,9 @@
11724 };
11725
11726 #ifdef CONFIG_64BIT
11727-# define HRTIMER_CLOCK_BASE_ALIGN 64
11728+# define __hrtimer_clock_base_align ____cacheline_aligned
11729 #else
11730-# define HRTIMER_CLOCK_BASE_ALIGN 32
11731+# define __hrtimer_clock_base_align
11732 #endif
11733
11734 /**
11735@@ -123,48 +148,57 @@
11736 * @index: clock type index for per_cpu support when moving a
11737 * timer to a base on another cpu.
11738 * @clockid: clock id for per_cpu support
11739+ * @seq: seqcount around __run_hrtimer
11740+ * @running: pointer to the currently running hrtimer
11741 * @active: red black tree root node for the active timers
11742 * @get_time: function to retrieve the current time of the clock
11743 * @offset: offset of this clock to the monotonic base
11744 */
11745 struct hrtimer_clock_base {
11746 struct hrtimer_cpu_base *cpu_base;
11747- int index;
11748+ unsigned int index;
11749 clockid_t clockid;
11750+ seqcount_t seq;
11751+ struct hrtimer *running;
11752 struct timerqueue_head active;
11753 ktime_t (*get_time)(void);
11754 ktime_t offset;
11755-} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
11756+} __hrtimer_clock_base_align;
11757
11758 enum hrtimer_base_type {
11759 HRTIMER_BASE_MONOTONIC,
11760 HRTIMER_BASE_REALTIME,
11761 HRTIMER_BASE_BOOTTIME,
11762 HRTIMER_BASE_TAI,
11763+ HRTIMER_BASE_MONOTONIC_SOFT,
11764+ HRTIMER_BASE_REALTIME_SOFT,
11765+ HRTIMER_BASE_BOOTTIME_SOFT,
11766+ HRTIMER_BASE_TAI_SOFT,
11767 HRTIMER_MAX_CLOCK_BASES,
11768 };
11769
11770-/*
11771+/**
11772 * struct hrtimer_cpu_base - the per cpu clock bases
11773 * @lock: lock protecting the base and associated clock bases
11774 * and timers
11775- * @seq: seqcount around __run_hrtimer
11776- * @running: pointer to the currently running hrtimer
11777 * @cpu: cpu number
11778 * @active_bases: Bitfield to mark bases with active timers
11779 * @clock_was_set_seq: Sequence counter of clock was set events
11780- * @migration_enabled: The migration of hrtimers to other cpus is enabled
11781- * @nohz_active: The nohz functionality is enabled
11782- * @expires_next: absolute time of the next event which was scheduled
11783- * via clock_set_next_event()
11784- * @next_timer: Pointer to the first expiring timer
11785- * @in_hrtirq: hrtimer_interrupt() is currently executing
11786 * @hres_active: State of high resolution mode
11787+ * @in_hrtirq: hrtimer_interrupt() is currently executing
11788 * @hang_detected: The last hrtimer interrupt detected a hang
11789+ * @softirq_activated: displays, if the softirq is raised - update of softirq
11790+ * related settings is not required then.
11791 * @nr_events: Total number of hrtimer interrupt events
11792 * @nr_retries: Total number of hrtimer interrupt retries
11793 * @nr_hangs: Total number of hrtimer interrupt hangs
11794 * @max_hang_time: Maximum time spent in hrtimer_interrupt
11795+ * @expires_next: absolute time of the next event, is required for remote
11796+ * hrtimer enqueue; it is the total first expiry time (hard
11797+ * and soft hrtimer are taken into account)
11798+ * @next_timer: Pointer to the first expiring timer
11799+ * @softirq_expires_next: Time to check, if soft queues needs also to be expired
11800+ * @softirq_next_timer: Pointer to the first expiring softirq based timer
11801 * @clock_base: array of clock bases for this cpu
11802 *
11803 * Note: next_timer is just an optimization for __remove_hrtimer().
11804@@ -173,31 +207,31 @@
11805 */
11806 struct hrtimer_cpu_base {
11807 raw_spinlock_t lock;
11808- seqcount_t seq;
11809- struct hrtimer *running;
11810 unsigned int cpu;
11811 unsigned int active_bases;
11812 unsigned int clock_was_set_seq;
11813- bool migration_enabled;
11814- bool nohz_active;
11815+ unsigned int hres_active : 1,
11816+ in_hrtirq : 1,
11817+ hang_detected : 1,
11818+ softirq_activated : 1;
11819 #ifdef CONFIG_HIGH_RES_TIMERS
11820- unsigned int in_hrtirq : 1,
11821- hres_active : 1,
11822- hang_detected : 1;
11823- ktime_t expires_next;
11824- struct hrtimer *next_timer;
11825 unsigned int nr_events;
11826- unsigned int nr_retries;
11827- unsigned int nr_hangs;
11828+ unsigned short nr_retries;
11829+ unsigned short nr_hangs;
11830 unsigned int max_hang_time;
11831 #endif
11832+ ktime_t expires_next;
11833+ struct hrtimer *next_timer;
11834+ ktime_t softirq_expires_next;
11835+#ifdef CONFIG_PREEMPT_RT_BASE
11836+ wait_queue_head_t wait;
11837+#endif
11838+ struct hrtimer *softirq_next_timer;
11839 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
11840 } ____cacheline_aligned;
11841
11842 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
11843 {
11844- BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
11845-
11846 timer->node.expires = time;
11847 timer->_softexpires = time;
11848 }
11849@@ -266,16 +300,17 @@
11850 return timer->base->get_time();
11851 }
11852
11853+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11854+{
11855+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
11856+ timer->base->cpu_base->hres_active : 0;
11857+}
11858+
11859 #ifdef CONFIG_HIGH_RES_TIMERS
11860 struct clock_event_device;
11861
11862 extern void hrtimer_interrupt(struct clock_event_device *dev);
11863
11864-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11865-{
11866- return timer->base->cpu_base->hres_active;
11867-}
11868-
11869 /*
11870 * The resolution of the clocks. The resolution value is returned in
11871 * the clock_getres() system call to give application programmers an
11872@@ -298,11 +333,6 @@
11873
11874 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC
11875
11876-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11877-{
11878- return 0;
11879-}
11880-
11881 static inline void clock_was_set_delayed(void) { }
11882
11883 #endif
11884@@ -344,10 +374,17 @@
11885 /* Initialize timers: */
11886 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
11887 enum hrtimer_mode mode);
11888+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
11889+ enum hrtimer_mode mode,
11890+ struct task_struct *task);
11891
11892 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
11893 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
11894 enum hrtimer_mode mode);
11895+extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11896+ clockid_t clock_id,
11897+ enum hrtimer_mode mode,
11898+ struct task_struct *task);
11899
11900 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
11901 #else
11902@@ -357,6 +394,15 @@
11903 {
11904 hrtimer_init(timer, which_clock, mode);
11905 }
11906+
11907+static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11908+ clockid_t clock_id,
11909+ enum hrtimer_mode mode,
11910+ struct task_struct *task)
11911+{
11912+ hrtimer_init_sleeper(sl, clock_id, mode, task);
11913+}
11914+
11915 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
11916 #endif
11917
11918@@ -365,11 +411,12 @@
11919 u64 range_ns, const enum hrtimer_mode mode);
11920
11921 /**
11922- * hrtimer_start - (re)start an hrtimer on the current CPU
11923+ * hrtimer_start - (re)start an hrtimer
11924 * @timer: the timer to be added
11925 * @tim: expiry time
11926- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
11927- * relative (HRTIMER_MODE_REL)
11928+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
11929+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
11930+ * softirq based mode is considered for debug purpose only!
11931 */
11932 static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
11933 const enum hrtimer_mode mode)
11934@@ -396,6 +443,13 @@
11935 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
11936 }
11937
11938+/* Softirq preemption could deadlock timer removal */
11939+#ifdef CONFIG_PREEMPT_RT_BASE
11940+ extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
11941+#else
11942+# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
11943+#endif
11944+
11945 /* Query timers: */
11946 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
11947
11948@@ -420,9 +474,9 @@
11949 * Helper function to check, whether the timer is running the callback
11950 * function
11951 */
11952-static inline int hrtimer_callback_running(struct hrtimer *timer)
11953+static inline int hrtimer_callback_running(const struct hrtimer *timer)
11954 {
11955- return timer->base->cpu_base->running == timer;
11956+ return timer->base->running == timer;
11957 }
11958
11959 /* Forward a hrtimer so it expires after now: */
11960@@ -458,15 +512,12 @@
11961 const enum hrtimer_mode mode,
11962 const clockid_t clockid);
11963
11964-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
11965- struct task_struct *tsk);
11966-
11967 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
11968 const enum hrtimer_mode mode);
11969 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
11970 u64 delta,
11971 const enum hrtimer_mode mode,
11972- int clock);
11973+ clockid_t clock_id);
11974 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
11975
11976 /* Soft interrupt function to run the hrtimer queues: */
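With the hrtimer mode values turned into individual bits above, callers can combine ABS/REL with PINNED, SOFT or HARD. A sketch of arming a relative, CPU-pinned timer (the timer and callback names are illustrative):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
        /* runs in hard irq context unless a _SOFT mode was requested */
        return HRTIMER_NORESTART;
}

static void example_arm_timer(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        example_timer.function = example_timer_fn;
        hrtimer_start(&example_timer, ms_to_ktime(10), HRTIMER_MODE_REL_PINNED);
}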
11977diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/idr.h linux-4.14/include/linux/idr.h
11978--- linux-4.14.orig/include/linux/idr.h 2017-11-12 19:46:13.000000000 +0100
11979+++ linux-4.14/include/linux/idr.h 2018-09-05 11:05:07.000000000 +0200
11980@@ -167,10 +167,7 @@
11981 * Each idr_preload() should be matched with an invocation of this
11982 * function. See idr_preload() for details.
11983 */
11984-static inline void idr_preload_end(void)
11985-{
11986- preempt_enable();
11987-}
11988+void idr_preload_end(void);
11989
11990 /**
11991 * idr_find - return pointer for given id
11992diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/init_task.h linux-4.14/include/linux/init_task.h
11993--- linux-4.14.orig/include/linux/init_task.h 2017-11-12 19:46:13.000000000 +0100
11994+++ linux-4.14/include/linux/init_task.h 2018-09-05 11:05:07.000000000 +0200
11995@@ -163,6 +163,12 @@
11996 # define INIT_PERF_EVENTS(tsk)
11997 #endif
11998
11999+#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
12000+# define INIT_TIMER_LIST .posix_timer_list = NULL,
12001+#else
12002+# define INIT_TIMER_LIST
12003+#endif
12004+
12005 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12006 # define INIT_VTIME(tsk) \
12007 .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
12008@@ -234,7 +240,8 @@
12009 .static_prio = MAX_PRIO-20, \
12010 .normal_prio = MAX_PRIO-20, \
12011 .policy = SCHED_NORMAL, \
12012- .cpus_allowed = CPU_MASK_ALL, \
12013+ .cpus_ptr = &tsk.cpus_mask, \
12014+ .cpus_mask = CPU_MASK_ALL, \
12015 .nr_cpus_allowed= NR_CPUS, \
12016 .mm = NULL, \
12017 .active_mm = &init_mm, \
12018@@ -276,6 +283,7 @@
12019 INIT_CPU_TIMERS(tsk) \
12020 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
12021 .timer_slack_ns = 50000, /* 50 usec default slack */ \
12022+ INIT_TIMER_LIST \
12023 .pids = { \
12024 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
12025 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
12026diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/interrupt.h linux-4.14/include/linux/interrupt.h
12027--- linux-4.14.orig/include/linux/interrupt.h 2018-09-05 11:03:22.000000000 +0200
12028+++ linux-4.14/include/linux/interrupt.h 2018-09-05 11:05:07.000000000 +0200
12029@@ -15,6 +15,7 @@
12030 #include <linux/hrtimer.h>
12031 #include <linux/kref.h>
12032 #include <linux/workqueue.h>
12033+#include <linux/swork.h>
12034
12035 #include <linux/atomic.h>
12036 #include <asm/ptrace.h>
12037@@ -63,6 +64,7 @@
12038 * interrupt handler after suspending interrupts. For system
12039 * wakeup devices users need to implement wakeup detection in
12040 * their interrupt handlers.
12041+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12042 */
12043 #define IRQF_SHARED 0x00000080
12044 #define IRQF_PROBE_SHARED 0x00000100
12045@@ -76,6 +78,7 @@
12046 #define IRQF_NO_THREAD 0x00010000
12047 #define IRQF_EARLY_RESUME 0x00020000
12048 #define IRQF_COND_SUSPEND 0x00040000
12049+#define IRQF_NO_SOFTIRQ_CALL 0x00080000
12050
12051 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12052
12053@@ -207,7 +210,7 @@
12054 #ifdef CONFIG_LOCKDEP
12055 # define local_irq_enable_in_hardirq() do { } while (0)
12056 #else
12057-# define local_irq_enable_in_hardirq() local_irq_enable()
12058+# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12059 #endif
12060
12061 extern void disable_irq_nosync(unsigned int irq);
12062@@ -227,6 +230,7 @@
12063 * struct irq_affinity_notify - context for notification of IRQ affinity changes
12064 * @irq: Interrupt to which notification applies
12065 * @kref: Reference count, for internal use
12066+ * @swork: Swork item, for internal use
12067 * @work: Work item, for internal use
12068 * @notify: Function to be called on change. This will be
12069 * called in process context.
12070@@ -238,7 +242,11 @@
12071 struct irq_affinity_notify {
12072 unsigned int irq;
12073 struct kref kref;
12074+#ifdef CONFIG_PREEMPT_RT_BASE
12075+ struct swork_event swork;
12076+#else
12077 struct work_struct work;
12078+#endif
12079 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12080 void (*release)(struct kref *ref);
12081 };
12082@@ -429,9 +437,13 @@
12083 bool state);
12084
12085 #ifdef CONFIG_IRQ_FORCED_THREADING
12086+# ifndef CONFIG_PREEMPT_RT_BASE
12087 extern bool force_irqthreads;
12088+# else
12089+# define force_irqthreads (true)
12090+# endif
12091 #else
12092-#define force_irqthreads (0)
12093+#define force_irqthreads (false)
12094 #endif
12095
12096 #ifndef __ARCH_SET_SOFTIRQ_PENDING
12097@@ -488,9 +500,10 @@
12098 void (*action)(struct softirq_action *);
12099 };
12100
12101+#ifndef CONFIG_PREEMPT_RT_FULL
12102 asmlinkage void do_softirq(void);
12103 asmlinkage void __do_softirq(void);
12104-
12105+static inline void thread_do_softirq(void) { do_softirq(); }
12106 #ifdef __ARCH_HAS_DO_SOFTIRQ
12107 void do_softirq_own_stack(void);
12108 #else
12109@@ -499,13 +512,25 @@
12110 __do_softirq();
12111 }
12112 #endif
12113+#else
12114+extern void thread_do_softirq(void);
12115+#endif
12116
12117 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12118 extern void softirq_init(void);
12119 extern void __raise_softirq_irqoff(unsigned int nr);
12120+#ifdef CONFIG_PREEMPT_RT_FULL
12121+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12122+#else
12123+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12124+{
12125+ __raise_softirq_irqoff(nr);
12126+}
12127+#endif
12128
12129 extern void raise_softirq_irqoff(unsigned int nr);
12130 extern void raise_softirq(unsigned int nr);
12131+extern void softirq_check_pending_idle(void);
12132
12133 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12134
12135@@ -527,8 +552,9 @@
12136 to be executed on some cpu at least once after this.
12137 * If the tasklet is already scheduled, but its execution is still not
12138 started, it will be executed only once.
12139- * If this tasklet is already running on another CPU (or schedule is called
12140- from tasklet itself), it is rescheduled for later.
12141+ * If this tasklet is already running on another CPU, it is rescheduled
12142+ for later.
12143+ * Schedule must not be called from the tasklet itself (a lockup occurs)
12144 * Tasklet is strictly serialized wrt itself, but not
12145 wrt another tasklets. If client needs some intertask synchronization,
12146 he makes it with spinlocks.
12147@@ -553,27 +579,36 @@
12148 enum
12149 {
12150 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
12151- TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
12152+ TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
12153+ TASKLET_STATE_PENDING /* Tasklet is pending */
12154 };
12155
12156-#ifdef CONFIG_SMP
12157+#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
12158+#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
12159+#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12160+
12161+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12162 static inline int tasklet_trylock(struct tasklet_struct *t)
12163 {
12164 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12165 }
12166
12167+static inline int tasklet_tryunlock(struct tasklet_struct *t)
12168+{
12169+ return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12170+}
12171+
12172 static inline void tasklet_unlock(struct tasklet_struct *t)
12173 {
12174 smp_mb__before_atomic();
12175 clear_bit(TASKLET_STATE_RUN, &(t)->state);
12176 }
12177
12178-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12179-{
12180- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12181-}
12182+extern void tasklet_unlock_wait(struct tasklet_struct *t);
12183+
12184 #else
12185 #define tasklet_trylock(t) 1
12186+#define tasklet_tryunlock(t) 1
12187 #define tasklet_unlock_wait(t) do { } while (0)
12188 #define tasklet_unlock(t) do { } while (0)
12189 #endif
12190@@ -607,41 +642,17 @@
12191 smp_mb();
12192 }
12193
12194-static inline void tasklet_enable(struct tasklet_struct *t)
12195-{
12196- smp_mb__before_atomic();
12197- atomic_dec(&t->count);
12198-}
12199-
12200+extern void tasklet_enable(struct tasklet_struct *t);
12201 extern void tasklet_kill(struct tasklet_struct *t);
12202 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12203 extern void tasklet_init(struct tasklet_struct *t,
12204 void (*func)(unsigned long), unsigned long data);
12205
12206-struct tasklet_hrtimer {
12207- struct hrtimer timer;
12208- struct tasklet_struct tasklet;
12209- enum hrtimer_restart (*function)(struct hrtimer *);
12210-};
12211-
12212-extern void
12213-tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
12214- enum hrtimer_restart (*function)(struct hrtimer *),
12215- clockid_t which_clock, enum hrtimer_mode mode);
12216-
12217-static inline
12218-void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
12219- const enum hrtimer_mode mode)
12220-{
12221- hrtimer_start(&ttimer->timer, time, mode);
12222-}
12223-
12224-static inline
12225-void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12226-{
12227- hrtimer_cancel(&ttimer->timer);
12228- tasklet_kill(&ttimer->tasklet);
12229-}
12230+#ifdef CONFIG_PREEMPT_RT_FULL
12231+extern void softirq_early_init(void);
12232+#else
12233+static inline void softirq_early_init(void) { }
12234+#endif
12235
12236 /*
12237 * Autoprobing for irqs:
12238diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqdesc.h linux-4.14/include/linux/irqdesc.h
12239--- linux-4.14.orig/include/linux/irqdesc.h 2017-11-12 19:46:13.000000000 +0100
12240+++ linux-4.14/include/linux/irqdesc.h 2018-09-05 11:05:07.000000000 +0200
12241@@ -70,6 +70,7 @@
12242 unsigned int irqs_unhandled;
12243 atomic_t threads_handled;
12244 int threads_handled_last;
12245+ u64 random_ip;
12246 raw_spinlock_t lock;
12247 struct cpumask *percpu_enabled;
12248 const struct cpumask *percpu_affinity;
12249diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqflags.h linux-4.14/include/linux/irqflags.h
12250--- linux-4.14.orig/include/linux/irqflags.h 2017-11-12 19:46:13.000000000 +0100
12251+++ linux-4.14/include/linux/irqflags.h 2018-09-05 11:05:07.000000000 +0200
12252@@ -34,16 +34,6 @@
12253 current->hardirq_context--; \
12254 crossrelease_hist_end(XHLOCK_HARD); \
12255 } while (0)
12256-# define lockdep_softirq_enter() \
12257-do { \
12258- current->softirq_context++; \
12259- crossrelease_hist_start(XHLOCK_SOFT); \
12260-} while (0)
12261-# define lockdep_softirq_exit() \
12262-do { \
12263- current->softirq_context--; \
12264- crossrelease_hist_end(XHLOCK_SOFT); \
12265-} while (0)
12266 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
12267 #else
12268 # define trace_hardirqs_on() do { } while (0)
12269@@ -56,9 +46,23 @@
12270 # define trace_softirqs_enabled(p) 0
12271 # define trace_hardirq_enter() do { } while (0)
12272 # define trace_hardirq_exit() do { } while (0)
12273+# define INIT_TRACE_IRQFLAGS
12274+#endif
12275+
12276+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12277+# define lockdep_softirq_enter() \
12278+do { \
12279+ current->softirq_context++; \
12280+ crossrelease_hist_start(XHLOCK_SOFT); \
12281+} while (0)
12282+# define lockdep_softirq_exit() \
12283+do { \
12284+ current->softirq_context--; \
12285+ crossrelease_hist_end(XHLOCK_SOFT); \
12286+} while (0)
12287+#else
12288 # define lockdep_softirq_enter() do { } while (0)
12289 # define lockdep_softirq_exit() do { } while (0)
12290-# define INIT_TRACE_IRQFLAGS
12291 #endif
12292
12293 #if defined(CONFIG_IRQSOFF_TRACER) || \
12294@@ -165,4 +169,23 @@
12295
12296 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12297
12298+/*
12299+ * local_irq* variants depending on RT/!RT
12300+ */
12301+#ifdef CONFIG_PREEMPT_RT_FULL
12302+# define local_irq_disable_nort() do { } while (0)
12303+# define local_irq_enable_nort() do { } while (0)
12304+# define local_irq_save_nort(flags) local_save_flags(flags)
12305+# define local_irq_restore_nort(flags) (void)(flags)
12306+# define local_irq_disable_rt() local_irq_disable()
12307+# define local_irq_enable_rt() local_irq_enable()
12308+#else
12309+# define local_irq_disable_nort() local_irq_disable()
12310+# define local_irq_enable_nort() local_irq_enable()
12311+# define local_irq_save_nort(flags) local_irq_save(flags)
12312+# define local_irq_restore_nort(flags) local_irq_restore(flags)
12313+# define local_irq_disable_rt() do { } while (0)
12314+# define local_irq_enable_rt() do { } while (0)
12315+#endif
12316+
12317 #endif
12318diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq.h linux-4.14/include/linux/irq.h
12319--- linux-4.14.orig/include/linux/irq.h 2018-09-05 11:03:22.000000000 +0200
12320+++ linux-4.14/include/linux/irq.h 2018-09-05 11:05:07.000000000 +0200
12321@@ -74,6 +74,7 @@
12322 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
12323 * it from the spurious interrupt detection
12324 * mechanism and from core side polling.
12325+ * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
12326 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
12327 */
12328 enum {
12329@@ -101,13 +102,14 @@
12330 IRQ_PER_CPU_DEVID = (1 << 17),
12331 IRQ_IS_POLLED = (1 << 18),
12332 IRQ_DISABLE_UNLAZY = (1 << 19),
12333+ IRQ_NO_SOFTIRQ_CALL = (1 << 20),
12334 };
12335
12336 #define IRQF_MODIFY_MASK \
12337 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12338 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12339 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12340- IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12341+ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12342
12343 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
12344
12345diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq_work.h linux-4.14/include/linux/irq_work.h
12346--- linux-4.14.orig/include/linux/irq_work.h 2017-11-12 19:46:13.000000000 +0100
12347+++ linux-4.14/include/linux/irq_work.h 2018-09-05 11:05:07.000000000 +0200
12348@@ -17,6 +17,7 @@
12349 #define IRQ_WORK_BUSY 2UL
12350 #define IRQ_WORK_FLAGS 3UL
12351 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
12352+#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
12353
12354 struct irq_work {
12355 unsigned long flags;
12356@@ -52,4 +53,10 @@
12357 static inline void irq_work_run(void) { }
12358 #endif
12359
12360+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12361+void irq_work_tick_soft(void);
12362+#else
12363+static inline void irq_work_tick_soft(void) { }
12364+#endif
12365+
12366 #endif /* _LINUX_IRQ_WORK_H */
12367diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/jbd2.h linux-4.14/include/linux/jbd2.h
12368--- linux-4.14.orig/include/linux/jbd2.h 2018-09-05 11:03:22.000000000 +0200
12369+++ linux-4.14/include/linux/jbd2.h 2018-09-05 11:05:07.000000000 +0200
12370@@ -347,32 +347,56 @@
12371
12372 static inline void jbd_lock_bh_state(struct buffer_head *bh)
12373 {
12374+#ifndef CONFIG_PREEMPT_RT_BASE
12375 bit_spin_lock(BH_State, &bh->b_state);
12376+#else
12377+ spin_lock(&bh->b_state_lock);
12378+#endif
12379 }
12380
12381 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12382 {
12383+#ifndef CONFIG_PREEMPT_RT_BASE
12384 return bit_spin_trylock(BH_State, &bh->b_state);
12385+#else
12386+ return spin_trylock(&bh->b_state_lock);
12387+#endif
12388 }
12389
12390 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12391 {
12392+#ifndef CONFIG_PREEMPT_RT_BASE
12393 return bit_spin_is_locked(BH_State, &bh->b_state);
12394+#else
12395+ return spin_is_locked(&bh->b_state_lock);
12396+#endif
12397 }
12398
12399 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12400 {
12401+#ifndef CONFIG_PREEMPT_RT_BASE
12402 bit_spin_unlock(BH_State, &bh->b_state);
12403+#else
12404+ spin_unlock(&bh->b_state_lock);
12405+#endif
12406 }
12407
12408 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12409 {
12410+#ifndef CONFIG_PREEMPT_RT_BASE
12411 bit_spin_lock(BH_JournalHead, &bh->b_state);
12412+#else
12413+ spin_lock(&bh->b_journal_head_lock);
12414+#endif
12415 }
12416
12417 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12418 {
12419+#ifndef CONFIG_PREEMPT_RT_BASE
12420 bit_spin_unlock(BH_JournalHead, &bh->b_state);
12421+#else
12422+ spin_unlock(&bh->b_journal_head_lock);
12423+#endif
12424 }
12425
12426 #define J_ASSERT(assert) BUG_ON(!(assert))
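The jbd2 helpers above keep their call pattern; only the lock implementation differs between RT (per-buffer spinlocks) and !RT (bit spinlocks in b_state). A minimal, illustrative caller sketch:

#include <linux/jbd2.h>

static void example_touch_jbd_state(struct buffer_head *bh)
{
        jbd_lock_bh_state(bh);          /* bit spinlock on !RT, b_state_lock on RT */
        /* ... read or update journalling state of bh ... */
        jbd_unlock_bh_state(bh);
}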
12427diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kdb.h linux-4.14/include/linux/kdb.h
12428--- linux-4.14.orig/include/linux/kdb.h 2017-11-12 19:46:13.000000000 +0100
12429+++ linux-4.14/include/linux/kdb.h 2018-09-05 11:05:07.000000000 +0200
12430@@ -167,6 +167,7 @@
12431 extern __printf(1, 2) int kdb_printf(const char *, ...);
12432 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12433
12434+#define in_kdb_printk() (kdb_trap_printk)
12435 extern void kdb_init(int level);
12436
12437 /* Access to kdb specific polling devices */
12438@@ -201,6 +202,7 @@
12439 extern int kdb_unregister(char *);
12440 #else /* ! CONFIG_KGDB_KDB */
12441 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12442+#define in_kdb_printk() (0)
12443 static inline void kdb_init(int level) {}
12444 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12445 char *help, short minlen) { return 0; }
12446diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kernel.h linux-4.14/include/linux/kernel.h
12447--- linux-4.14.orig/include/linux/kernel.h 2017-11-12 19:46:13.000000000 +0100
12448+++ linux-4.14/include/linux/kernel.h 2018-09-05 11:05:07.000000000 +0200
12449@@ -225,6 +225,9 @@
12450 */
12451 # define might_sleep() \
12452 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12453+
12454+# define might_sleep_no_state_check() \
12455+ do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12456 # define sched_annotate_sleep() (current->task_state_change = 0)
12457 #else
12458 static inline void ___might_sleep(const char *file, int line,
12459@@ -232,6 +235,7 @@
12460 static inline void __might_sleep(const char *file, int line,
12461 int preempt_offset) { }
12462 # define might_sleep() do { might_resched(); } while (0)
12463+# define might_sleep_no_state_check() do { might_resched(); } while (0)
12464 # define sched_annotate_sleep() do { } while (0)
12465 #endif
12466
12467@@ -531,6 +535,7 @@
12468 SYSTEM_HALT,
12469 SYSTEM_POWER_OFF,
12470 SYSTEM_RESTART,
12471+ SYSTEM_SUSPEND,
12472 } system_state;
12473
12474 #define TAINT_PROPRIETARY_MODULE 0
12475diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/list_bl.h linux-4.14/include/linux/list_bl.h
12476--- linux-4.14.orig/include/linux/list_bl.h 2017-11-12 19:46:13.000000000 +0100
12477+++ linux-4.14/include/linux/list_bl.h 2018-09-05 11:05:07.000000000 +0200
12478@@ -3,6 +3,7 @@
12479 #define _LINUX_LIST_BL_H
12480
12481 #include <linux/list.h>
12482+#include <linux/spinlock.h>
12483 #include <linux/bit_spinlock.h>
12484
12485 /*
12486@@ -33,13 +34,24 @@
12487
12488 struct hlist_bl_head {
12489 struct hlist_bl_node *first;
12490+#ifdef CONFIG_PREEMPT_RT_BASE
12491+ raw_spinlock_t lock;
12492+#endif
12493 };
12494
12495 struct hlist_bl_node {
12496 struct hlist_bl_node *next, **pprev;
12497 };
12498-#define INIT_HLIST_BL_HEAD(ptr) \
12499- ((ptr)->first = NULL)
12500+
12501+#ifdef CONFIG_PREEMPT_RT_BASE
12502+#define INIT_HLIST_BL_HEAD(h) \
12503+do { \
12504+ (h)->first = NULL; \
12505+ raw_spin_lock_init(&(h)->lock); \
12506+} while (0)
12507+#else
12508+#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12509+#endif
12510
12511 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12512 {
12513@@ -119,12 +131,26 @@
12514
12515 static inline void hlist_bl_lock(struct hlist_bl_head *b)
12516 {
12517+#ifndef CONFIG_PREEMPT_RT_BASE
12518 bit_spin_lock(0, (unsigned long *)b);
12519+#else
12520+ raw_spin_lock(&b->lock);
12521+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12522+ __set_bit(0, (unsigned long *)b);
12523+#endif
12524+#endif
12525 }
12526
12527 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12528 {
12529+#ifndef CONFIG_PREEMPT_RT_BASE
12530 __bit_spin_unlock(0, (unsigned long *)b);
12531+#else
12532+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12533+ __clear_bit(0, (unsigned long *)b);
12534+#endif
12535+ raw_spin_unlock(&b->lock);
12536+#endif
12537 }
12538
12539 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
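Callers keep using hlist_bl_lock()/hlist_bl_unlock(); on RT the hunk above backs them with a raw spinlock in the head instead of bit 0 of ->first. A sketch with an illustrative hash bucket:

#include <linux/list_bl.h>

static struct hlist_bl_head example_bucket;

static void example_bucket_init(void)
{
        INIT_HLIST_BL_HEAD(&example_bucket);    /* also initializes the RT raw lock */
}

static void example_bucket_add(struct hlist_bl_node *n)
{
        hlist_bl_lock(&example_bucket);
        hlist_bl_add_head(n, &example_bucket);
        hlist_bl_unlock(&example_bucket);
}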
12540diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/locallock.h linux-4.14/include/linux/locallock.h
12541--- linux-4.14.orig/include/linux/locallock.h 1970-01-01 01:00:00.000000000 +0100
12542+++ linux-4.14/include/linux/locallock.h 2018-09-05 11:05:07.000000000 +0200
12543@@ -0,0 +1,271 @@
12544+#ifndef _LINUX_LOCALLOCK_H
12545+#define _LINUX_LOCALLOCK_H
12546+
12547+#include <linux/percpu.h>
12548+#include <linux/spinlock.h>
12549+
12550+#ifdef CONFIG_PREEMPT_RT_BASE
12551+
12552+#ifdef CONFIG_DEBUG_SPINLOCK
12553+# define LL_WARN(cond) WARN_ON(cond)
12554+#else
12555+# define LL_WARN(cond) do { } while (0)
12556+#endif
12557+
12558+/*
12559+ * per cpu lock based substitute for local_irq_*()
12560+ */
12561+struct local_irq_lock {
12562+ spinlock_t lock;
12563+ struct task_struct *owner;
12564+ int nestcnt;
12565+ unsigned long flags;
12566+};
12567+
12568+#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
12569+ DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
12570+ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12571+
12572+#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
12573+ DECLARE_PER_CPU(struct local_irq_lock, lvar)
12574+
12575+#define local_irq_lock_init(lvar) \
12576+ do { \
12577+ int __cpu; \
12578+ for_each_possible_cpu(__cpu) \
12579+ spin_lock_init(&per_cpu(lvar, __cpu).lock); \
12580+ } while (0)
12581+
12582+static inline void __local_lock(struct local_irq_lock *lv)
12583+{
12584+ if (lv->owner != current) {
12585+ spin_lock(&lv->lock);
12586+ LL_WARN(lv->owner);
12587+ LL_WARN(lv->nestcnt);
12588+ lv->owner = current;
12589+ }
12590+ lv->nestcnt++;
12591+}
12592+
12593+#define local_lock(lvar) \
12594+ do { __local_lock(&get_local_var(lvar)); } while (0)
12595+
12596+#define local_lock_on(lvar, cpu) \
12597+ do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12598+
12599+static inline int __local_trylock(struct local_irq_lock *lv)
12600+{
12601+ if (lv->owner != current && spin_trylock(&lv->lock)) {
12602+ LL_WARN(lv->owner);
12603+ LL_WARN(lv->nestcnt);
12604+ lv->owner = current;
12605+ lv->nestcnt = 1;
12606+ return 1;
12607+ } else if (lv->owner == current) {
12608+ lv->nestcnt++;
12609+ return 1;
12610+ }
12611+ return 0;
12612+}
12613+
12614+#define local_trylock(lvar) \
12615+ ({ \
12616+ int __locked; \
12617+ __locked = __local_trylock(&get_local_var(lvar)); \
12618+ if (!__locked) \
12619+ put_local_var(lvar); \
12620+ __locked; \
12621+ })
12622+
12623+static inline void __local_unlock(struct local_irq_lock *lv)
12624+{
12625+ LL_WARN(lv->nestcnt == 0);
12626+ LL_WARN(lv->owner != current);
12627+ if (--lv->nestcnt)
12628+ return;
12629+
12630+ lv->owner = NULL;
12631+ spin_unlock(&lv->lock);
12632+}
12633+
12634+#define local_unlock(lvar) \
12635+ do { \
12636+ __local_unlock(this_cpu_ptr(&lvar)); \
12637+ put_local_var(lvar); \
12638+ } while (0)
12639+
12640+#define local_unlock_on(lvar, cpu) \
12641+ do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12642+
12643+static inline void __local_lock_irq(struct local_irq_lock *lv)
12644+{
12645+ spin_lock_irqsave(&lv->lock, lv->flags);
12646+ LL_WARN(lv->owner);
12647+ LL_WARN(lv->nestcnt);
12648+ lv->owner = current;
12649+ lv->nestcnt = 1;
12650+}
12651+
12652+#define local_lock_irq(lvar) \
12653+ do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12654+
12655+#define local_lock_irq_on(lvar, cpu) \
12656+ do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12657+
12658+static inline void __local_unlock_irq(struct local_irq_lock *lv)
12659+{
12660+ LL_WARN(!lv->nestcnt);
12661+ LL_WARN(lv->owner != current);
12662+ lv->owner = NULL;
12663+ lv->nestcnt = 0;
12664+ spin_unlock_irq(&lv->lock);
12665+}
12666+
12667+#define local_unlock_irq(lvar) \
12668+ do { \
12669+ __local_unlock_irq(this_cpu_ptr(&lvar)); \
12670+ put_local_var(lvar); \
12671+ } while (0)
12672+
12673+#define local_unlock_irq_on(lvar, cpu) \
12674+ do { \
12675+ __local_unlock_irq(&per_cpu(lvar, cpu)); \
12676+ } while (0)
12677+
12678+static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12679+{
12680+ if (lv->owner != current) {
12681+ __local_lock_irq(lv);
12682+ return 0;
12683+ } else {
12684+ lv->nestcnt++;
12685+ return 1;
12686+ }
12687+}
12688+
12689+#define local_lock_irqsave(lvar, _flags) \
12690+ do { \
12691+ if (__local_lock_irqsave(&get_local_var(lvar))) \
12692+ put_local_var(lvar); \
12693+ _flags = __this_cpu_read(lvar.flags); \
12694+ } while (0)
12695+
12696+#define local_lock_irqsave_on(lvar, _flags, cpu) \
12697+ do { \
12698+ __local_lock_irqsave(&per_cpu(lvar, cpu)); \
12699+ _flags = per_cpu(lvar, cpu).flags; \
12700+ } while (0)
12701+
12702+static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
12703+ unsigned long flags)
12704+{
12705+ LL_WARN(!lv->nestcnt);
12706+ LL_WARN(lv->owner != current);
12707+ if (--lv->nestcnt)
12708+ return 0;
12709+
12710+ lv->owner = NULL;
12711+ spin_unlock_irqrestore(&lv->lock, lv->flags);
12712+ return 1;
12713+}
12714+
12715+#define local_unlock_irqrestore(lvar, flags) \
12716+ do { \
12717+ if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
12718+ put_local_var(lvar); \
12719+ } while (0)
12720+
12721+#define local_unlock_irqrestore_on(lvar, flags, cpu) \
12722+ do { \
12723+ __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
12724+ } while (0)
12725+
12726+#define local_spin_trylock_irq(lvar, lock) \
12727+ ({ \
12728+ int __locked; \
12729+ local_lock_irq(lvar); \
12730+ __locked = spin_trylock(lock); \
12731+ if (!__locked) \
12732+ local_unlock_irq(lvar); \
12733+ __locked; \
12734+ })
12735+
12736+#define local_spin_lock_irq(lvar, lock) \
12737+ do { \
12738+ local_lock_irq(lvar); \
12739+ spin_lock(lock); \
12740+ } while (0)
12741+
12742+#define local_spin_unlock_irq(lvar, lock) \
12743+ do { \
12744+ spin_unlock(lock); \
12745+ local_unlock_irq(lvar); \
12746+ } while (0)
12747+
12748+#define local_spin_lock_irqsave(lvar, lock, flags) \
12749+ do { \
12750+ local_lock_irqsave(lvar, flags); \
12751+ spin_lock(lock); \
12752+ } while (0)
12753+
12754+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
12755+ do { \
12756+ spin_unlock(lock); \
12757+ local_unlock_irqrestore(lvar, flags); \
12758+ } while (0)
12759+
12760+#define get_locked_var(lvar, var) \
12761+ (*({ \
12762+ local_lock(lvar); \
12763+ this_cpu_ptr(&var); \
12764+ }))
12765+
12766+#define put_locked_var(lvar, var) local_unlock(lvar);
12767+
12768+#define local_lock_cpu(lvar) \
12769+ ({ \
12770+ local_lock(lvar); \
12771+ smp_processor_id(); \
12772+ })
12773+
12774+#define local_unlock_cpu(lvar) local_unlock(lvar)
12775+
12776+#else /* PREEMPT_RT_BASE */
12777+
12778+#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
12779+#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
12780+
12781+static inline void local_irq_lock_init(int lvar) { }
12782+
12783+#define local_trylock(lvar) \
12784+ ({ \
12785+ preempt_disable(); \
12786+ 1; \
12787+ })
12788+
12789+#define local_lock(lvar) preempt_disable()
12790+#define local_unlock(lvar) preempt_enable()
12791+#define local_lock_irq(lvar) local_irq_disable()
12792+#define local_lock_irq_on(lvar, cpu) local_irq_disable()
12793+#define local_unlock_irq(lvar) local_irq_enable()
12794+#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
12795+#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
12796+#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
12797+
12798+#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
12799+#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
12800+#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
12801+#define local_spin_lock_irqsave(lvar, lock, flags) \
12802+ spin_lock_irqsave(lock, flags)
12803+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
12804+ spin_unlock_irqrestore(lock, flags)
12805+
12806+#define get_locked_var(lvar, var) get_cpu_var(var)
12807+#define put_locked_var(lvar, var) put_cpu_var(var)
12808+
12809+#define local_lock_cpu(lvar) get_cpu()
12810+#define local_unlock_cpu(lvar) put_cpu()
12811+
12812+#endif
12813+
12814+#endif
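The new local locks are meant to replace open-coded local_irq_save() around per-CPU data: on !RT they compile down to the old IRQ/preempt disabling, on RT they become per-CPU sleeping spinlocks. A sketch of the intended usage (the per-CPU list is illustrative):

#include <linux/locallock.h>
#include <linux/list.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);
static DEFINE_PER_CPU(struct list_head, example_list);

static void example_add(struct list_head *entry)
{
        unsigned long flags;

        /* local_irq_save() on !RT, a per-CPU sleeping spinlock on RT */
        local_lock_irqsave(example_lock, flags);
        list_add(entry, this_cpu_ptr(&example_list));
        local_unlock_irqrestore(example_lock, flags);
}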
12815diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mm_types.h linux-4.14/include/linux/mm_types.h
12816--- linux-4.14.orig/include/linux/mm_types.h 2018-09-05 11:03:28.000000000 +0200
12817+++ linux-4.14/include/linux/mm_types.h 2018-09-05 11:05:07.000000000 +0200
12818@@ -12,6 +12,7 @@
12819 #include <linux/completion.h>
12820 #include <linux/cpumask.h>
12821 #include <linux/uprobes.h>
12822+#include <linux/rcupdate.h>
12823 #include <linux/page-flags-layout.h>
12824 #include <linux/workqueue.h>
12825
12826@@ -498,6 +499,9 @@
12827 bool tlb_flush_batched;
12828 #endif
12829 struct uprobes_state uprobes_state;
12830+#ifdef CONFIG_PREEMPT_RT_BASE
12831+ struct rcu_head delayed_drop;
12832+#endif
12833 #ifdef CONFIG_HUGETLB_PAGE
12834 atomic_long_t hugetlb_usage;
12835 #endif
12836diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex.h linux-4.14/include/linux/mutex.h
12837--- linux-4.14.orig/include/linux/mutex.h 2017-11-12 19:46:13.000000000 +0100
12838+++ linux-4.14/include/linux/mutex.h 2018-09-05 11:05:07.000000000 +0200
12839@@ -23,6 +23,17 @@
12840
12841 struct ww_acquire_ctx;
12842
12843+#ifdef CONFIG_DEBUG_LOCK_ALLOC
12844+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12845+ , .dep_map = { .name = #lockname }
12846+#else
12847+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12848+#endif
12849+
12850+#ifdef CONFIG_PREEMPT_RT_FULL
12851+# include <linux/mutex_rt.h>
12852+#else
12853+
12854 /*
12855 * Simple, straightforward mutexes with strict semantics:
12856 *
12857@@ -114,13 +125,6 @@
12858 __mutex_init((mutex), #mutex, &__key); \
12859 } while (0)
12860
12861-#ifdef CONFIG_DEBUG_LOCK_ALLOC
12862-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12863- , .dep_map = { .name = #lockname }
12864-#else
12865-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12866-#endif
12867-
12868 #define __MUTEX_INITIALIZER(lockname) \
12869 { .owner = ATOMIC_LONG_INIT(0) \
12870 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
12871@@ -228,4 +232,6 @@
12872 return mutex_trylock(lock);
12873 }
12874
12875+#endif /* !PREEMPT_RT_FULL */
12876+
12877 #endif /* __LINUX_MUTEX_H */
12878diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex_rt.h linux-4.14/include/linux/mutex_rt.h
12879--- linux-4.14.orig/include/linux/mutex_rt.h 1970-01-01 01:00:00.000000000 +0100
12880+++ linux-4.14/include/linux/mutex_rt.h 2018-09-05 11:05:07.000000000 +0200
12881@@ -0,0 +1,130 @@
12882+#ifndef __LINUX_MUTEX_RT_H
12883+#define __LINUX_MUTEX_RT_H
12884+
12885+#ifndef __LINUX_MUTEX_H
12886+#error "Please include mutex.h"
12887+#endif
12888+
12889+#include <linux/rtmutex.h>
12890+
12891+/* FIXME: Just for __lockfunc */
12892+#include <linux/spinlock.h>
12893+
12894+struct mutex {
12895+ struct rt_mutex lock;
12896+#ifdef CONFIG_DEBUG_LOCK_ALLOC
12897+ struct lockdep_map dep_map;
12898+#endif
12899+};
12900+
12901+#define __MUTEX_INITIALIZER(mutexname) \
12902+ { \
12903+ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
12904+ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
12905+ }
12906+
12907+#define DEFINE_MUTEX(mutexname) \
12908+ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
12909+
12910+extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
12911+extern void __lockfunc _mutex_lock(struct mutex *lock);
12912+extern void __lockfunc _mutex_lock_io(struct mutex *lock);
12913+extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
12914+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
12915+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
12916+extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
12917+extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
12918+extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
12919+extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
12920+extern int __lockfunc _mutex_trylock(struct mutex *lock);
12921+extern void __lockfunc _mutex_unlock(struct mutex *lock);
12922+
12923+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
12924+#define mutex_lock(l) _mutex_lock(l)
12925+#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
12926+#define mutex_lock_killable(l) _mutex_lock_killable(l)
12927+#define mutex_trylock(l) _mutex_trylock(l)
12928+#define mutex_unlock(l) _mutex_unlock(l)
12929+#define mutex_lock_io(l) _mutex_lock_io(l);
12930+
12931+#define __mutex_owner(l) ((l)->lock.owner)
12932+
12933+#ifdef CONFIG_DEBUG_MUTEXES
12934+#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
12935+#else
12936+static inline void mutex_destroy(struct mutex *lock) {}
12937+#endif
12938+
12939+#ifdef CONFIG_DEBUG_LOCK_ALLOC
12940+# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
12941+# define mutex_lock_interruptible_nested(l, s) \
12942+ _mutex_lock_interruptible_nested(l, s)
12943+# define mutex_lock_killable_nested(l, s) \
12944+ _mutex_lock_killable_nested(l, s)
12945+# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
12946+
12947+# define mutex_lock_nest_lock(lock, nest_lock) \
12948+do { \
12949+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
12950+ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
12951+} while (0)
12952+
12953+#else
12954+# define mutex_lock_nested(l, s) _mutex_lock(l)
12955+# define mutex_lock_interruptible_nested(l, s) \
12956+ _mutex_lock_interruptible(l)
12957+# define mutex_lock_killable_nested(l, s) \
12958+ _mutex_lock_killable(l)
12959+# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
12960+# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
12961+#endif
12962+
12963+# define mutex_init(mutex) \
12964+do { \
12965+ static struct lock_class_key __key; \
12966+ \
12967+ rt_mutex_init(&(mutex)->lock); \
12968+ __mutex_do_init((mutex), #mutex, &__key); \
12969+} while (0)
12970+
12971+# define __mutex_init(mutex, name, key) \
12972+do { \
12973+ rt_mutex_init(&(mutex)->lock); \
12974+ __mutex_do_init((mutex), name, key); \
12975+} while (0)
12976+
12977+/**
12978+ * These values are chosen such that FAIL and SUCCESS match the
12979+ * values of the regular mutex_trylock().
12980+ */
12981+enum mutex_trylock_recursive_enum {
12982+ MUTEX_TRYLOCK_FAILED = 0,
12983+ MUTEX_TRYLOCK_SUCCESS = 1,
12984+ MUTEX_TRYLOCK_RECURSIVE,
12985+};
12986+/**
12987+ * mutex_trylock_recursive - trylock variant that allows recursive locking
12988+ * @lock: mutex to be locked
12989+ *
12990+ * This function should not be used, _ever_. It is purely for hysterical GEM
12991+ * raisins, and once those are gone this will be removed.
12992+ *
12993+ * Returns:
12994+ * MUTEX_TRYLOCK_FAILED - trylock failed,
12995+ * MUTEX_TRYLOCK_SUCCESS - lock acquired,
12996+ * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
12997+ */
12998+int __rt_mutex_owner_current(struct rt_mutex *lock);
12999+
13000+static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13001+mutex_trylock_recursive(struct mutex *lock)
13002+{
13003+ if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13004+ return MUTEX_TRYLOCK_RECURSIVE;
13005+
13006+ return mutex_trylock(lock);
13007+}
13008+
13009+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13010+
13011+#endif
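mutex_rt.h keeps the regular mutex API; with PREEMPT_RT_FULL the same caller code is backed by a priority-inheriting rt_mutex. A sketch of unchanged usage:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);

static void example_critical_section(void)
{
        mutex_lock(&example_mutex);     /* rt_mutex with priority inheritance on RT */
        /* ... sleepable critical section ... */
        mutex_unlock(&example_mutex);
}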
13012diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netdevice.h linux-4.14/include/linux/netdevice.h
13013--- linux-4.14.orig/include/linux/netdevice.h 2018-09-05 11:03:22.000000000 +0200
13014+++ linux-4.14/include/linux/netdevice.h 2018-09-05 11:05:07.000000000 +0200
13015@@ -409,7 +409,19 @@
13016 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13017
13018 void __napi_schedule(struct napi_struct *n);
13019+
13020+/*
13021+ * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13022+ * run as threads, and they can also be preempted (without PREEMPT_RT
13023+ * interrupt threads can not be preempted). Which means that calling
13024+ * __napi_schedule_irqoff() from an interrupt handler can be preempted
13025+ * and can corrupt the napi->poll_list.
13026+ */
13027+#ifdef CONFIG_PREEMPT_RT_FULL
13028+#define __napi_schedule_irqoff(n) __napi_schedule(n)
13029+#else
13030 void __napi_schedule_irqoff(struct napi_struct *n);
13031+#endif
13032
13033 static inline bool napi_disable_pending(struct napi_struct *n)
13034 {
13035@@ -571,7 +583,11 @@
13036 * write-mostly part
13037 */
13038 spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
13039+#ifdef CONFIG_PREEMPT_RT_FULL
13040+ struct task_struct *xmit_lock_owner;
13041+#else
13042 int xmit_lock_owner;
13043+#endif
13044 /*
13045 * Time (in jiffies) of last Tx
13046 */
13047@@ -2433,14 +2449,53 @@
13048 void synchronize_net(void);
13049 int init_dummy_netdev(struct net_device *dev);
13050
13051-DECLARE_PER_CPU(int, xmit_recursion);
13052 #define XMIT_RECURSION_LIMIT 10
13053+#ifdef CONFIG_PREEMPT_RT_FULL
13054+static inline int dev_recursion_level(void)
13055+{
13056+ return current->xmit_recursion;
13057+}
13058+
13059+static inline int xmit_rec_read(void)
13060+{
13061+ return current->xmit_recursion;
13062+}
13063+
13064+static inline void xmit_rec_inc(void)
13065+{
13066+ current->xmit_recursion++;
13067+}
13068+
13069+static inline void xmit_rec_dec(void)
13070+{
13071+ current->xmit_recursion--;
13072+}
13073+
13074+#else
13075+
13076+DECLARE_PER_CPU(int, xmit_recursion);
13077
13078 static inline int dev_recursion_level(void)
13079 {
13080 return this_cpu_read(xmit_recursion);
13081 }
13082
13083+static inline int xmit_rec_read(void)
13084+{
13085+ return __this_cpu_read(xmit_recursion);
13086+}
13087+
13088+static inline void xmit_rec_inc(void)
13089+{
13090+ __this_cpu_inc(xmit_recursion);
13091+}
13092+
13093+static inline void xmit_rec_dec(void)
13094+{
13095+ __this_cpu_dec(xmit_recursion);
13096+}
13097+#endif
13098+
13099 struct net_device *dev_get_by_index(struct net *net, int ifindex);
13100 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13101 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
13102@@ -2792,6 +2847,7 @@
13103 unsigned int dropped;
13104 struct sk_buff_head input_pkt_queue;
13105 struct napi_struct backlog;
13106+ struct sk_buff_head tofree_queue;
1a6e0f06 13107
e4b2b4a8 13108 };
1a6e0f06 13109
e4b2b4a8
JK
13110@@ -3515,10 +3571,48 @@
13111 return (1 << debug_value) - 1;
1a6e0f06
JK
13112 }
13113
e4b2b4a8
JK
13114+#ifdef CONFIG_PREEMPT_RT_FULL
13115+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13116+{
13117+ txq->xmit_lock_owner = current;
13118+}
13119+
13120+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13121+{
13122+ txq->xmit_lock_owner = NULL;
13123+}
13124+
13125+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13126+{
13127+ if (txq->xmit_lock_owner != NULL)
13128+ return true;
13129+ return false;
13130+}
13131+
13132+#else
13133+
13134+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13135+{
13136+ txq->xmit_lock_owner = cpu;
13137+}
13138+
13139+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13140+{
13141+ txq->xmit_lock_owner = -1;
13142+}
13143+
13144+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13145+{
13146+ if (txq->xmit_lock_owner != -1)
13147+ return true;
13148+ return false;
13149+}
13150+#endif
13151+
13152 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13153 {
13154 spin_lock(&txq->_xmit_lock);
13155- txq->xmit_lock_owner = cpu;
13156+ netdev_queue_set_owner(txq, cpu);
13157 }
1a6e0f06 13158
e4b2b4a8
JK
13159 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13160@@ -3535,32 +3629,32 @@
13161 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13162 {
13163 spin_lock_bh(&txq->_xmit_lock);
13164- txq->xmit_lock_owner = smp_processor_id();
13165+ netdev_queue_set_owner(txq, smp_processor_id());
13166 }
1a6e0f06 13167
e4b2b4a8
JK
13168 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13169 {
13170 bool ok = spin_trylock(&txq->_xmit_lock);
13171 if (likely(ok))
13172- txq->xmit_lock_owner = smp_processor_id();
13173+ netdev_queue_set_owner(txq, smp_processor_id());
13174 return ok;
13175 }
1a6e0f06 13176
e4b2b4a8
JK
13177 static inline void __netif_tx_unlock(struct netdev_queue *txq)
13178 {
13179- txq->xmit_lock_owner = -1;
13180+ netdev_queue_clear_owner(txq);
13181 spin_unlock(&txq->_xmit_lock);
13182 }
1a6e0f06 13183
e4b2b4a8
JK
13184 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13185 {
13186- txq->xmit_lock_owner = -1;
13187+ netdev_queue_clear_owner(txq);
13188 spin_unlock_bh(&txq->_xmit_lock);
13189 }
1a6e0f06 13190
e4b2b4a8
JK
13191 static inline void txq_trans_update(struct netdev_queue *txq)
13192 {
13193- if (txq->xmit_lock_owner != -1)
13194+ if (netdev_queue_has_owner(txq))
13195 txq->trans_start = jiffies;
13196 }
1a6e0f06 13197
e4b2b4a8
JK
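
As a rough sketch of what the xmit_rec_*() helpers above are for, the following is an illustrative recursion guard in the spirit of net/core/dev.c; xmit_guarded() is a hypothetical wrapper, not the actual call site:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int xmit_guarded(struct sk_buff *skb)
{
	int ret;

	if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
		return -ELOOP;		/* refuse to recurse any deeper */

	xmit_rec_inc();			/* per-task on RT, per-CPU otherwise */
	ret = dev_queue_xmit(skb);	/* may re-enter the transmit path */
	xmit_rec_dec();

	return ret;
}
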
13198diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netfilter/x_tables.h linux-4.14/include/linux/netfilter/x_tables.h
13199--- linux-4.14.orig/include/linux/netfilter/x_tables.h 2018-09-05 11:03:22.000000000 +0200
13200+++ linux-4.14/include/linux/netfilter/x_tables.h 2018-09-05 11:05:07.000000000 +0200
13201@@ -6,6 +6,7 @@
13202 #include <linux/netdevice.h>
13203 #include <linux/static_key.h>
13204 #include <linux/netfilter.h>
13205+#include <linux/locallock.h>
13206 #include <uapi/linux/netfilter/x_tables.h>
1a6e0f06 13207
e4b2b4a8
JK
13208 /* Test a struct->invflags and a boolean for inequality */
13209@@ -341,6 +342,8 @@
13210 */
13211 DECLARE_PER_CPU(seqcount_t, xt_recseq);
1a6e0f06 13212
e4b2b4a8
JK
13213+DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13214+
13215 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13216 *
13217 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13218@@ -361,6 +364,9 @@
13219 {
13220 unsigned int addend;
1a6e0f06 13221
e4b2b4a8
JK
13222+ /* RT protection */
13223+ local_lock(xt_write_lock);
13224+
13225 /*
13226 * Low order bit of sequence is set if we already
13227 * called xt_write_recseq_begin().
13228@@ -391,6 +397,7 @@
13229 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13230 smp_wmb();
13231 __this_cpu_add(xt_recseq.sequence, addend);
13232+ local_unlock(xt_write_lock);
13233 }
1a6e0f06 13234
e4b2b4a8
JK
13235 /*
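
A condensed sketch of the writer-side sequence the hunk above protects, using only xt_write_recseq_begin()/xt_write_recseq_end() as shown; the function name and the elided ruleset walk are placeholders:

#include <linux/netfilter/x_tables.h>

static void count_packets_like_ipt_do_table(void)
{
	unsigned int addend;

	local_bh_disable();
	addend = xt_write_recseq_begin();	/* takes xt_write_lock on RT */
	/* ... walk the ruleset, bump per-CPU packet/byte counters ... */
	xt_write_recseq_end(addend);		/* releases xt_write_lock on RT */
	local_bh_enable();
}
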
13236diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_fs.h linux-4.14/include/linux/nfs_fs.h
13237--- linux-4.14.orig/include/linux/nfs_fs.h 2017-11-12 19:46:13.000000000 +0100
13238+++ linux-4.14/include/linux/nfs_fs.h 2018-09-05 11:05:07.000000000 +0200
13239@@ -162,7 +162,11 @@
1a6e0f06 13240
e4b2b4a8
JK
13241 /* Readers: in-flight sillydelete RPC calls */
13242 /* Writers: rmdir */
13243+#ifdef CONFIG_PREEMPT_RT_BASE
13244+ struct semaphore rmdir_sem;
13245+#else
13246 struct rw_semaphore rmdir_sem;
13247+#endif
13248 struct mutex commit_mutex;
1a6e0f06 13249
e4b2b4a8
JK
13250 #if IS_ENABLED(CONFIG_NFS_V4)
13251diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_xdr.h linux-4.14/include/linux/nfs_xdr.h
13252--- linux-4.14.orig/include/linux/nfs_xdr.h 2017-11-12 19:46:13.000000000 +0100
13253+++ linux-4.14/include/linux/nfs_xdr.h 2018-09-05 11:05:07.000000000 +0200
13254@@ -1530,7 +1530,7 @@
13255 struct nfs_removeargs args;
13256 struct nfs_removeres res;
13257 struct dentry *dentry;
13258- wait_queue_head_t wq;
13259+ struct swait_queue_head wq;
13260 struct rpc_cred *cred;
13261 struct nfs_fattr dir_attr;
13262 long timeout;
13263diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/notifier.h linux-4.14/include/linux/notifier.h
13264--- linux-4.14.orig/include/linux/notifier.h 2017-11-12 19:46:13.000000000 +0100
13265+++ linux-4.14/include/linux/notifier.h 2018-09-05 11:05:07.000000000 +0200
13266@@ -7,7 +7,7 @@
13267 *
13268 * Alan Cox <Alan.Cox@linux.org>
13269 */
13270-
13271+
13272 #ifndef _LINUX_NOTIFIER_H
13273 #define _LINUX_NOTIFIER_H
13274 #include <linux/errno.h>
13275@@ -43,9 +43,7 @@
13276 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13277 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13278 * SRCU notifier chains should be used when the chain will be called very
13279- * often but notifier_blocks will seldom be removed. Also, SRCU notifier
13280- * chains are slightly more difficult to use because they require special
13281- * runtime initialization.
13282+ * often but notifier_blocks will seldom be removed.
13283 */
1a6e0f06 13284
e4b2b4a8
JK
13285 struct notifier_block;
13286@@ -91,7 +89,7 @@
13287 (name)->head = NULL; \
13288 } while (0)
1a6e0f06 13289
e4b2b4a8
JK
13290-/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13291+/* srcu_notifier_heads must be cleaned up dynamically */
13292 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13293 #define srcu_cleanup_notifier_head(name) \
13294 cleanup_srcu_struct(&(name)->srcu);
13295@@ -104,7 +102,13 @@
13296 .head = NULL }
13297 #define RAW_NOTIFIER_INIT(name) { \
13298 .head = NULL }
13299-/* srcu_notifier_heads cannot be initialized statically */
13300+
13301+#define SRCU_NOTIFIER_INIT(name, pcpu) \
13302+ { \
13303+ .mutex = __MUTEX_INITIALIZER(name.mutex), \
13304+ .head = NULL, \
13305+ .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
13306+ }
1a6e0f06 13307
e4b2b4a8
JK
13308 #define ATOMIC_NOTIFIER_HEAD(name) \
13309 struct atomic_notifier_head name = \
13310@@ -116,6 +120,26 @@
13311 struct raw_notifier_head name = \
13312 RAW_NOTIFIER_INIT(name)
1a6e0f06 13313
e4b2b4a8
JK
13314+#ifdef CONFIG_TREE_SRCU
13315+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13316+ static DEFINE_PER_CPU(struct srcu_data, \
13317+ name##_head_srcu_data); \
13318+ mod struct srcu_notifier_head name = \
13319+ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13320+
13321+#else
13322+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13323+ mod struct srcu_notifier_head name = \
13324+ SRCU_NOTIFIER_INIT(name, name)
13325+
13326+#endif
13327+
13328+#define SRCU_NOTIFIER_HEAD(name) \
13329+ _SRCU_NOTIFIER_HEAD(name, )
13330+
13331+#define SRCU_NOTIFIER_HEAD_STATIC(name) \
13332+ _SRCU_NOTIFIER_HEAD(name, static)
13333+
13334 #ifdef __KERNEL__
1a6e0f06 13335
e4b2b4a8
JK
13336 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13337@@ -185,12 +209,12 @@
c7c16703 13338
e4b2b4a8
JK
13339 /*
13340 * Declared notifiers so far. I can imagine quite a few more chains
13341- * over time (eg laptop power reset chains, reboot chain (to clean
13342+ * over time (eg laptop power reset chains, reboot chain (to clean
13343 * device units up), device [un]mount chain, module load/unload chain,
13344- * low memory chain, screenblank chain (for plug in modular screenblankers)
13345+ * low memory chain, screenblank chain (for plug in modular screenblankers)
13346 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13347 */
13348-
13349+
13350 /* CPU notfiers are defined in include/linux/cpu.h. */
c7c16703 13351
e4b2b4a8
JK
13352 /* netdevice notifiers are defined in include/linux/netdevice.h */
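
A brief illustrative use of the static SRCU notifier initializers added above; the chain, callback, and setup function names are hypothetical:

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_event_chain);	/* no runtime srcu_init_notifier_head() needed */

static int my_event_cb(struct notifier_block *nb, unsigned long action, void *data)
{
	return NOTIFY_OK;		/* nothing to do in this sketch */
}

static struct notifier_block my_event_nb = {
	.notifier_call = my_event_cb,
};

static int my_event_setup(void)
{
	return srcu_notifier_chain_register(&my_event_chain, &my_event_nb);
}
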
13353diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu.h linux-4.14/include/linux/percpu.h
13354--- linux-4.14.orig/include/linux/percpu.h 2017-11-12 19:46:13.000000000 +0100
13355+++ linux-4.14/include/linux/percpu.h 2018-09-05 11:05:07.000000000 +0200
13356@@ -19,6 +19,35 @@
13357 #define PERCPU_MODULE_RESERVE 0
13358 #endif
c7c16703 13359
e4b2b4a8
JK
13360+#ifdef CONFIG_PREEMPT_RT_FULL
13361+
13362+#define get_local_var(var) (*({ \
13363+ migrate_disable(); \
13364+ this_cpu_ptr(&var); }))
1a6e0f06 13365+
e4b2b4a8
JK
13366+#define put_local_var(var) do { \
13367+ (void)&(var); \
13368+ migrate_enable(); \
13369+} while (0)
1a6e0f06 13370+
e4b2b4a8
JK
13371+# define get_local_ptr(var) ({ \
13372+ migrate_disable(); \
13373+ this_cpu_ptr(var); })
13374+
13375+# define put_local_ptr(var) do { \
13376+ (void)(var); \
13377+ migrate_enable(); \
13378+} while (0)
13379+
13380+#else
13381+
13382+#define get_local_var(var) get_cpu_var(var)
13383+#define put_local_var(var) put_cpu_var(var)
13384+#define get_local_ptr(var) get_cpu_ptr(var)
13385+#define put_local_ptr(var) put_cpu_ptr(var)
13386+
13387+#endif
13388+
13389 /* minimum unit size, also is the maximum supported allocation size */
13390 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
1a6e0f06 13391
e4b2b4a8
JK
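
A minimal sketch of the get_local_var()/put_local_var() helpers defined above; the per-CPU variable is hypothetical:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_scratch);		/* hypothetical per-CPU counter */

static void bump_local_scratch(void)
{
	get_local_var(my_scratch)++;	/* get_cpu_var() on !RT; migrate_disable() + this_cpu access on RT */
	put_local_var(my_scratch);	/* put_cpu_var() on !RT; migrate_enable() on RT */
}
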
13392diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu-rwsem.h linux-4.14/include/linux/percpu-rwsem.h
13393--- linux-4.14.orig/include/linux/percpu-rwsem.h 2018-09-05 11:03:22.000000000 +0200
13394+++ linux-4.14/include/linux/percpu-rwsem.h 2018-09-05 11:05:07.000000000 +0200
13395@@ -29,7 +29,7 @@
13396 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13397 extern void __percpu_up_read(struct percpu_rw_semaphore *);
1a6e0f06 13398
e4b2b4a8
JK
13399-static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13400+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13401 {
13402 might_sleep();
1a6e0f06 13403
e4b2b4a8
JK
13404@@ -47,16 +47,10 @@
13405 __this_cpu_inc(*sem->read_count);
13406 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13407 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13408- barrier();
13409 /*
13410- * The barrier() prevents the compiler from
13411+ * The preempt_enable() prevents the compiler from
13412 * bleeding the critical section out.
13413 */
13414-}
1a6e0f06 13415-
e4b2b4a8
JK
13416-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13417-{
13418- percpu_down_read_preempt_disable(sem);
13419 preempt_enable();
13420 }
1a6e0f06 13421
e4b2b4a8
JK
13422@@ -83,13 +77,9 @@
13423 return ret;
13424 }
1a6e0f06 13425
e4b2b4a8
JK
13426-static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13427+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
1a6e0f06 13428 {
e4b2b4a8
JK
13429- /*
13430- * The barrier() prevents the compiler from
13431- * bleeding the critical section out.
13432- */
13433- barrier();
13434+ preempt_disable();
13435 /*
13436 * Same as in percpu_down_read().
13437 */
13438@@ -102,12 +92,6 @@
13439 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
1a6e0f06 13440 }
1f39f580 13441
e4b2b4a8
JK
13442-static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13443-{
13444- preempt_disable();
13445- percpu_up_read_preempt_enable(sem);
13446-}
13447-
13448 extern void percpu_down_write(struct percpu_rw_semaphore *);
13449 extern void percpu_up_write(struct percpu_rw_semaphore *);
1f39f580 13450
e4b2b4a8
JK
13451diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/pid.h linux-4.14/include/linux/pid.h
13452--- linux-4.14.orig/include/linux/pid.h 2017-11-12 19:46:13.000000000 +0100
13453+++ linux-4.14/include/linux/pid.h 2018-09-05 11:05:07.000000000 +0200
13454@@ -3,6 +3,7 @@
13455 #define _LINUX_PID_H
1f39f580 13456
e4b2b4a8
JK
13457 #include <linux/rculist.h>
13458+#include <linux/atomic.h>
1f39f580 13459
e4b2b4a8
JK
13460 enum pid_type
13461 {
13462diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/posix-timers.h linux-4.14/include/linux/posix-timers.h
13463--- linux-4.14.orig/include/linux/posix-timers.h 2017-11-12 19:46:13.000000000 +0100
13464+++ linux-4.14/include/linux/posix-timers.h 2018-09-05 11:05:07.000000000 +0200
13465@@ -101,8 +101,8 @@
13466 struct {
13467 struct alarm alarmtimer;
13468 } alarm;
13469- struct rcu_head rcu;
13470 } it;
13471+ struct rcu_head rcu;
13472 };
1f39f580 13473
e4b2b4a8
JK
13474 void run_posix_cpu_timers(struct task_struct *task);
13475diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/preempt.h linux-4.14/include/linux/preempt.h
13476--- linux-4.14.orig/include/linux/preempt.h 2017-11-12 19:46:13.000000000 +0100
13477+++ linux-4.14/include/linux/preempt.h 2018-09-05 11:05:07.000000000 +0200
13478@@ -51,7 +51,11 @@
13479 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13480 #define NMI_OFFSET (1UL << NMI_SHIFT)
1f39f580 13481
e4b2b4a8
JK
13482-#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13483+#ifndef CONFIG_PREEMPT_RT_FULL
13484+# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13485+#else
13486+# define SOFTIRQ_DISABLE_OFFSET (0)
13487+#endif
1f39f580 13488
e4b2b4a8
JK
13489 /* We use the MSB mostly because its available */
13490 #define PREEMPT_NEED_RESCHED 0x80000000
13491@@ -81,9 +85,15 @@
13492 #include <asm/preempt.h>
1f39f580 13493
e4b2b4a8
JK
13494 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
13495-#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13496 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13497 | NMI_MASK))
13498+#ifndef CONFIG_PREEMPT_RT_FULL
13499+# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13500+# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13501+#else
13502+# define softirq_count() (0UL)
13503+extern int in_serving_softirq(void);
13504+#endif
1f39f580 13505
e4b2b4a8
JK
13506 /*
13507 * Are we doing bottom half or hardware interrupt processing?
13508@@ -101,7 +111,6 @@
13509 #define in_irq() (hardirq_count())
13510 #define in_softirq() (softirq_count())
13511 #define in_interrupt() (irq_count())
13512-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13513 #define in_nmi() (preempt_count() & NMI_MASK)
13514 #define in_task() (!(preempt_count() & \
13515 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
13516@@ -118,7 +127,11 @@
13517 /*
13518 * The preempt_count offset after spin_lock()
13519 */
13520+#if !defined(CONFIG_PREEMPT_RT_FULL)
13521 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
13522+#else
13523+#define PREEMPT_LOCK_OFFSET 0
13524+#endif
1f39f580 13525
e4b2b4a8
JK
13526 /*
13527 * The preempt_count offset needed for things like:
13528@@ -167,6 +180,20 @@
13529 #define preempt_count_inc() preempt_count_add(1)
13530 #define preempt_count_dec() preempt_count_sub(1)
1f39f580 13531
e4b2b4a8
JK
13532+#ifdef CONFIG_PREEMPT_LAZY
13533+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
13534+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
13535+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
13536+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
13537+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
13538+#else
13539+#define add_preempt_lazy_count(val) do { } while (0)
13540+#define sub_preempt_lazy_count(val) do { } while (0)
13541+#define inc_preempt_lazy_count() do { } while (0)
13542+#define dec_preempt_lazy_count() do { } while (0)
13543+#define preempt_lazy_count() (0)
13544+#endif
13545+
13546 #ifdef CONFIG_PREEMPT_COUNT
1f39f580 13547
e4b2b4a8
JK
13548 #define preempt_disable() \
13549@@ -175,16 +202,53 @@
13550 barrier(); \
13551 } while (0)
1f39f580 13552
e4b2b4a8
JK
13553+#define preempt_lazy_disable() \
13554+do { \
13555+ inc_preempt_lazy_count(); \
13556+ barrier(); \
13557+} while (0)
13558+
13559 #define sched_preempt_enable_no_resched() \
13560 do { \
13561 barrier(); \
13562 preempt_count_dec(); \
13563 } while (0)
1f39f580 13564
e4b2b4a8
JK
13565-#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13566+#ifdef CONFIG_PREEMPT_RT_BASE
13567+# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13568+# define preempt_check_resched_rt() preempt_check_resched()
13569+#else
13570+# define preempt_enable_no_resched() preempt_enable()
13571+# define preempt_check_resched_rt() barrier();
13572+#endif
1f39f580 13573
e4b2b4a8 13574 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
1f39f580 13575
e4b2b4a8
JK
13576+#ifdef CONFIG_SMP
13577+
13578+extern void migrate_disable(void);
13579+extern void migrate_enable(void);
13580+
13581+int __migrate_disabled(struct task_struct *p);
13582+
13583+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
13584+
13585+extern void migrate_disable(void);
13586+extern void migrate_enable(void);
13587+static inline int __migrate_disabled(struct task_struct *p)
13588+{
13589+ return 0;
13590+}
13591+
13592+#else
13593+#define migrate_disable() barrier()
13594+#define migrate_enable() barrier()
13595+static inline int __migrate_disabled(struct task_struct *p)
13596+{
13597+ return 0;
13598+}
13599+#endif
13600+
13601 #ifdef CONFIG_PREEMPT
13602 #define preempt_enable() \
13603 do { \
13604@@ -206,6 +270,13 @@
13605 __preempt_schedule(); \
13606 } while (0)
1f39f580 13607
e4b2b4a8
JK
13608+#define preempt_lazy_enable() \
13609+do { \
13610+ dec_preempt_lazy_count(); \
13611+ barrier(); \
13612+ preempt_check_resched(); \
13613+} while (0)
13614+
13615 #else /* !CONFIG_PREEMPT */
13616 #define preempt_enable() \
13617 do { \
13618@@ -213,6 +284,12 @@
13619 preempt_count_dec(); \
13620 } while (0)
1f39f580 13621
e4b2b4a8
JK
13622+#define preempt_lazy_enable() \
13623+do { \
13624+ dec_preempt_lazy_count(); \
13625+ barrier(); \
13626+} while (0)
13627+
13628 #define preempt_enable_notrace() \
13629 do { \
13630 barrier(); \
13631@@ -251,8 +328,16 @@
13632 #define preempt_disable_notrace() barrier()
13633 #define preempt_enable_no_resched_notrace() barrier()
13634 #define preempt_enable_notrace() barrier()
13635+#define preempt_check_resched_rt() barrier()
13636 #define preemptible() 0
1f39f580 13637
e4b2b4a8
JK
13638+#define migrate_disable() barrier()
13639+#define migrate_enable() barrier()
13640+
13641+static inline int __migrate_disabled(struct task_struct *p)
13642+{
13643+ return 0;
13644+}
13645 #endif /* CONFIG_PREEMPT_COUNT */
1f39f580 13646
e4b2b4a8
JK
13647 #ifdef MODULE
13648@@ -271,10 +356,22 @@
13649 } while (0)
13650 #define preempt_fold_need_resched() \
13651 do { \
13652- if (tif_need_resched()) \
13653+ if (tif_need_resched_now()) \
13654 set_preempt_need_resched(); \
13655 } while (0)
1f39f580 13656
e4b2b4a8
JK
13657+#ifdef CONFIG_PREEMPT_RT_FULL
13658+# define preempt_disable_rt() preempt_disable()
13659+# define preempt_enable_rt() preempt_enable()
13660+# define preempt_disable_nort() barrier()
13661+# define preempt_enable_nort() barrier()
13662+#else
13663+# define preempt_disable_rt() barrier()
13664+# define preempt_enable_rt() barrier()
13665+# define preempt_disable_nort() preempt_disable()
13666+# define preempt_enable_nort() preempt_enable()
13667+#endif
13668+
13669 #ifdef CONFIG_PREEMPT_NOTIFIERS
1f39f580 13670
e4b2b4a8
JK
13671 struct preempt_notifier;
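
A short sketch of the RT-aware idioms introduced above, as they tend to be used elsewhere in this patch; the function is a placeholder:

#include <linux/preempt.h>

static void touch_per_cpu_state(void)
{
	/* real preempt_disable() on !RT, only a compiler barrier on RT */
	preempt_disable_nort();
	/* ... short fast path that must not be preempted on mainline ... */
	preempt_enable_nort();

	/*
	 * migrate_disable() keeps the task on its CPU but leaves it
	 * preemptible -- the RT replacement for preempt_disable() around
	 * per-CPU data that is otherwise protected by sleeping locks.
	 */
	migrate_disable();
	/* ... access per-CPU data ... */
	migrate_enable();
}
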
13672diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/printk.h linux-4.14/include/linux/printk.h
13673--- linux-4.14.orig/include/linux/printk.h 2017-11-12 19:46:13.000000000 +0100
13674+++ linux-4.14/include/linux/printk.h 2018-09-05 11:05:07.000000000 +0200
13675@@ -142,9 +142,11 @@
13676 #ifdef CONFIG_EARLY_PRINTK
13677 extern asmlinkage __printf(1, 2)
13678 void early_printk(const char *fmt, ...);
13679+extern void printk_kill(void);
13680 #else
13681 static inline __printf(1, 2) __cold
13682 void early_printk(const char *s, ...) { }
13683+static inline void printk_kill(void) { }
13684 #endif
1f39f580 13685
e4b2b4a8
JK
13686 #ifdef CONFIG_PRINTK_NMI
13687diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/radix-tree.h linux-4.14/include/linux/radix-tree.h
13688--- linux-4.14.orig/include/linux/radix-tree.h 2017-11-12 19:46:13.000000000 +0100
13689+++ linux-4.14/include/linux/radix-tree.h 2018-09-05 11:05:07.000000000 +0200
13690@@ -328,6 +328,8 @@
13691 int radix_tree_preload(gfp_t gfp_mask);
13692 int radix_tree_maybe_preload(gfp_t gfp_mask);
13693 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
13694+void radix_tree_preload_end(void);
13695+
13696 void radix_tree_init(void);
13697 void *radix_tree_tag_set(struct radix_tree_root *,
13698 unsigned long index, unsigned int tag);
13699@@ -347,11 +349,6 @@
13700 unsigned int max_items, unsigned int tag);
13701 int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
1f39f580 13702
e4b2b4a8
JK
13703-static inline void radix_tree_preload_end(void)
13704-{
13705- preempt_enable();
13706-}
13707-
13708 int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
13709 int radix_tree_split(struct radix_tree_root *, unsigned long index,
13710 unsigned new_order);
13711diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/random.h linux-4.14/include/linux/random.h
13712--- linux-4.14.orig/include/linux/random.h 2017-11-12 19:46:13.000000000 +0100
13713+++ linux-4.14/include/linux/random.h 2018-09-05 11:05:07.000000000 +0200
13714@@ -32,7 +32,7 @@
1f39f580 13715
e4b2b4a8
JK
13716 extern void add_input_randomness(unsigned int type, unsigned int code,
13717 unsigned int value) __latent_entropy;
13718-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
13719+extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
1f39f580 13720
e4b2b4a8
JK
13721 extern void get_random_bytes(void *buf, int nbytes);
13722 extern int wait_for_random_bytes(void);
13723diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_augmented.h linux-4.14/include/linux/rbtree_augmented.h
13724--- linux-4.14.orig/include/linux/rbtree_augmented.h 2017-11-12 19:46:13.000000000 +0100
13725+++ linux-4.14/include/linux/rbtree_augmented.h 2018-09-05 11:05:07.000000000 +0200
13726@@ -26,6 +26,7 @@
1f39f580 13727
e4b2b4a8
JK
13728 #include <linux/compiler.h>
13729 #include <linux/rbtree.h>
13730+#include <linux/rcupdate.h>
1f39f580 13731
e4b2b4a8
JK
13732 /*
13733 * Please note - only struct rb_augment_callbacks and the prototypes for
13734diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree.h linux-4.14/include/linux/rbtree.h
13735--- linux-4.14.orig/include/linux/rbtree.h 2017-11-12 19:46:13.000000000 +0100
13736+++ linux-4.14/include/linux/rbtree.h 2018-09-05 11:05:07.000000000 +0200
13737@@ -31,7 +31,7 @@
1f39f580 13738
e4b2b4a8
JK
13739 #include <linux/kernel.h>
13740 #include <linux/stddef.h>
13741-#include <linux/rcupdate.h>
13742+#include <linux/rcu_assign_pointer.h>
1f39f580 13743
e4b2b4a8
JK
13744 struct rb_node {
13745 unsigned long __rb_parent_color;
13746diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_latch.h linux-4.14/include/linux/rbtree_latch.h
13747--- linux-4.14.orig/include/linux/rbtree_latch.h 2017-11-12 19:46:13.000000000 +0100
13748+++ linux-4.14/include/linux/rbtree_latch.h 2018-09-05 11:05:07.000000000 +0200
13749@@ -35,6 +35,7 @@
1f39f580 13750
e4b2b4a8
JK
13751 #include <linux/rbtree.h>
13752 #include <linux/seqlock.h>
13753+#include <linux/rcupdate.h>
1f39f580 13754
e4b2b4a8
JK
13755 struct latch_tree_node {
13756 struct rb_node node[2];
13757diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcu_assign_pointer.h linux-4.14/include/linux/rcu_assign_pointer.h
13758--- linux-4.14.orig/include/linux/rcu_assign_pointer.h 1970-01-01 01:00:00.000000000 +0100
13759+++ linux-4.14/include/linux/rcu_assign_pointer.h 2018-09-05 11:05:07.000000000 +0200
13760@@ -0,0 +1,54 @@
13761+#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
13762+#define __LINUX_RCU_ASSIGN_POINTER_H__
13763+#include <linux/compiler.h>
13764+#include <asm/barrier.h>
13765+
13766+/**
13767+ * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13768+ * @v: The value to statically initialize with.
13769+ */
13770+#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13771+
13772+/**
13773+ * rcu_assign_pointer() - assign to RCU-protected pointer
13774+ * @p: pointer to assign to
13775+ * @v: value to assign (publish)
13776+ *
13777+ * Assigns the specified value to the specified RCU-protected
13778+ * pointer, ensuring that any concurrent RCU readers will see
13779+ * any prior initialization.
13780+ *
13781+ * Inserts memory barriers on architectures that require them
13782+ * (which is most of them), and also prevents the compiler from
13783+ * reordering the code that initializes the structure after the pointer
13784+ * assignment. More importantly, this call documents which pointers
13785+ * will be dereferenced by RCU read-side code.
13786+ *
13787+ * In some special cases, you may use RCU_INIT_POINTER() instead
13788+ * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
13789+ * to the fact that it does not constrain either the CPU or the compiler.
13790+ * That said, using RCU_INIT_POINTER() when you should have used
13791+ * rcu_assign_pointer() is a very bad thing that results in
13792+ * impossible-to-diagnose memory corruption. So please be careful.
13793+ * See the RCU_INIT_POINTER() comment header for details.
13794+ *
13795+ * Note that rcu_assign_pointer() evaluates each of its arguments only
13796+ * once, appearances notwithstanding. One of the "extra" evaluations
13797+ * is in typeof() and the other visible only to sparse (__CHECKER__),
13798+ * neither of which actually execute the argument. As with most cpp
13799+ * macros, this execute-arguments-only-once property is important, so
13800+ * please be careful when making changes to rcu_assign_pointer() and the
13801+ * other macros that it invokes.
13802+ */
13803+#define rcu_assign_pointer(p, v) \
13804+({ \
13805+ uintptr_t _r_a_p__v = (uintptr_t)(v); \
13806+ \
13807+ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
13808+ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
13809+ else \
13810+ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
13811+ _r_a_p__v; \
13812+})
13813+
13814+#endif
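
For orientation, a minimal publish/read sketch using rcu_assign_pointer() from the new header; the structure and pointer names are hypothetical:

#include <linux/rcupdate.h>

struct my_conf {
	int value;
};

static struct my_conf __rcu *cur_conf;

static void publish_conf(struct my_conf *newc)
{
	/* readers observe either the old object or a fully initialized new one */
	rcu_assign_pointer(cur_conf, newc);
}

static int read_conf_value(void)
{
	struct my_conf *c;
	int v = 0;

	rcu_read_lock();
	c = rcu_dereference(cur_conf);
	if (c)
		v = c->value;
	rcu_read_unlock();
	return v;
}
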
13815diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcupdate.h linux-4.14/include/linux/rcupdate.h
13816--- linux-4.14.orig/include/linux/rcupdate.h 2018-09-05 11:03:22.000000000 +0200
13817+++ linux-4.14/include/linux/rcupdate.h 2018-09-05 11:05:07.000000000 +0200
13818@@ -42,6 +42,7 @@
13819 #include <linux/lockdep.h>
13820 #include <asm/processor.h>
13821 #include <linux/cpumask.h>
13822+#include <linux/rcu_assign_pointer.h>
1f39f580 13823
e4b2b4a8
JK
13824 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
13825 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
13826@@ -55,7 +56,11 @@
13827 #define call_rcu call_rcu_sched
13828 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1f39f580 13829
e4b2b4a8
JK
13830+#ifdef CONFIG_PREEMPT_RT_FULL
13831+#define call_rcu_bh call_rcu
13832+#else
13833 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
13834+#endif
13835 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
13836 void synchronize_sched(void);
13837 void rcu_barrier_tasks(void);
13838@@ -74,6 +79,11 @@
13839 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13840 */
13841 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13842+#ifndef CONFIG_PREEMPT_RT_FULL
13843+#define sched_rcu_preempt_depth() rcu_preempt_depth()
13844+#else
13845+static inline int sched_rcu_preempt_depth(void) { return 0; }
13846+#endif
1f39f580 13847
e4b2b4a8 13848 #else /* #ifdef CONFIG_PREEMPT_RCU */
1f39f580 13849
e4b2b4a8
JK
13850@@ -99,6 +109,8 @@
13851 return 0;
1f39f580
JK
13852 }
13853
e4b2b4a8
JK
13854+#define sched_rcu_preempt_depth() rcu_preempt_depth()
13855+
13856 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1f39f580 13857
e4b2b4a8
JK
13858 /* Internal to kernel */
13859@@ -255,7 +267,14 @@
13860 extern struct lockdep_map rcu_callback_map;
13861 int debug_lockdep_rcu_enabled(void);
13862 int rcu_read_lock_held(void);
13863+#ifdef CONFIG_PREEMPT_RT_FULL
13864+static inline int rcu_read_lock_bh_held(void)
13865+{
13866+ return rcu_read_lock_held();
13867+}
13868+#else
13869 int rcu_read_lock_bh_held(void);
13870+#endif
13871 int rcu_read_lock_sched_held(void);
1f39f580 13872
e4b2b4a8
JK
13873 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
13874@@ -365,54 +384,6 @@
13875 })
1f39f580 13876
e4b2b4a8
JK
13877 /**
13878- * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13879- * @v: The value to statically initialize with.
13880- */
13881-#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13882-
13883-/**
13884- * rcu_assign_pointer() - assign to RCU-protected pointer
13885- * @p: pointer to assign to
13886- * @v: value to assign (publish)
13887- *
13888- * Assigns the specified value to the specified RCU-protected
13889- * pointer, ensuring that any concurrent RCU readers will see
13890- * any prior initialization.
13891- *
13892- * Inserts memory barriers on architectures that require them
13893- * (which is most of them), and also prevents the compiler from
13894- * reordering the code that initializes the structure after the pointer
13895- * assignment. More importantly, this call documents which pointers
13896- * will be dereferenced by RCU read-side code.
13897- *
13898- * In some special cases, you may use RCU_INIT_POINTER() instead
13899- * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
13900- * to the fact that it does not constrain either the CPU or the compiler.
13901- * That said, using RCU_INIT_POINTER() when you should have used
13902- * rcu_assign_pointer() is a very bad thing that results in
13903- * impossible-to-diagnose memory corruption. So please be careful.
13904- * See the RCU_INIT_POINTER() comment header for details.
13905- *
13906- * Note that rcu_assign_pointer() evaluates each of its arguments only
13907- * once, appearances notwithstanding. One of the "extra" evaluations
13908- * is in typeof() and the other visible only to sparse (__CHECKER__),
13909- * neither of which actually execute the argument. As with most cpp
13910- * macros, this execute-arguments-only-once property is important, so
13911- * please be careful when making changes to rcu_assign_pointer() and the
13912- * other macros that it invokes.
13913- */
13914-#define rcu_assign_pointer(p, v) \
13915-({ \
13916- uintptr_t _r_a_p__v = (uintptr_t)(v); \
13917- \
13918- if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
13919- WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
13920- else \
13921- smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
13922- _r_a_p__v; \
13923-})
13924-
13925-/**
13926 * rcu_swap_protected() - swap an RCU and a regular pointer
13927 * @rcu_ptr: RCU pointer
13928 * @ptr: regular pointer
13929@@ -707,10 +678,14 @@
13930 static inline void rcu_read_lock_bh(void)
13931 {
13932 local_bh_disable();
13933+#ifdef CONFIG_PREEMPT_RT_FULL
13934+ rcu_read_lock();
13935+#else
13936 __acquire(RCU_BH);
13937 rcu_lock_acquire(&rcu_bh_lock_map);
13938 RCU_LOCKDEP_WARN(!rcu_is_watching(),
13939 "rcu_read_lock_bh() used illegally while idle");
13940+#endif
1f39f580 13941 }
1f39f580 13942
e4b2b4a8
JK
13943 /*
13944@@ -720,10 +695,14 @@
13945 */
13946 static inline void rcu_read_unlock_bh(void)
1a6e0f06 13947 {
e4b2b4a8
JK
13948+#ifdef CONFIG_PREEMPT_RT_FULL
13949+ rcu_read_unlock();
13950+#else
13951 RCU_LOCKDEP_WARN(!rcu_is_watching(),
13952 "rcu_read_unlock_bh() used illegally while idle");
13953 rcu_lock_release(&rcu_bh_lock_map);
13954 __release(RCU_BH);
13955+#endif
13956 local_bh_enable();
1a6e0f06 13957 }
1a6e0f06 13958
e4b2b4a8
JK
13959diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcutree.h linux-4.14/include/linux/rcutree.h
13960--- linux-4.14.orig/include/linux/rcutree.h 2017-11-12 19:46:13.000000000 +0100
13961+++ linux-4.14/include/linux/rcutree.h 2018-09-05 11:05:07.000000000 +0200
13962@@ -44,7 +44,11 @@
13963 rcu_note_context_switch(false);
1a6e0f06
JK
13964 }
13965
e4b2b4a8
JK
13966+#ifdef CONFIG_PREEMPT_RT_FULL
13967+# define synchronize_rcu_bh synchronize_rcu
13968+#else
13969 void synchronize_rcu_bh(void);
13970+#endif
13971 void synchronize_sched_expedited(void);
13972 void synchronize_rcu_expedited(void);
1a6e0f06 13973
e4b2b4a8 13974@@ -72,7 +76,11 @@
1a6e0f06
JK
13975 }
13976
e4b2b4a8
JK
13977 void rcu_barrier(void);
13978+#ifdef CONFIG_PREEMPT_RT_FULL
13979+# define rcu_barrier_bh rcu_barrier
13980+#else
13981 void rcu_barrier_bh(void);
13982+#endif
13983 void rcu_barrier_sched(void);
13984 unsigned long get_state_synchronize_rcu(void);
13985 void cond_synchronize_rcu(unsigned long oldstate);
13986diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/ring_buffer.h linux-4.14/include/linux/ring_buffer.h
13987--- linux-4.14.orig/include/linux/ring_buffer.h 2018-09-05 11:03:22.000000000 +0200
13988+++ linux-4.14/include/linux/ring_buffer.h 2018-09-05 11:05:07.000000000 +0200
13989@@ -34,10 +34,12 @@
13990 * array[0] = time delta (28 .. 59)
13991 * size = 8 bytes
13992 *
13993- * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
13994- * array[0] = tv_nsec
13995- * array[1..2] = tv_sec
13996- * size = 16 bytes
13997+ * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
13998+ * Same format as TIME_EXTEND except that the
13999+ * value is an absolute timestamp, not a delta
14000+ * event.time_delta contains bottom 27 bits
14001+ * array[0] = top (28 .. 59) bits
14002+ * size = 8 bytes
14003 *
14004 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14005 * Data record
14006@@ -54,12 +56,12 @@
14007 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14008 RINGBUF_TYPE_PADDING,
14009 RINGBUF_TYPE_TIME_EXTEND,
14010- /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14011 RINGBUF_TYPE_TIME_STAMP,
14012 };
1a6e0f06 14013
e4b2b4a8
JK
14014 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14015 void *ring_buffer_event_data(struct ring_buffer_event *event);
14016+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
1a6e0f06 14017
e4b2b4a8
JK
14018 /*
14019 * ring_buffer_discard_commit will remove an event that has not
14020@@ -115,6 +117,9 @@
14021 int ring_buffer_write(struct ring_buffer *buffer,
14022 unsigned long length, void *data);
14023
14024+void ring_buffer_nest_start(struct ring_buffer *buffer);
14025+void ring_buffer_nest_end(struct ring_buffer *buffer);
14026+
14027 struct ring_buffer_event *
14028 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14029 unsigned long *lost_events);
14030@@ -179,6 +184,8 @@
14031 int cpu, u64 *ts);
14032 void ring_buffer_set_clock(struct ring_buffer *buffer,
14033 u64 (*clock)(void));
14034+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14035+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14036
14037 size_t ring_buffer_page_len(void *page);
14038
14039diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rtmutex.h linux-4.14/include/linux/rtmutex.h
14040--- linux-4.14.orig/include/linux/rtmutex.h 2017-11-12 19:46:13.000000000 +0100
14041+++ linux-4.14/include/linux/rtmutex.h 2018-09-05 11:05:07.000000000 +0200
14042@@ -14,11 +14,15 @@
14043 #define __LINUX_RT_MUTEX_H
1a6e0f06 14044
e4b2b4a8
JK
14045 #include <linux/linkage.h>
14046+#include <linux/spinlock_types_raw.h>
14047 #include <linux/rbtree.h>
14048-#include <linux/spinlock_types.h>
1a6e0f06 14049
e4b2b4a8 14050 extern int max_lock_depth; /* for sysctl */
1a6e0f06 14051
e4b2b4a8
JK
14052+#ifdef CONFIG_DEBUG_MUTEXES
14053+#include <linux/debug_locks.h>
14054+#endif
14055+
14056 /**
14057 * The rt_mutex structure
14058 *
14059@@ -31,8 +35,8 @@
14060 raw_spinlock_t wait_lock;
14061 struct rb_root_cached waiters;
14062 struct task_struct *owner;
14063-#ifdef CONFIG_DEBUG_RT_MUTEXES
14064 int save_state;
14065+#ifdef CONFIG_DEBUG_RT_MUTEXES
14066 const char *name, *file;
14067 int line;
14068 void *magic;
14069@@ -82,16 +86,23 @@
14070 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14071 #endif
1a6e0f06 14072
e4b2b4a8
JK
14073-#define __RT_MUTEX_INITIALIZER(mutexname) \
14074- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14075+#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14076+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14077 , .waiters = RB_ROOT_CACHED \
14078 , .owner = NULL \
14079 __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14080- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14081+ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14082+
14083+#define __RT_MUTEX_INITIALIZER(mutexname) \
14084+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
1a6e0f06 14085
e4b2b4a8
JK
14086 #define DEFINE_RT_MUTEX(mutexname) \
14087 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
1a6e0f06 14088
e4b2b4a8
JK
14089+#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14090+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14091+ , .save_state = 1 }
14092+
14093 /**
14094 * rt_mutex_is_locked - is the mutex locked
14095 * @lock: the mutex to be queried
14096@@ -108,6 +119,7 @@
1a6e0f06 14097
e4b2b4a8
JK
14098 extern void rt_mutex_lock(struct rt_mutex *lock);
14099 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14100+extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14101 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14102 struct hrtimer_sleeper *timeout);
1a6e0f06 14103
e4b2b4a8
JK
14104diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_rt.h linux-4.14/include/linux/rwlock_rt.h
14105--- linux-4.14.orig/include/linux/rwlock_rt.h 1970-01-01 01:00:00.000000000 +0100
14106+++ linux-4.14/include/linux/rwlock_rt.h 2018-09-05 11:05:07.000000000 +0200
14107@@ -0,0 +1,119 @@
14108+#ifndef __LINUX_RWLOCK_RT_H
14109+#define __LINUX_RWLOCK_RT_H
14110+
14111+#ifndef __LINUX_SPINLOCK_H
14112+#error Do not include directly. Use spinlock.h
1a6e0f06 14113+#endif
e4b2b4a8
JK
14114+
14115+extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14116+extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14117+extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14118+extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14119+extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14120+extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14121+extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14122+extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14123+extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14124+
14125+#define read_can_lock(rwlock) rt_read_can_lock(rwlock)
14126+#define write_can_lock(rwlock) rt_write_can_lock(rwlock)
14127+
14128+#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
14129+#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
14130+
14131+static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14132+{
14133+ /* XXX ARCH_IRQ_ENABLED */
14134+ *flags = 0;
14135+ return rt_write_trylock(lock);
1a6e0f06
JK
14136+}
14137+
e4b2b4a8
JK
14138+#define write_trylock_irqsave(lock, flags) \
14139+ __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14140+
14141+#define read_lock_irqsave(lock, flags) \
14142+ do { \
14143+ typecheck(unsigned long, flags); \
14144+ rt_read_lock(lock); \
14145+ flags = 0; \
14146+ } while (0)
14147+
14148+#define write_lock_irqsave(lock, flags) \
14149+ do { \
14150+ typecheck(unsigned long, flags); \
14151+ rt_write_lock(lock); \
14152+ flags = 0; \
14153+ } while (0)
1a6e0f06 14154+
e4b2b4a8 14155+#define read_lock(lock) rt_read_lock(lock)
1a6e0f06 14156+
e4b2b4a8
JK
14157+#define read_lock_bh(lock) \
14158+ do { \
14159+ local_bh_disable(); \
14160+ rt_read_lock(lock); \
14161+ } while (0)
1a6e0f06 14162+
e4b2b4a8 14163+#define read_lock_irq(lock) read_lock(lock)
1a6e0f06 14164+
e4b2b4a8 14165+#define write_lock(lock) rt_write_lock(lock)
1a6e0f06 14166+
e4b2b4a8
JK
14167+#define write_lock_bh(lock) \
14168+ do { \
14169+ local_bh_disable(); \
14170+ rt_write_lock(lock); \
14171+ } while (0)
1a6e0f06 14172+
e4b2b4a8 14173+#define write_lock_irq(lock) write_lock(lock)
1a6e0f06 14174+
e4b2b4a8 14175+#define read_unlock(lock) rt_read_unlock(lock)
1a6e0f06 14176+
e4b2b4a8
JK
14177+#define read_unlock_bh(lock) \
14178+ do { \
14179+ rt_read_unlock(lock); \
14180+ local_bh_enable(); \
14181+ } while (0)
1a6e0f06 14182+
e4b2b4a8 14183+#define read_unlock_irq(lock) read_unlock(lock)
1a6e0f06 14184+
e4b2b4a8
JK
14185+#define write_unlock(lock) rt_write_unlock(lock)
14186+
14187+#define write_unlock_bh(lock) \
14188+ do { \
14189+ rt_write_unlock(lock); \
14190+ local_bh_enable(); \
14191+ } while (0)
14192+
14193+#define write_unlock_irq(lock) write_unlock(lock)
14194+
14195+#define read_unlock_irqrestore(lock, flags) \
14196+ do { \
14197+ typecheck(unsigned long, flags); \
14198+ (void) flags; \
14199+ rt_read_unlock(lock); \
14200+ } while (0)
14201+
14202+#define write_unlock_irqrestore(lock, flags) \
14203+ do { \
14204+ typecheck(unsigned long, flags); \
14205+ (void) flags; \
14206+ rt_write_unlock(lock); \
14207+ } while (0)
14208+
14209+#define rwlock_init(rwl) \
14210+do { \
14211+ static struct lock_class_key __key; \
14212+ \
14213+ __rt_rwlock_init(rwl, #rwl, &__key); \
14214+} while (0)
1a6e0f06 14215+
1a6e0f06 14216+/*
e4b2b4a8 14217+ * Internal functions made global for CPU pinning
1a6e0f06 14218+ */
e4b2b4a8
JK
14219+void __read_rt_lock(struct rt_rw_lock *lock);
14220+int __read_rt_trylock(struct rt_rw_lock *lock);
14221+void __write_rt_lock(struct rt_rw_lock *lock);
14222+int __write_rt_trylock(struct rt_rw_lock *lock);
14223+void __read_rt_unlock(struct rt_rw_lock *lock);
14224+void __write_rt_unlock(struct rt_rw_lock *lock);
14225+
1a6e0f06 14226+#endif
e4b2b4a8
JK
14227diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types.h linux-4.14/include/linux/rwlock_types.h
14228--- linux-4.14.orig/include/linux/rwlock_types.h 2017-11-12 19:46:13.000000000 +0100
14229+++ linux-4.14/include/linux/rwlock_types.h 2018-09-05 11:05:07.000000000 +0200
14230@@ -1,6 +1,10 @@
14231 #ifndef __LINUX_RWLOCK_TYPES_H
14232 #define __LINUX_RWLOCK_TYPES_H
1a6e0f06 14233
e4b2b4a8
JK
14234+#if !defined(__LINUX_SPINLOCK_TYPES_H)
14235+# error "Do not include directly, include spinlock_types.h"
14236+#endif
14237+
1a6e0f06 14238 /*
e4b2b4a8
JK
14239 * include/linux/rwlock_types.h - generic rwlock type definitions
14240 * and initializers
14241diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types_rt.h linux-4.14/include/linux/rwlock_types_rt.h
14242--- linux-4.14.orig/include/linux/rwlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
14243+++ linux-4.14/include/linux/rwlock_types_rt.h 2018-09-05 11:05:07.000000000 +0200
14244@@ -0,0 +1,55 @@
14245+#ifndef __LINUX_RWLOCK_TYPES_RT_H
14246+#define __LINUX_RWLOCK_TYPES_RT_H
14247+
14248+#ifndef __LINUX_SPINLOCK_TYPES_H
14249+#error "Do not include directly. Include spinlock_types.h instead"
14250+#endif
14251+
14252+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14253+# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
14254+#else
14255+# define RW_DEP_MAP_INIT(lockname)
14256+#endif
14257+
14258+typedef struct rt_rw_lock rwlock_t;
14259+
14260+#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14261+
14262+#define DEFINE_RWLOCK(name) \
14263+ rwlock_t name = __RW_LOCK_UNLOCKED(name)
14264+
14265+/*
14266+ * A reader biased implementation primarily for CPU pinning.
14267+ *
14268+ * Can be selected as a general replacement for the single reader RT rwlock
14269+ * variant
14270+ */
14271+struct rt_rw_lock {
14272+ struct rt_mutex rtmutex;
14273+ atomic_t readers;
14274+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14275+ struct lockdep_map dep_map;
14276+#endif
14277+};
14278+
14279+#define READER_BIAS (1U << 31)
14280+#define WRITER_BIAS (1U << 30)
14281+
14282+#define __RWLOCK_RT_INITIALIZER(name) \
14283+{ \
14284+ .readers = ATOMIC_INIT(READER_BIAS), \
14285+ .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \
14286+ RW_DEP_MAP_INIT(name) \
14287+}
14288+
14289+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14290+ struct lock_class_key *key);
14291+
14292+#define rwlock_biased_rt_init(rwlock) \
14293+ do { \
14294+ static struct lock_class_key __key; \
14295+ \
14296+ __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \
14297+ } while (0)
14298+
14299+#endif
14300diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem.h linux-4.14/include/linux/rwsem.h
14301--- linux-4.14.orig/include/linux/rwsem.h 2018-09-05 11:03:22.000000000 +0200
14302+++ linux-4.14/include/linux/rwsem.h 2018-09-05 11:05:07.000000000 +0200
14303@@ -20,6 +20,10 @@
14304 #include <linux/osq_lock.h>
14305 #endif
1a6e0f06 14306
e4b2b4a8
JK
14307+#ifdef CONFIG_PREEMPT_RT_FULL
14308+#include <linux/rwsem_rt.h>
14309+#else /* PREEMPT_RT_FULL */
14310+
14311 struct rw_semaphore;
1a6e0f06 14312
e4b2b4a8
JK
14313 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14314@@ -114,6 +118,13 @@
14315 return !list_empty(&sem->wait_list);
1a6e0f06
JK
14316 }
14317
e4b2b4a8
JK
14318+#endif /* !PREEMPT_RT_FULL */
14319+
14320+/*
14321+ * The functions below are the same for all rwsem implementations including
14322+ * the RT specific variant.
14323+ */
14324+
1a6e0f06 14325 /*
e4b2b4a8 14326 * lock for reading
1a6e0f06 14327 */
e4b2b4a8
JK
14328diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem_rt.h linux-4.14/include/linux/rwsem_rt.h
14329--- linux-4.14.orig/include/linux/rwsem_rt.h 1970-01-01 01:00:00.000000000 +0100
14330+++ linux-4.14/include/linux/rwsem_rt.h 2018-09-05 11:05:07.000000000 +0200
14331@@ -0,0 +1,67 @@
14332+#ifndef _LINUX_RWSEM_RT_H
14333+#define _LINUX_RWSEM_RT_H
14334+
14335+#ifndef _LINUX_RWSEM_H
14336+#error "Include rwsem.h"
14337+#endif
14338+
14339+#include <linux/rtmutex.h>
14340+#include <linux/swait.h>
14341+
14342+#define READER_BIAS (1U << 31)
14343+#define WRITER_BIAS (1U << 30)
14344+
14345+struct rw_semaphore {
14346+ atomic_t readers;
14347+ struct rt_mutex rtmutex;
14348+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14349+ struct lockdep_map dep_map;
14350+#endif
14351+};
14352+
14353+#define __RWSEM_INITIALIZER(name) \
14354+{ \
14355+ .readers = ATOMIC_INIT(READER_BIAS), \
14356+ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
14357+ RW_DEP_MAP_INIT(name) \
14358+}
14359+
14360+#define DECLARE_RWSEM(lockname) \
14361+ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14362+
14363+extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14364+ struct lock_class_key *key);
14365+
14366+#define __init_rwsem(sem, name, key) \
14367+do { \
14368+ rt_mutex_init(&(sem)->rtmutex); \
14369+ __rwsem_init((sem), (name), (key)); \
14370+} while (0)
14371+
14372+#define init_rwsem(sem) \
14373+do { \
14374+ static struct lock_class_key __key; \
14375+ \
14376+ __init_rwsem((sem), #sem, &__key); \
14377+} while (0)
14378+
14379+static inline int rwsem_is_locked(struct rw_semaphore *sem)
1a6e0f06 14380+{
e4b2b4a8
JK
14381+ return atomic_read(&sem->readers) != READER_BIAS;
14382+}
1a6e0f06 14383+
e4b2b4a8
JK
14384+static inline int rwsem_is_contended(struct rw_semaphore *sem)
14385+{
14386+ return atomic_read(&sem->readers) > 0;
1a6e0f06
JK
14387+}
14388+
e4b2b4a8
JK
14389+extern void __down_read(struct rw_semaphore *sem);
14390+extern int __down_read_trylock(struct rw_semaphore *sem);
14391+extern void __down_write(struct rw_semaphore *sem);
14392+extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14393+extern int __down_write_trylock(struct rw_semaphore *sem);
14394+extern void __up_read(struct rw_semaphore *sem);
14395+extern void __up_write(struct rw_semaphore *sem);
14396+extern void __downgrade_write(struct rw_semaphore *sem);
14397+
14398+#endif
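
A minimal usage sketch: with PREEMPT_RT_FULL the ordinary rwsem API resolves to the rtmutex-based structure above, so call sites stay unchanged; the names below are placeholders:

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_data_sem);
static int my_data;			/* protected by my_data_sem */

static int read_my_data(void)
{
	int v;

	down_read(&my_data_sem);	/* rtmutex-backed and sleepable on RT */
	v = my_data;
	up_read(&my_data_sem);
	return v;
}

static void write_my_data(int v)
{
	down_write(&my_data_sem);
	my_data = v;
	up_write(&my_data_sem);
}
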
14399diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/mm.h linux-4.14/include/linux/sched/mm.h
14400--- linux-4.14.orig/include/linux/sched/mm.h 2017-11-12 19:46:13.000000000 +0100
14401+++ linux-4.14/include/linux/sched/mm.h 2018-09-05 11:05:07.000000000 +0200
14402@@ -43,6 +43,17 @@
14403 __mmdrop(mm);
1a6e0f06 14404 }
1a6e0f06 14405
e4b2b4a8
JK
14406+#ifdef CONFIG_PREEMPT_RT_BASE
14407+extern void __mmdrop_delayed(struct rcu_head *rhp);
14408+static inline void mmdrop_delayed(struct mm_struct *mm)
14409+{
14410+ if (atomic_dec_and_test(&mm->mm_count))
14411+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14412+}
14413+#else
14414+# define mmdrop_delayed(mm) mmdrop(mm)
14415+#endif
1a6e0f06 14416+
e4b2b4a8
JK
14417 static inline void mmdrop_async_fn(struct work_struct *work)
14418 {
14419 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
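
A small sketch of the kind of call site mmdrop_delayed() above is meant for; drop_borrowed_mm() is a hypothetical helper:

#include <linux/sched/mm.h>

/* Dropping a borrowed mm reference from a path that may run with
 * preemption disabled on mainline (e.g. late in a context switch). */
static void drop_borrowed_mm(struct mm_struct *mm)
{
	if (!mm)
		return;
	mmdrop_delayed(mm);	/* mmdrop() on !RT, deferred via call_rcu() on RT */
}
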
14420diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/task.h linux-4.14/include/linux/sched/task.h
14421--- linux-4.14.orig/include/linux/sched/task.h 2018-09-05 11:03:22.000000000 +0200
14422+++ linux-4.14/include/linux/sched/task.h 2018-09-05 11:05:07.000000000 +0200
14423@@ -88,6 +88,15 @@
1a6e0f06 14424
e4b2b4a8 14425 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1a6e0f06 14426
e4b2b4a8
JK
14427+#ifdef CONFIG_PREEMPT_RT_BASE
14428+extern void __put_task_struct_cb(struct rcu_head *rhp);
1a6e0f06 14429+
e4b2b4a8
JK
14430+static inline void put_task_struct(struct task_struct *t)
14431+{
14432+ if (atomic_dec_and_test(&t->usage))
14433+ call_rcu(&t->put_rcu, __put_task_struct_cb);
14434+}
14435+#else
14436 extern void __put_task_struct(struct task_struct *t);
14437
14438 static inline void put_task_struct(struct task_struct *t)
14439@@ -95,7 +104,7 @@
14440 if (atomic_dec_and_test(&t->usage))
14441 __put_task_struct(t);
1a6e0f06 14442 }
e4b2b4a8
JK
14443-
14444+#endif
14445 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
1a6e0f06 14446
e4b2b4a8
JK
14447 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
14448diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/wake_q.h linux-4.14/include/linux/sched/wake_q.h
14449--- linux-4.14.orig/include/linux/sched/wake_q.h 2017-11-12 19:46:13.000000000 +0100
14450+++ linux-4.14/include/linux/sched/wake_q.h 2018-09-05 11:05:07.000000000 +0200
14451@@ -47,8 +47,29 @@
14452 head->lastp = &head->first;
14453 }
1a6e0f06 14454
e4b2b4a8
JK
14455-extern void wake_q_add(struct wake_q_head *head,
14456- struct task_struct *task);
14457-extern void wake_up_q(struct wake_q_head *head);
14458+extern void __wake_q_add(struct wake_q_head *head,
14459+ struct task_struct *task, bool sleeper);
14460+static inline void wake_q_add(struct wake_q_head *head,
14461+ struct task_struct *task)
14462+{
14463+ __wake_q_add(head, task, false);
14464+}
14465+
14466+static inline void wake_q_add_sleeper(struct wake_q_head *head,
14467+ struct task_struct *task)
14468+{
14469+ __wake_q_add(head, task, true);
14470+}
1a6e0f06 14471+
e4b2b4a8
JK
14472+extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14473+static inline void wake_up_q(struct wake_q_head *head)
14474+{
14475+ __wake_up_q(head, false);
14476+}
1a6e0f06 14477+
e4b2b4a8
JK
14478+static inline void wake_up_q_sleeper(struct wake_q_head *head)
14479+{
14480+ __wake_up_q(head, true);
14481+}
1a6e0f06 14482
e4b2b4a8
JK
14483 #endif /* _LINUX_SCHED_WAKE_Q_H */
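
A condensed sketch of how the sleeper variants above are intended to be used by the rtmutex wakeup path; the function below is illustrative, not the actual rtmutex code:

#include <linux/sched/wake_q.h>

static void wake_one_sleeper(struct task_struct *waiter_task)
{
	DEFINE_WAKE_Q(wake_sleeper_q);

	wake_q_add_sleeper(&wake_sleeper_q, waiter_task);
	/* ... raw wait_lock is dropped here ... */
	wake_up_q_sleeper(&wake_sleeper_q);
}
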
14484diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched.h linux-4.14/include/linux/sched.h
14485--- linux-4.14.orig/include/linux/sched.h 2018-09-05 11:03:22.000000000 +0200
14486+++ linux-4.14/include/linux/sched.h 2018-09-05 11:05:07.000000000 +0200
14487@@ -27,6 +27,7 @@
14488 #include <linux/signal_types.h>
14489 #include <linux/mm_types_task.h>
14490 #include <linux/task_io_accounting.h>
14491+#include <asm/kmap_types.h>
1a6e0f06 14492
e4b2b4a8
JK
14493 /* task_struct member predeclarations (sorted alphabetically): */
14494 struct audit_context;
14495@@ -93,7 +94,6 @@
1a6e0f06 14496
e4b2b4a8
JK
14497 /* Convenience macros for the sake of wake_up(): */
14498 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14499-#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
1a6e0f06 14500
e4b2b4a8
JK
14501 /* get_task_state(): */
14502 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14503@@ -101,12 +101,8 @@
14504 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14505 TASK_PARKED)
1a6e0f06 14506
e4b2b4a8
JK
14507-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
14508-
14509 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
c7c16703 14510
e4b2b4a8
JK
14511-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14512-
14513 #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14514 (task->flags & PF_FROZEN) == 0 && \
14515 (task->state & TASK_NOLOAD) == 0)
14516@@ -134,6 +130,11 @@
14517 smp_store_mb(current->state, (state_value)); \
14518 } while (0)
c7c16703 14519
e4b2b4a8
JK
14520+#define __set_current_state_no_track(state_value) \
14521+ current->state = (state_value);
14522+#define set_current_state_no_track(state_value) \
14523+ smp_store_mb(current->state, (state_value));
14524+
14525 #define set_special_state(state_value) \
14526 do { \
14527 unsigned long flags; /* may shadow */ \
14528@@ -187,6 +188,9 @@
14529 #define set_current_state(state_value) \
14530 smp_store_mb(current->state, (state_value))
14531
14532+#define __set_current_state_no_track(state_value) __set_current_state(state_value)
14533+#define set_current_state_no_track(state_value) set_current_state(state_value)
14534+
14535 /*
14536 * set_special_state() should be used for those states when the blocking task
14537 * can not use the regular condition based wait-loop. In that case we must
14538@@ -566,6 +570,8 @@
14539 #endif
14540 /* -1 unrunnable, 0 runnable, >0 stopped: */
14541 volatile long state;
14542+ /* saved state for "spinlock sleepers" */
14543+ volatile long saved_state;
c7c16703 14544
c7c16703 14545 /*
e4b2b4a8
JK
14546 * This begins the randomizable portion of task_struct. Only
14547@@ -618,7 +624,25 @@
14548
14549 unsigned int policy;
14550 int nr_cpus_allowed;
14551- cpumask_t cpus_allowed;
14552+ const cpumask_t *cpus_ptr;
14553+ cpumask_t cpus_mask;
14554+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
14555+ int migrate_disable;
14556+ int migrate_disable_update;
14557+ int pinned_on_cpu;
14558+# ifdef CONFIG_SCHED_DEBUG
14559+ int migrate_disable_atomic;
14560+# endif
14561+
14562+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14563+ int migrate_disable;
14564+# ifdef CONFIG_SCHED_DEBUG
14565+ int migrate_disable_atomic;
14566+# endif
14567+#endif
14568+#ifdef CONFIG_PREEMPT_RT_FULL
14569+ int sleeping_lock;
14570+#endif
c7c16703 14571
e4b2b4a8
JK
14572 #ifdef CONFIG_PREEMPT_RCU
14573 int rcu_read_lock_nesting;
14574@@ -777,6 +801,9 @@
14575 #ifdef CONFIG_POSIX_TIMERS
14576 struct task_cputime cputime_expires;
14577 struct list_head cpu_timers[3];
14578+#ifdef CONFIG_PREEMPT_RT_BASE
14579+ struct task_struct *posix_timer_list;
14580+#endif
14581 #endif
c7c16703 14582
e4b2b4a8
JK
14583 /* Process credentials: */
14584@@ -820,11 +847,17 @@
14585 /* Signal handlers: */
14586 struct signal_struct *signal;
14587 struct sighand_struct *sighand;
14588+ struct sigqueue *sigqueue_cache;
14589+
14590 sigset_t blocked;
14591 sigset_t real_blocked;
14592 /* Restored if set_restore_sigmask() was used: */
14593 sigset_t saved_sigmask;
14594 struct sigpending pending;
14595+#ifdef CONFIG_PREEMPT_RT_FULL
14596+ /* TODO: move me into ->restart_block ? */
14597+ struct siginfo forced_info;
14598+#endif
14599 unsigned long sas_ss_sp;
14600 size_t sas_ss_size;
14601 unsigned int sas_ss_flags;
14602@@ -849,6 +882,7 @@
14603 raw_spinlock_t pi_lock;
14604
14605 struct wake_q_node wake_q;
14606+ struct wake_q_node wake_q_sleeper;
14607
14608 #ifdef CONFIG_RT_MUTEXES
14609 /* PI waiters blocked on a rt_mutex held by this task: */
14610@@ -1116,9 +1150,23 @@
14611 unsigned int sequential_io;
14612 unsigned int sequential_io_avg;
14613 #endif
14614+#ifdef CONFIG_PREEMPT_RT_BASE
14615+ struct rcu_head put_rcu;
14616+ int softirq_nestcnt;
14617+ unsigned int softirqs_raised;
14618+#endif
14619+#ifdef CONFIG_PREEMPT_RT_FULL
14620+# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14621+ int kmap_idx;
14622+ pte_t kmap_pte[KM_TYPE_NR];
14623+# endif
14624+#endif
14625 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14626 unsigned long task_state_change;
14627 #endif
14628+#ifdef CONFIG_PREEMPT_RT_FULL
14629+ int xmit_recursion;
14630+#endif
14631 int pagefault_disabled;
14632 #ifdef CONFIG_MMU
14633 struct task_struct *oom_reaper_list;
14634@@ -1332,6 +1380,7 @@
14635 /*
14636 * Per process flags
14637 */
14638+#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
14639 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
14640 #define PF_EXITING 0x00000004 /* Getting shut down */
14641 #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
14642@@ -1355,7 +1404,7 @@
14643 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
14644 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
14645 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
14646-#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
14647+#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
14648 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
14649 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
14650 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
14651@@ -1535,6 +1584,7 @@
14652
14653 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14654 extern int wake_up_process(struct task_struct *tsk);
14655+extern int wake_up_lock_sleeper(struct task_struct *tsk);
14656 extern void wake_up_new_task(struct task_struct *tsk);
14657
14658 #ifdef CONFIG_SMP
14659@@ -1611,6 +1661,89 @@
14660 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14661 }
14662
14663+#ifdef CONFIG_PREEMPT_LAZY
14664+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14665+{
14666+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14667+}
14668+
14669+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14670+{
14671+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14672+}
14673+
14674+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14675+{
14676+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14677+}
14678+
14679+static inline int need_resched_lazy(void)
14680+{
14681+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14682+}
14683+
14684+static inline int need_resched_now(void)
14685+{
14686+ return test_thread_flag(TIF_NEED_RESCHED);
14687+}
14688+
14689+#else
14690+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14691+static inline int need_resched_lazy(void) { return 0; }
14692+
14693+static inline int need_resched_now(void)
14694+{
14695+ return test_thread_flag(TIF_NEED_RESCHED);
14696+}
14697+
14698+#endif
14699+
14700+
14701+static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14702+{
14703+ if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14704+ return true;
14705+#ifdef CONFIG_PREEMPT_RT_FULL
14706+ if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14707+ return true;
14708+#endif
14709+ return false;
14710+}
14711+
14712+static inline bool task_is_stopped_or_traced(struct task_struct *task)
14713+{
14714+ bool traced_stopped;
14715+
14716+#ifdef CONFIG_PREEMPT_RT_FULL
14717+ unsigned long flags;
14718+
14719+ raw_spin_lock_irqsave(&task->pi_lock, flags);
14720+ traced_stopped = __task_is_stopped_or_traced(task);
14721+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14722+#else
14723+ traced_stopped = __task_is_stopped_or_traced(task);
14724+#endif
14725+ return traced_stopped;
14726+}
14727+
14728+static inline bool task_is_traced(struct task_struct *task)
14729+{
14730+ bool traced = false;
14731+
14732+ if (task->state & __TASK_TRACED)
14733+ return true;
14734+#ifdef CONFIG_PREEMPT_RT_FULL
14735+ /* in case the task is sleeping on tasklist_lock */
14736+ raw_spin_lock_irq(&task->pi_lock);
14737+ if (task->state & __TASK_TRACED)
14738+ traced = true;
14739+ else if (task->saved_state & __TASK_TRACED)
14740+ traced = true;
14741+ raw_spin_unlock_irq(&task->pi_lock);
14742+#endif
14743+ return traced;
14744+}
14745+
14746 /*
14747 * cond_resched() and cond_resched_lock(): latency reduction via
14748 * explicit rescheduling in places that are safe. The return
14749@@ -1636,12 +1769,16 @@
14750 __cond_resched_lock(lock); \
14751 })
14752
14753+#ifndef CONFIG_PREEMPT_RT_FULL
14754 extern int __cond_resched_softirq(void);
14755
14756 #define cond_resched_softirq() ({ \
14757 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
14758 __cond_resched_softirq(); \
14759 })
14760+#else
14761+# define cond_resched_softirq() cond_resched()
14762+#endif
c7c16703 14763
14764 static inline void cond_resched_rcu(void)
14765 {
14766@@ -1671,6 +1808,23 @@
14767 return unlikely(tif_need_resched());
14768 }
c7c16703 14769
14770+#ifdef CONFIG_PREEMPT_RT_FULL
14771+static inline void sleeping_lock_inc(void)
14772+{
14773+ current->sleeping_lock++;
14774+}
14775+
14776+static inline void sleeping_lock_dec(void)
14777+{
14778+ current->sleeping_lock--;
14779+}
14780+
14781+#else
14782+
14783+static inline void sleeping_lock_inc(void) { }
14784+static inline void sleeping_lock_dec(void) { }
14785+#endif
14786+
14787 /*
14788 * Wrappers for p->thread_info->cpu access. No-op on UP.
14789 */
14790diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/seqlock.h linux-4.14/include/linux/seqlock.h
14791--- linux-4.14.orig/include/linux/seqlock.h 2017-11-12 19:46:13.000000000 +0100
14792+++ linux-4.14/include/linux/seqlock.h 2018-09-05 11:05:07.000000000 +0200
14793@@ -221,20 +221,30 @@
14794 return __read_seqcount_retry(s, start);
14795 }
c7c16703 14796
14797-
14798-
14799-static inline void raw_write_seqcount_begin(seqcount_t *s)
14800+static inline void __raw_write_seqcount_begin(seqcount_t *s)
14801 {
14802 s->sequence++;
14803 smp_wmb();
14804 }
c7c16703 14805
14806-static inline void raw_write_seqcount_end(seqcount_t *s)
14807+static inline void raw_write_seqcount_begin(seqcount_t *s)
14808+{
14809+ preempt_disable_rt();
14810+ __raw_write_seqcount_begin(s);
14811+}
14812+
14813+static inline void __raw_write_seqcount_end(seqcount_t *s)
14814 {
14815 smp_wmb();
14816 s->sequence++;
14817 }
c7c16703 14818
14819+static inline void raw_write_seqcount_end(seqcount_t *s)
14820+{
14821+ __raw_write_seqcount_end(s);
14822+ preempt_enable_rt();
14823+}
14824+
14825 /**
14826 * raw_write_seqcount_barrier - do a seq write barrier
14827 * @s: pointer to seqcount_t
14828@@ -429,10 +439,32 @@
14829 /*
14830 * Read side functions for starting and finalizing a read side section.
14831 */
14832+#ifndef CONFIG_PREEMPT_RT_FULL
14833 static inline unsigned read_seqbegin(const seqlock_t *sl)
14834 {
14835 return read_seqcount_begin(&sl->seqcount);
14836 }
14837+#else
14838+/*
14839+ * Starvation safe read side for RT
14840+ */
14841+static inline unsigned read_seqbegin(seqlock_t *sl)
14842+{
14843+ unsigned ret;
14844+
14845+repeat:
14846+ ret = ACCESS_ONCE(sl->seqcount.sequence);
14847+ if (unlikely(ret & 1)) {
14848+ /*
14849+ * Take the lock and let the writer proceed (i.e. possibly
14850+ * boost it), otherwise we could loop here forever.
14851+ */
14852+ spin_unlock_wait(&sl->lock);
14853+ goto repeat;
14854+ }
14855+ return ret;
14856+}
14857+#endif
c7c16703 14858
14859 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14860 {
14861@@ -447,36 +479,45 @@
14862 static inline void write_seqlock(seqlock_t *sl)
14863 {
14864 spin_lock(&sl->lock);
14865- write_seqcount_begin(&sl->seqcount);
14866+ __raw_write_seqcount_begin(&sl->seqcount);
14867+}
14868+
14869+static inline int try_write_seqlock(seqlock_t *sl)
14870+{
14871+ if (spin_trylock(&sl->lock)) {
14872+ __raw_write_seqcount_begin(&sl->seqcount);
14873+ return 1;
14874+ }
14875+ return 0;
c7c16703 14876 }
c7c16703 14877
14878 static inline void write_sequnlock(seqlock_t *sl)
14879 {
14880- write_seqcount_end(&sl->seqcount);
14881+ __raw_write_seqcount_end(&sl->seqcount);
14882 spin_unlock(&sl->lock);
14883 }
c7c16703 14884
14885 static inline void write_seqlock_bh(seqlock_t *sl)
14886 {
14887 spin_lock_bh(&sl->lock);
14888- write_seqcount_begin(&sl->seqcount);
14889+ __raw_write_seqcount_begin(&sl->seqcount);
c7c16703 14890 }
14891
14892 static inline void write_sequnlock_bh(seqlock_t *sl)
1a6e0f06 14893 {
14894- write_seqcount_end(&sl->seqcount);
14895+ __raw_write_seqcount_end(&sl->seqcount);
14896 spin_unlock_bh(&sl->lock);
14897 }
1a6e0f06 14898
14899 static inline void write_seqlock_irq(seqlock_t *sl)
14900 {
14901 spin_lock_irq(&sl->lock);
14902- write_seqcount_begin(&sl->seqcount);
14903+ __raw_write_seqcount_begin(&sl->seqcount);
14904 }
1a6e0f06 14905
e4b2b4a8 14906 static inline void write_sequnlock_irq(seqlock_t *sl)
1a6e0f06 14907 {
14908- write_seqcount_end(&sl->seqcount);
14909+ __raw_write_seqcount_end(&sl->seqcount);
14910 spin_unlock_irq(&sl->lock);
14911 }
14912
14913@@ -485,7 +526,7 @@
14914 unsigned long flags;
14915
14916 spin_lock_irqsave(&sl->lock, flags);
14917- write_seqcount_begin(&sl->seqcount);
14918+ __raw_write_seqcount_begin(&sl->seqcount);
14919 return flags;
14920 }
14921
14922@@ -495,7 +536,7 @@
14923 static inline void
14924 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
1a6e0f06 14925 {
14926- write_seqcount_end(&sl->seqcount);
14927+ __raw_write_seqcount_end(&sl->seqcount);
14928 spin_unlock_irqrestore(&sl->lock, flags);
14929 }
1a6e0f06 14930
14931diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/signal.h linux-4.14/include/linux/signal.h
14932--- linux-4.14.orig/include/linux/signal.h 2017-11-12 19:46:13.000000000 +0100
14933+++ linux-4.14/include/linux/signal.h 2018-09-05 11:05:07.000000000 +0200
14934@@ -243,6 +243,7 @@
14935 }
14936
14937 extern void flush_sigqueue(struct sigpending *queue);
14938+extern void flush_task_sigqueue(struct task_struct *tsk);
14939
14940 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14941 static inline int valid_signal(unsigned long sig)
14942diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/skbuff.h linux-4.14/include/linux/skbuff.h
14943--- linux-4.14.orig/include/linux/skbuff.h 2018-09-05 11:03:22.000000000 +0200
14944+++ linux-4.14/include/linux/skbuff.h 2018-09-05 11:05:07.000000000 +0200
14945@@ -287,6 +287,7 @@
14946
14947 __u32 qlen;
14948 spinlock_t lock;
14949+ raw_spinlock_t raw_lock;
14950 };
14951
14952 struct sk_buff;
14953@@ -1667,6 +1668,12 @@
14954 __skb_queue_head_init(list);
14955 }
1a6e0f06 14956
14957+static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14958+{
14959+ raw_spin_lock_init(&list->raw_lock);
14960+ __skb_queue_head_init(list);
14961+}
14962+
14963 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14964 struct lock_class_key *class)
14965 {
14966diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/smp.h linux-4.14/include/linux/smp.h
14967--- linux-4.14.orig/include/linux/smp.h 2017-11-12 19:46:13.000000000 +0100
14968+++ linux-4.14/include/linux/smp.h 2018-09-05 11:05:07.000000000 +0200
14969@@ -202,6 +202,9 @@
14970 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
14971 #define put_cpu() preempt_enable()
1a6e0f06 14972
14973+#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
14974+#define put_cpu_light() migrate_enable()
14975+
14976 /*
14977 * Callback to arch code if there's nosmp or maxcpus=0 on the
14978 * boot command line:
14979diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_api_smp.h linux-4.14/include/linux/spinlock_api_smp.h
14980--- linux-4.14.orig/include/linux/spinlock_api_smp.h 2017-11-12 19:46:13.000000000 +0100
14981+++ linux-4.14/include/linux/spinlock_api_smp.h 2018-09-05 11:05:07.000000000 +0200
14982@@ -187,6 +187,8 @@
14983 return 0;
1a6e0f06 14984 }
14985
14986-#include <linux/rwlock_api_smp.h>
14987+#ifndef CONFIG_PREEMPT_RT_FULL
14988+# include <linux/rwlock_api_smp.h>
1a6e0f06 14989+#endif
14990
14991 #endif /* __LINUX_SPINLOCK_API_SMP_H */
14992diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock.h linux-4.14/include/linux/spinlock.h
14993--- linux-4.14.orig/include/linux/spinlock.h 2017-11-12 19:46:13.000000000 +0100
14994+++ linux-4.14/include/linux/spinlock.h 2018-09-05 11:05:07.000000000 +0200
14995@@ -286,7 +286,11 @@
14996 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
14997
14998 /* Include rwlock functions */
14999-#include <linux/rwlock.h>
1a6e0f06 15000+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8 15001+# include <linux/rwlock_rt.h>
1a6e0f06 15002+#else
e4b2b4a8 15003+# include <linux/rwlock.h>
1a6e0f06 15004+#endif
1a6e0f06 15005
15006 /*
15007 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15008@@ -297,6 +301,10 @@
15009 # include <linux/spinlock_api_up.h>
15010 #endif
1a6e0f06 15011
15012+#ifdef CONFIG_PREEMPT_RT_FULL
15013+# include <linux/spinlock_rt.h>
15014+#else /* PREEMPT_RT_FULL */
15015+
15016 /*
15017 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15018 */
15019@@ -421,4 +429,6 @@
15020 #define atomic_dec_and_lock(atomic, lock) \
15021 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
1a6e0f06 15022
15023+#endif /* !PREEMPT_RT_FULL */
15024+
15025 #endif /* __LINUX_SPINLOCK_H */
15026diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_rt.h linux-4.14/include/linux/spinlock_rt.h
15027--- linux-4.14.orig/include/linux/spinlock_rt.h 1970-01-01 01:00:00.000000000 +0100
15028+++ linux-4.14/include/linux/spinlock_rt.h 2018-09-05 11:05:07.000000000 +0200
15029@@ -0,0 +1,159 @@
15030+#ifndef __LINUX_SPINLOCK_RT_H
15031+#define __LINUX_SPINLOCK_RT_H
15032+
15033+#ifndef __LINUX_SPINLOCK_H
15034+#error Do not include directly. Use spinlock.h
15035+#endif
15036+
15037+#include <linux/bug.h>
15038+
15039+extern void
15040+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15041+
15042+#define spin_lock_init(slock) \
15043+do { \
15044+ static struct lock_class_key __key; \
15045+ \
15046+ rt_mutex_init(&(slock)->lock); \
15047+ __rt_spin_lock_init(slock, #slock, &__key); \
15048+} while (0)
15049+
15050+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15051+extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15052+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15053+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15054+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15055+extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15056+extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15057+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15058+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15059+
15060+/*
15061+ * lockdep-less calls, for derived types like rwlock:
15062+ * (for trylock they can use rt_mutex_trylock() directly.
15063+ * Migrate disable handling must be done at the call site.
15064+ */
15065+extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15066+extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15067+extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15068+
15069+#define spin_lock(lock) rt_spin_lock(lock)
15070+
15071+#define spin_lock_bh(lock) \
15072+ do { \
15073+ local_bh_disable(); \
15074+ rt_spin_lock(lock); \
15075+ } while (0)
15076+
15077+#define spin_lock_irq(lock) spin_lock(lock)
15078+
15079+#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
15080+
15081+#define spin_trylock(lock) \
15082+({ \
15083+ int __locked; \
15084+ __locked = spin_do_trylock(lock); \
15085+ __locked; \
15086+})
15087+
15088+#ifdef CONFIG_LOCKDEP
15089+# define spin_lock_nested(lock, subclass) \
15090+ do { \
15091+ rt_spin_lock_nested(lock, subclass); \
15092+ } while (0)
15093+
15094+#define spin_lock_bh_nested(lock, subclass) \
15095+ do { \
15096+ local_bh_disable(); \
15097+ rt_spin_lock_nested(lock, subclass); \
15098+ } while (0)
15099+
15100+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15101+ do { \
15102+ typecheck(unsigned long, flags); \
15103+ flags = 0; \
15104+ rt_spin_lock_nested(lock, subclass); \
15105+ } while (0)
15106+#else
15107+# define spin_lock_nested(lock, subclass) spin_lock(lock)
15108+# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
15109+
15110+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15111+ do { \
15112+ typecheck(unsigned long, flags); \
15113+ flags = 0; \
15114+ spin_lock(lock); \
15115+ } while (0)
15116+#endif
15117+
15118+#define spin_lock_irqsave(lock, flags) \
15119+ do { \
15120+ typecheck(unsigned long, flags); \
15121+ flags = 0; \
15122+ spin_lock(lock); \
15123+ } while (0)
15124+
15125+static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
1a6e0f06 15126+{
15127+ unsigned long flags = 0;
15128+#ifdef CONFIG_TRACE_IRQFLAGS
15129+ flags = rt_spin_lock_trace_flags(lock);
15130+#else
15131+ spin_lock(lock); /* lock_local */
15132+#endif
15133+ return flags;
15134+}
15135+
15136+/* FIXME: we need rt_spin_lock_nest_lock */
15137+#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15138+
15139+#define spin_unlock(lock) rt_spin_unlock(lock)
15140+
15141+#define spin_unlock_bh(lock) \
15142+ do { \
15143+ rt_spin_unlock(lock); \
15144+ local_bh_enable(); \
15145+ } while (0)
15146+
15147+#define spin_unlock_irq(lock) spin_unlock(lock)
15148+
15149+#define spin_unlock_irqrestore(lock, flags) \
15150+ do { \
15151+ typecheck(unsigned long, flags); \
15152+ (void) flags; \
15153+ spin_unlock(lock); \
15154+ } while (0)
15155+
15156+#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
15157+#define spin_trylock_irq(lock) spin_trylock(lock)
15158+
15159+#define spin_trylock_irqsave(lock, flags) \
15160+ rt_spin_trylock_irqsave(lock, &(flags))
15161+
15162+#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
15163+
15164+#ifdef CONFIG_GENERIC_LOCKBREAK
15165+# define spin_is_contended(lock) ((lock)->break_lock)
15166+#else
15167+# define spin_is_contended(lock) (((void)(lock), 0))
15168+#endif
15169+
15170+static inline int spin_can_lock(spinlock_t *lock)
1a6e0f06 15171+{
e4b2b4a8 15172+ return !rt_mutex_is_locked(&lock->lock);
15173+}
15174+
e4b2b4a8 15175+static inline int spin_is_locked(spinlock_t *lock)
1a6e0f06 15176+{
e4b2b4a8 15177+ return rt_mutex_is_locked(&lock->lock);
15178+}
15179+
e4b2b4a8 15180+static inline void assert_spin_locked(spinlock_t *lock)
1a6e0f06 15181+{
e4b2b4a8 15182+ BUG_ON(!spin_is_locked(lock));
1a6e0f06 15183+}
1a6e0f06 15184+
15185+#define atomic_dec_and_lock(atomic, lock) \
15186+ atomic_dec_and_spin_lock(atomic, lock)
15187+
15188+#endif
15189diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types.h linux-4.14/include/linux/spinlock_types.h
15190--- linux-4.14.orig/include/linux/spinlock_types.h 2017-11-12 19:46:13.000000000 +0100
15191+++ linux-4.14/include/linux/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
15192@@ -9,80 +9,15 @@
15193 * Released under the General Public License (GPL).
15194 */
1a6e0f06 15195
15196-#if defined(CONFIG_SMP)
15197-# include <asm/spinlock_types.h>
15198-#else
15199-# include <linux/spinlock_types_up.h>
15200-#endif
15201-
15202-#include <linux/lockdep.h>
15203-
15204-typedef struct raw_spinlock {
15205- arch_spinlock_t raw_lock;
15206-#ifdef CONFIG_GENERIC_LOCKBREAK
15207- unsigned int break_lock;
15208-#endif
15209-#ifdef CONFIG_DEBUG_SPINLOCK
15210- unsigned int magic, owner_cpu;
15211- void *owner;
15212-#endif
15213-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15214- struct lockdep_map dep_map;
15215-#endif
15216-} raw_spinlock_t;
15217-
15218-#define SPINLOCK_MAGIC 0xdead4ead
15219-
15220-#define SPINLOCK_OWNER_INIT ((void *)-1L)
15221-
15222-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15223-# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15224-#else
15225-# define SPIN_DEP_MAP_INIT(lockname)
15226-#endif
15227+#include <linux/spinlock_types_raw.h>
1a6e0f06 15228
15229-#ifdef CONFIG_DEBUG_SPINLOCK
15230-# define SPIN_DEBUG_INIT(lockname) \
15231- .magic = SPINLOCK_MAGIC, \
15232- .owner_cpu = -1, \
15233- .owner = SPINLOCK_OWNER_INIT,
15234+#ifndef CONFIG_PREEMPT_RT_FULL
15235+# include <linux/spinlock_types_nort.h>
15236+# include <linux/rwlock_types.h>
15237 #else
15238-# define SPIN_DEBUG_INIT(lockname)
15239-#endif
15240-
15241-#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15242- { \
15243- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15244- SPIN_DEBUG_INIT(lockname) \
15245- SPIN_DEP_MAP_INIT(lockname) }
15246-
15247-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15248- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15249-
15250-#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15251-
15252-typedef struct spinlock {
15253- union {
15254- struct raw_spinlock rlock;
15255-
15256-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15257-# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15258- struct {
15259- u8 __padding[LOCK_PADSIZE];
15260- struct lockdep_map dep_map;
15261- };
15262+# include <linux/rtmutex.h>
15263+# include <linux/spinlock_types_rt.h>
15264+# include <linux/rwlock_types_rt.h>
15265 #endif
15266- };
15267-} spinlock_t;
15268-
15269-#define __SPIN_LOCK_INITIALIZER(lockname) \
15270- { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15271-
15272-#define __SPIN_LOCK_UNLOCKED(lockname) \
15273- (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15274-
15275-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15276-
15277-#include <linux/rwlock_types.h>
1a6e0f06 15278
15279 #endif /* __LINUX_SPINLOCK_TYPES_H */
15280diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_nort.h linux-4.14/include/linux/spinlock_types_nort.h
15281--- linux-4.14.orig/include/linux/spinlock_types_nort.h 1970-01-01 01:00:00.000000000 +0100
15282+++ linux-4.14/include/linux/spinlock_types_nort.h 2018-09-05 11:05:07.000000000 +0200
15283@@ -0,0 +1,33 @@
15284+#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15285+#define __LINUX_SPINLOCK_TYPES_NORT_H
1a6e0f06 15286+
15287+#ifndef __LINUX_SPINLOCK_TYPES_H
15288+#error "Do not include directly. Include spinlock_types.h instead"
15289+#endif
1a6e0f06 15290+
15291+/*
15292+ * The non RT version maps spinlocks to raw_spinlocks
15293+ */
15294+typedef struct spinlock {
15295+ union {
15296+ struct raw_spinlock rlock;
1a6e0f06 15297+
15298+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15299+# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15300+ struct {
15301+ u8 __padding[LOCK_PADSIZE];
15302+ struct lockdep_map dep_map;
15303+ };
1a6e0f06 15304+#endif
15305+ };
15306+} spinlock_t;
1a6e0f06 15307+
15308+#define __SPIN_LOCK_INITIALIZER(lockname) \
15309+ { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
1a6e0f06 15310+
15311+#define __SPIN_LOCK_UNLOCKED(lockname) \
15312+ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
1a6e0f06 15313+
e4b2b4a8 15314+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
1a6e0f06 15315+
15316+#endif
15317diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_raw.h linux-4.14/include/linux/spinlock_types_raw.h
15318--- linux-4.14.orig/include/linux/spinlock_types_raw.h 1970-01-01 01:00:00.000000000 +0100
15319+++ linux-4.14/include/linux/spinlock_types_raw.h 2018-09-05 11:05:07.000000000 +0200
15320@@ -0,0 +1,58 @@
15321+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15322+#define __LINUX_SPINLOCK_TYPES_RAW_H
1a6e0f06 15323+
e4b2b4a8 15324+#include <linux/types.h>
1a6e0f06 15325+
15326+#if defined(CONFIG_SMP)
15327+# include <asm/spinlock_types.h>
15328+#else
15329+# include <linux/spinlock_types_up.h>
15330+#endif
1a6e0f06 15331+
15332+#include <linux/lockdep.h>
15333+
15334+typedef struct raw_spinlock {
15335+ arch_spinlock_t raw_lock;
15336+#ifdef CONFIG_GENERIC_LOCKBREAK
15337+ unsigned int break_lock;
15338+#endif
15339+#ifdef CONFIG_DEBUG_SPINLOCK
15340+ unsigned int magic, owner_cpu;
15341+ void *owner;
15342+#endif
15343+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15344+ struct lockdep_map dep_map;
15345+#endif
15346+} raw_spinlock_t;
15347+
15348+#define SPINLOCK_MAGIC 0xdead4ead
15349+
15350+#define SPINLOCK_OWNER_INIT ((void *)-1L)
1a6e0f06 15351+
15352+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15353+# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
1a6e0f06 15354+#else
15355+# define SPIN_DEP_MAP_INIT(lockname)
15356+#endif
1a6e0f06 15357+
15358+#ifdef CONFIG_DEBUG_SPINLOCK
15359+# define SPIN_DEBUG_INIT(lockname) \
15360+ .magic = SPINLOCK_MAGIC, \
15361+ .owner_cpu = -1, \
15362+ .owner = SPINLOCK_OWNER_INIT,
15363+#else
15364+# define SPIN_DEBUG_INIT(lockname)
1a6e0f06 15365+#endif
15366+
15367+#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15368+ { \
15369+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15370+ SPIN_DEBUG_INIT(lockname) \
15371+ SPIN_DEP_MAP_INIT(lockname) }
15372+
15373+#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15374+ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15375+
15376+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15377+
1a6e0f06 15378+#endif
15379diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_rt.h linux-4.14/include/linux/spinlock_types_rt.h
15380--- linux-4.14.orig/include/linux/spinlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
15381+++ linux-4.14/include/linux/spinlock_types_rt.h 2018-09-05 11:05:07.000000000 +0200
15382@@ -0,0 +1,48 @@
15383+#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15384+#define __LINUX_SPINLOCK_TYPES_RT_H
15385+
15386+#ifndef __LINUX_SPINLOCK_TYPES_H
15387+#error "Do not include directly. Include spinlock_types.h instead"
1a6e0f06 15388+#endif
1a6e0f06 15389+
15390+#include <linux/cache.h>
15391+
15392+/*
15393+ * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15394+ */
15395+typedef struct spinlock {
15396+ struct rt_mutex lock;
15397+ unsigned int break_lock;
15398+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15399+ struct lockdep_map dep_map;
1a6e0f06 15400+#endif
e4b2b4a8 15401+} spinlock_t;
1a6e0f06 15402+
15403+#ifdef CONFIG_DEBUG_RT_MUTEXES
15404+# define __RT_SPIN_INITIALIZER(name) \
15405+ { \
15406+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15407+ .save_state = 1, \
15408+ .file = __FILE__, \
15409+ .line = __LINE__ , \
15410+ }
1a6e0f06 15411+#else
15412+# define __RT_SPIN_INITIALIZER(name) \
15413+ { \
15414+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15415+ .save_state = 1, \
15416+ }
1a6e0f06 15417+#endif
1a6e0f06 15418+
15419+/*
15420+.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15421+*/
15422+
15423+#define __SPIN_LOCK_UNLOCKED(name) \
15424+ { .lock = __RT_SPIN_INITIALIZER(name.lock), \
15425+ SPIN_DEP_MAP_INIT(name) }
15426+
15427+#define DEFINE_SPINLOCK(name) \
15428+ spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15429+
1a6e0f06 15430+#endif
15431diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_up.h linux-4.14/include/linux/spinlock_types_up.h
15432--- linux-4.14.orig/include/linux/spinlock_types_up.h 2017-11-12 19:46:13.000000000 +0100
15433+++ linux-4.14/include/linux/spinlock_types_up.h 2018-09-05 11:05:07.000000000 +0200
15434@@ -1,10 +1,6 @@
15435 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15436 #define __LINUX_SPINLOCK_TYPES_UP_H
1a6e0f06 15437
15438-#ifndef __LINUX_SPINLOCK_TYPES_H
15439-# error "please don't include this file directly"
15440-#endif
15441-
15442 /*
15443 * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15444 *
15445diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutiny.h linux-4.14/include/linux/srcutiny.h
15446--- linux-4.14.orig/include/linux/srcutiny.h 2017-11-12 19:46:13.000000000 +0100
15447+++ linux-4.14/include/linux/srcutiny.h 2018-09-05 11:05:07.000000000 +0200
15448@@ -43,7 +43,7 @@
1a6e0f06 15449
e4b2b4a8 15450 void srcu_drive_gp(struct work_struct *wp);
1a6e0f06 15451
15452-#define __SRCU_STRUCT_INIT(name) \
15453+#define __SRCU_STRUCT_INIT(name, __ignored) \
15454 { \
15455 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
15456 .srcu_cb_tail = &name.srcu_cb_head, \
15457@@ -56,9 +56,9 @@
15458 * Tree SRCU, which needs some per-CPU data.
1a6e0f06 15459 */
15460 #define DEFINE_SRCU(name) \
15461- struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15462+ struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15463 #define DEFINE_STATIC_SRCU(name) \
15464- static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15465+ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15466
15467 void synchronize_srcu(struct srcu_struct *sp);
15468
15469diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutree.h linux-4.14/include/linux/srcutree.h
15470--- linux-4.14.orig/include/linux/srcutree.h 2017-11-12 19:46:13.000000000 +0100
15471+++ linux-4.14/include/linux/srcutree.h 2018-09-05 11:05:07.000000000 +0200
15472@@ -40,7 +40,7 @@
15473 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
15474
15475 /* Update-side state. */
15476- raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15477+ spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15478 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
15479 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
15480 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
15481@@ -58,7 +58,7 @@
15482 * Node in SRCU combining tree, similar in function to rcu_data.
15483 */
15484 struct srcu_node {
15485- raw_spinlock_t __private lock;
15486+ spinlock_t __private lock;
15487 unsigned long srcu_have_cbs[4]; /* GP seq for children */
15488 /* having CBs, but only */
15489 /* is > ->srcu_gq_seq. */
15490@@ -78,7 +78,7 @@
15491 struct srcu_node *level[RCU_NUM_LVLS + 1];
15492 /* First node at each level. */
15493 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
15494- raw_spinlock_t __private lock; /* Protect counters */
15495+ spinlock_t __private lock; /* Protect counters */
15496 struct mutex srcu_gp_mutex; /* Serialize GP work. */
15497 unsigned int srcu_idx; /* Current rdr array element. */
15498 unsigned long srcu_gp_seq; /* Grace-period seq #. */
15499@@ -104,10 +104,10 @@
15500 #define SRCU_STATE_SCAN1 1
15501 #define SRCU_STATE_SCAN2 2
1a6e0f06 15502
15503-#define __SRCU_STRUCT_INIT(name) \
15504+#define __SRCU_STRUCT_INIT(name, pcpu_name) \
15505 { \
15506- .sda = &name##_srcu_data, \
15507- .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
15508+ .sda = &pcpu_name, \
15509+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
15510 .srcu_gp_seq_needed = 0 - 1, \
15511 __SRCU_DEP_MAP_INIT(name) \
15512 }
15513@@ -133,7 +133,7 @@
1a6e0f06 15514 */
15515 #define __DEFINE_SRCU(name, is_static) \
15516 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
15517- is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15518+ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
15519 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
15520 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
15521
15522diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/suspend.h linux-4.14/include/linux/suspend.h
15523--- linux-4.14.orig/include/linux/suspend.h 2018-09-05 11:03:22.000000000 +0200
15524+++ linux-4.14/include/linux/suspend.h 2018-09-05 11:05:07.000000000 +0200
15525@@ -196,6 +196,12 @@
15526 void (*end)(void);
15527 };
15528
15529+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15530+extern bool pm_in_action;
15531+#else
15532+# define pm_in_action false
15533+#endif
15534+
15535 #ifdef CONFIG_SUSPEND
15536 extern suspend_state_t mem_sleep_current;
15537 extern suspend_state_t mem_sleep_default;
15538diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swait.h linux-4.14/include/linux/swait.h
15539--- linux-4.14.orig/include/linux/swait.h 2017-11-12 19:46:13.000000000 +0100
15540+++ linux-4.14/include/linux/swait.h 2018-09-05 11:05:07.000000000 +0200
15541@@ -5,6 +5,7 @@
15542 #include <linux/list.h>
15543 #include <linux/stddef.h>
15544 #include <linux/spinlock.h>
15545+#include <linux/wait.h>
15546 #include <asm/current.h>
1a6e0f06 15547
15548 /*
15549@@ -147,6 +148,7 @@
15550 extern void swake_up(struct swait_queue_head *q);
15551 extern void swake_up_all(struct swait_queue_head *q);
15552 extern void swake_up_locked(struct swait_queue_head *q);
15553+extern void swake_up_all_locked(struct swait_queue_head *q);
1a6e0f06 15554
15555 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15556 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15557diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swap.h linux-4.14/include/linux/swap.h
15558--- linux-4.14.orig/include/linux/swap.h 2017-11-12 19:46:13.000000000 +0100
15559+++ linux-4.14/include/linux/swap.h 2018-09-05 11:05:07.000000000 +0200
15560@@ -12,6 +12,7 @@
15561 #include <linux/fs.h>
15562 #include <linux/atomic.h>
15563 #include <linux/page-flags.h>
15564+#include <linux/locallock.h>
15565 #include <asm/page.h>
15566
15567 struct notifier_block;
15568@@ -297,7 +298,8 @@
15569 void *workingset_eviction(struct address_space *mapping, struct page *page);
15570 bool workingset_refault(void *shadow);
15571 void workingset_activation(struct page *page);
15572-void workingset_update_node(struct radix_tree_node *node, void *private);
15573+void __workingset_update_node(struct radix_tree_node *node, void *private);
15574+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
1a6e0f06 15575
15576 /* linux/mm/page_alloc.c */
15577 extern unsigned long totalram_pages;
15578@@ -310,6 +312,7 @@
1a6e0f06 15579
1a6e0f06 15580
15581 /* linux/mm/swap.c */
15582+DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15583 extern void lru_cache_add(struct page *);
15584 extern void lru_cache_add_anon(struct page *page);
15585 extern void lru_cache_add_file(struct page *page);
15586diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swork.h linux-4.14/include/linux/swork.h
15587--- linux-4.14.orig/include/linux/swork.h 1970-01-01 01:00:00.000000000 +0100
15588+++ linux-4.14/include/linux/swork.h 2018-09-05 11:05:07.000000000 +0200
15589@@ -0,0 +1,24 @@
15590+#ifndef _LINUX_SWORK_H
15591+#define _LINUX_SWORK_H
15592+
15593+#include <linux/list.h>
15594+
15595+struct swork_event {
15596+ struct list_head item;
15597+ unsigned long flags;
15598+ void (*func)(struct swork_event *);
15599+};
15600+
15601+static inline void INIT_SWORK(struct swork_event *event,
15602+ void (*func)(struct swork_event *))
15603+{
15604+ event->flags = 0;
15605+ event->func = func;
15606+}
15607+
15608+bool swork_queue(struct swork_event *sev);
15609+
15610+int swork_get(void);
15611+void swork_put(void);
15612+
15613+#endif /* _LINUX_SWORK_H */
15614diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/thread_info.h linux-4.14/include/linux/thread_info.h
15615--- linux-4.14.orig/include/linux/thread_info.h 2018-09-05 11:03:22.000000000 +0200
15616+++ linux-4.14/include/linux/thread_info.h 2018-09-05 11:05:07.000000000 +0200
15617@@ -86,7 +86,17 @@
15618 #define test_thread_flag(flag) \
15619 test_ti_thread_flag(current_thread_info(), flag)
15620
15621-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15622+#ifdef CONFIG_PREEMPT_LAZY
15623+#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
15624+ test_thread_flag(TIF_NEED_RESCHED_LAZY))
15625+#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15626+#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
15627+
1a6e0f06 15628+#else
15629+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15630+#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15631+#define tif_need_resched_lazy() 0
1a6e0f06 15632+#endif
1a6e0f06 15633
15634 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
15635 static inline int arch_within_stack_frames(const void * const stack,
15636diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/timer.h linux-4.14/include/linux/timer.h
15637--- linux-4.14.orig/include/linux/timer.h 2018-09-05 11:03:22.000000000 +0200
15638+++ linux-4.14/include/linux/timer.h 2018-09-05 11:05:07.000000000 +0200
15639@@ -213,7 +213,7 @@
1a6e0f06 15640
e4b2b4a8 15641 extern int try_to_del_timer_sync(struct timer_list *timer);
1a6e0f06 15642
15643-#ifdef CONFIG_SMP
15644+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15645 extern int del_timer_sync(struct timer_list *timer);
15646 #else
15647 # define del_timer_sync(t) del_timer(t)
15648diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/trace_events.h linux-4.14/include/linux/trace_events.h
15649--- linux-4.14.orig/include/linux/trace_events.h 2017-11-12 19:46:13.000000000 +0100
15650+++ linux-4.14/include/linux/trace_events.h 2018-09-05 11:05:07.000000000 +0200
15651@@ -62,6 +62,9 @@
15652 unsigned char flags;
15653 unsigned char preempt_count;
15654 int pid;
15655+ unsigned short migrate_disable;
15656+ unsigned short padding;
15657+ unsigned char preempt_lazy_count;
15658 };
1a6e0f06 15659
15660 #define TRACE_EVENT_TYPE_MAX \
15661@@ -402,11 +405,13 @@
15662
15663 extern int filter_match_preds(struct event_filter *filter, void *rec);
15664
15665-extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
15666- void *rec);
15667-extern void event_triggers_post_call(struct trace_event_file *file,
15668- enum event_trigger_type tt,
15669- void *rec);
15670+extern enum event_trigger_type
15671+event_triggers_call(struct trace_event_file *file, void *rec,
15672+ struct ring_buffer_event *event);
15673+extern void
15674+event_triggers_post_call(struct trace_event_file *file,
15675+ enum event_trigger_type tt,
15676+ void *rec, struct ring_buffer_event *event);
1a6e0f06 15677
e4b2b4a8 15678 bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
1a6e0f06 15679
e4b2b4a8 15680@@ -426,7 +431,7 @@
1a6e0f06 15681
15682 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
15683 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
15684- event_triggers_call(file, NULL);
15685+ event_triggers_call(file, NULL, NULL);
15686 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
15687 return true;
15688 if (eflags & EVENT_FILE_FL_PID_FILTER)
15689diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/uaccess.h linux-4.14/include/linux/uaccess.h
15690--- linux-4.14.orig/include/linux/uaccess.h 2017-11-12 19:46:13.000000000 +0100
15691+++ linux-4.14/include/linux/uaccess.h 2018-09-05 11:05:07.000000000 +0200
15692@@ -185,6 +185,7 @@
15693 */
15694 static inline void pagefault_disable(void)
1a6e0f06 15695 {
15696+ migrate_disable();
15697 pagefault_disabled_inc();
15698 /*
15699 * make sure to have issued the store before a pagefault
15700@@ -201,6 +202,7 @@
15701 */
15702 barrier();
15703 pagefault_disabled_dec();
15704+ migrate_enable();
15705 }
1a6e0f06 15706
15707 /*
15708diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/vmstat.h linux-4.14/include/linux/vmstat.h
15709--- linux-4.14.orig/include/linux/vmstat.h 2017-11-12 19:46:13.000000000 +0100
15710+++ linux-4.14/include/linux/vmstat.h 2018-09-05 11:05:07.000000000 +0200
15711@@ -33,7 +33,9 @@
15712 */
15713 static inline void __count_vm_event(enum vm_event_item item)
15714 {
15715+ preempt_disable_rt();
15716 raw_cpu_inc(vm_event_states.event[item]);
15717+ preempt_enable_rt();
15718 }
15719
15720 static inline void count_vm_event(enum vm_event_item item)
15721@@ -43,7 +45,9 @@
15722
15723 static inline void __count_vm_events(enum vm_event_item item, long delta)
1a6e0f06 15724 {
15725+ preempt_disable_rt();
15726 raw_cpu_add(vm_event_states.event[item], delta);
15727+ preempt_enable_rt();
15728 }
15729
15730 static inline void count_vm_events(enum vm_event_item item, long delta)
15731diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/wait.h linux-4.14/include/linux/wait.h
15732--- linux-4.14.orig/include/linux/wait.h 2017-11-12 19:46:13.000000000 +0100
15733+++ linux-4.14/include/linux/wait.h 2018-09-05 11:05:07.000000000 +0200
15734@@ -10,6 +10,7 @@
15735
15736 #include <asm/current.h>
15737 #include <uapi/linux/wait.h>
15738+#include <linux/atomic.h>
15739
15740 typedef struct wait_queue_entry wait_queue_entry_t;
15741
15742@@ -486,8 +487,8 @@
15743 int __ret = 0; \
15744 struct hrtimer_sleeper __t; \
15745 \
15746- hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
15747- hrtimer_init_sleeper(&__t, current); \
15748+ hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \
15749+ current); \
15750 if ((timeout) != KTIME_MAX) \
15751 hrtimer_start_range_ns(&__t.timer, timeout, \
15752 current->timer_slack_ns, \
15753diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/gen_stats.h linux-4.14/include/net/gen_stats.h
15754--- linux-4.14.orig/include/net/gen_stats.h 2017-11-12 19:46:13.000000000 +0100
15755+++ linux-4.14/include/net/gen_stats.h 2018-09-05 11:05:07.000000000 +0200
15756@@ -6,6 +6,7 @@
15757 #include <linux/socket.h>
15758 #include <linux/rtnetlink.h>
15759 #include <linux/pkt_sched.h>
15760+#include <net/net_seq_lock.h>
15761
15762 struct gnet_stats_basic_cpu {
15763 struct gnet_stats_basic_packed bstats;
15764@@ -36,11 +37,11 @@
15765 spinlock_t *lock, struct gnet_dump *d,
15766 int padattr);
1a6e0f06 15767
15768-int gnet_stats_copy_basic(const seqcount_t *running,
15769+int gnet_stats_copy_basic(net_seqlock_t *running,
15770 struct gnet_dump *d,
15771 struct gnet_stats_basic_cpu __percpu *cpu,
15772 struct gnet_stats_basic_packed *b);
15773-void __gnet_stats_copy_basic(const seqcount_t *running,
15774+void __gnet_stats_copy_basic(net_seqlock_t *running,
15775 struct gnet_stats_basic_packed *bstats,
15776 struct gnet_stats_basic_cpu __percpu *cpu,
15777 struct gnet_stats_basic_packed *b);
15778@@ -57,13 +58,13 @@
15779 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15780 struct net_rate_estimator __rcu **rate_est,
15781 spinlock_t *stats_lock,
15782- seqcount_t *running, struct nlattr *opt);
15783+ net_seqlock_t *running, struct nlattr *opt);
15784 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
15785 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
15786 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15787 struct net_rate_estimator __rcu **ptr,
15788 spinlock_t *stats_lock,
15789- seqcount_t *running, struct nlattr *opt);
15790+ net_seqlock_t *running, struct nlattr *opt);
15791 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
15792 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
15793 struct gnet_stats_rate_est64 *sample);
15794diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/neighbour.h linux-4.14/include/net/neighbour.h
15795--- linux-4.14.orig/include/net/neighbour.h 2017-11-12 19:46:13.000000000 +0100
15796+++ linux-4.14/include/net/neighbour.h 2018-09-05 11:05:07.000000000 +0200
15797@@ -450,7 +450,7 @@
1a6e0f06 15798 }
e4b2b4a8 15799 #endif
1a6e0f06 15800
15801-static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15802+static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15803 {
15804 unsigned int seq;
15805 unsigned int hh_len;
15806@@ -474,7 +474,7 @@
1a6e0f06 15807
15808 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
15809 {
15810- const struct hh_cache *hh = &n->hh;
15811+ struct hh_cache *hh = &n->hh;
1a6e0f06 15812
15813 if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
15814 return neigh_hh_output(hh, skb);
15815@@ -515,7 +515,7 @@
1a6e0f06 15816
e4b2b4a8 15817 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
1a6e0f06 15818
15819-static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15820+static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15821 const struct net_device *dev)
15822 {
15823 unsigned int seq;
15824diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/net_seq_lock.h linux-4.14/include/net/net_seq_lock.h
15825--- linux-4.14.orig/include/net/net_seq_lock.h 1970-01-01 01:00:00.000000000 +0100
15826+++ linux-4.14/include/net/net_seq_lock.h 2018-09-05 11:05:07.000000000 +0200
15827@@ -0,0 +1,15 @@
15828+#ifndef __NET_NET_SEQ_LOCK_H__
15829+#define __NET_NET_SEQ_LOCK_H__
15830+
1a6e0f06 15831+#ifdef CONFIG_PREEMPT_RT_BASE
15832+# define net_seqlock_t seqlock_t
15833+# define net_seq_begin(__r) read_seqbegin(__r)
15834+# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
15835+
1a6e0f06 15836+#else
15837+# define net_seqlock_t seqcount_t
15838+# define net_seq_begin(__r) read_seqcount_begin(__r)
15839+# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
15840+#endif
15841+
15842+#endif
15843diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/sch_generic.h linux-4.14/include/net/sch_generic.h
15844--- linux-4.14.orig/include/net/sch_generic.h 2018-09-05 11:03:22.000000000 +0200
15845+++ linux-4.14/include/net/sch_generic.h 2018-09-05 11:05:07.000000000 +0200
15846@@ -10,6 +10,7 @@
15847 #include <linux/percpu.h>
15848 #include <linux/dynamic_queue_limits.h>
15849 #include <linux/list.h>
15850+#include <net/net_seq_lock.h>
15851 #include <linux/refcount.h>
15852 #include <linux/workqueue.h>
15853 #include <net/gen_stats.h>
15854@@ -90,7 +91,7 @@
15855 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
15856 struct qdisc_skb_head q;
15857 struct gnet_stats_basic_packed bstats;
15858- seqcount_t running;
15859+ net_seqlock_t running;
15860 struct gnet_stats_queue qstats;
15861 unsigned long state;
15862 struct Qdisc *next_sched;
15863@@ -109,13 +110,22 @@
15864 refcount_inc(&qdisc->refcnt);
15865 }
1a6e0f06 15866
15867-static inline bool qdisc_is_running(const struct Qdisc *qdisc)
15868+static inline bool qdisc_is_running(struct Qdisc *qdisc)
1a6e0f06 15869 {
15870+#ifdef CONFIG_PREEMPT_RT_BASE
15871+ return spin_is_locked(&qdisc->running.lock) ? true : false;
1a6e0f06 15872+#else
e4b2b4a8 15873 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
1a6e0f06 15874+#endif
e4b2b4a8 15875 }
1a6e0f06 15876
15877 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
15878 {
1a6e0f06 15879+#ifdef CONFIG_PREEMPT_RT_BASE
15880+ if (try_write_seqlock(&qdisc->running))
15881+ return true;
15882+ return false;
1a6e0f06 15883+#else
15884 if (qdisc_is_running(qdisc))
15885 return false;
15886 /* Variant of write_seqcount_begin() telling lockdep a trylock
15887@@ -124,11 +134,16 @@
15888 raw_write_seqcount_begin(&qdisc->running);
15889 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
15890 return true;
1a6e0f06 15891+#endif
e4b2b4a8 15892 }
1a6e0f06 15893
15894 static inline void qdisc_run_end(struct Qdisc *qdisc)
15895 {
1a6e0f06 15896+#ifdef CONFIG_PREEMPT_RT_BASE
e4b2b4a8 15897+ write_sequnlock(&qdisc->running);
1a6e0f06 15898+#else
e4b2b4a8 15899 write_seqcount_end(&qdisc->running);
1a6e0f06 15900+#endif
e4b2b4a8 15901 }
1a6e0f06 15902
15903 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
15904@@ -337,7 +352,7 @@
15905 return qdisc_lock(root);
15906 }
1a6e0f06 15907
15908-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15909+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15910 {
15911 struct Qdisc *root = qdisc_root_sleeping(qdisc);
1a6e0f06 15912
15913diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/xfrm.h linux-4.14/include/net/xfrm.h
15914--- linux-4.14.orig/include/net/xfrm.h 2018-09-05 11:03:22.000000000 +0200
15915+++ linux-4.14/include/net/xfrm.h 2018-09-05 11:05:07.000000000 +0200
15916@@ -217,7 +217,7 @@
15917 struct xfrm_stats stats;
15918
15919 struct xfrm_lifetime_cur curlft;
15920- struct tasklet_hrtimer mtimer;
15921+ struct hrtimer mtimer;
15922
15923 struct xfrm_state_offload xso;
15924
15925diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/trace/events/timer.h linux-4.14/include/trace/events/timer.h
15926--- linux-4.14.orig/include/trace/events/timer.h 2018-09-05 11:03:22.000000000 +0200
15927+++ linux-4.14/include/trace/events/timer.h 2018-09-05 11:05:07.000000000 +0200
15928@@ -148,7 +148,11 @@
15929 { HRTIMER_MODE_ABS, "ABS" }, \
15930 { HRTIMER_MODE_REL, "REL" }, \
15931 { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
15932- { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
15933+ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \
15934+ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
15935+ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
15936+ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
15937+ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
1a6e0f06 15938
15939 /**
15940 * hrtimer_init - called when the hrtimer is initialized
15941@@ -186,15 +190,16 @@
15942 */
15943 TRACE_EVENT(hrtimer_start,
15944
15945- TP_PROTO(struct hrtimer *hrtimer),
15946+ TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
15947
15948- TP_ARGS(hrtimer),
15949+ TP_ARGS(hrtimer, mode),
15950
15951 TP_STRUCT__entry(
15952 __field( void *, hrtimer )
15953 __field( void *, function )
15954 __field( s64, expires )
15955 __field( s64, softexpires )
15956+ __field( enum hrtimer_mode, mode )
15957 ),
15958
15959 TP_fast_assign(
15960@@ -202,12 +207,14 @@
15961 __entry->function = hrtimer->function;
15962 __entry->expires = hrtimer_get_expires(hrtimer);
15963 __entry->softexpires = hrtimer_get_softexpires(hrtimer);
15964+ __entry->mode = mode;
15965 ),
15966
15967- TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
15968- __entry->hrtimer, __entry->function,
15969+ TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
15970+ "mode=%s", __entry->hrtimer, __entry->function,
15971 (unsigned long long) __entry->expires,
15972- (unsigned long long) __entry->softexpires)
15973+ (unsigned long long) __entry->softexpires,
15974+ decode_hrtimer_mode(__entry->mode))
15975 );
1a6e0f06 15976
15977 /**
15978diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Kconfig linux-4.14/init/Kconfig
15979--- linux-4.14.orig/init/Kconfig 2018-09-05 11:03:22.000000000 +0200
15980+++ linux-4.14/init/Kconfig 2018-09-05 11:05:07.000000000 +0200
15981@@ -744,6 +744,7 @@
15982 config RT_GROUP_SCHED
15983 bool "Group scheduling for SCHED_RR/FIFO"
15984 depends on CGROUP_SCHED
15985+ depends on !PREEMPT_RT_FULL
15986 default n
15987 help
15988 This feature lets you explicitly allocate real CPU bandwidth
15989@@ -1533,6 +1534,7 @@
1a6e0f06 15990
15991 config SLAB
15992 bool "SLAB"
15993+ depends on !PREEMPT_RT_FULL
15994 select HAVE_HARDENED_USERCOPY_ALLOCATOR
15995 help
15996 The regular slab allocator that is established and known to work
15997@@ -1553,6 +1555,7 @@
15998 config SLOB
15999 depends on EXPERT
16000 bool "SLOB (Simple Allocator)"
16001+ depends on !PREEMPT_RT_FULL
16002 help
16003 SLOB replaces the stock allocator with a drastically simpler
16004 allocator. SLOB is generally more space efficient but
16005@@ -1594,7 +1597,7 @@
1a6e0f06 16006
16007 config SLUB_CPU_PARTIAL
16008 default y
16009- depends on SLUB && SMP
16010+ depends on SLUB && SMP && !PREEMPT_RT_FULL
16011 bool "SLUB per cpu partial cache"
16012 help
16013 Per cpu partial caches accellerate objects allocation and freeing
16014diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/main.c linux-4.14/init/main.c
16015--- linux-4.14.orig/init/main.c 2018-09-05 11:03:22.000000000 +0200
16016+++ linux-4.14/init/main.c 2018-09-05 11:05:07.000000000 +0200
16017@@ -543,6 +543,7 @@
16018 setup_command_line(command_line);
16019 setup_nr_cpu_ids();
16020 setup_per_cpu_areas();
16021+ softirq_early_init();
16022 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16023 boot_cpu_hotplug_init();
1a6e0f06 16024
16025diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Makefile linux-4.14/init/Makefile
16026--- linux-4.14.orig/init/Makefile 2017-11-12 19:46:13.000000000 +0100
16027+++ linux-4.14/init/Makefile 2018-09-05 11:05:07.000000000 +0200
16028@@ -36,4 +36,4 @@
16029 include/generated/compile.h: FORCE
16030 @$($(quiet)chk_compile.h)
16031 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16032- "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16033+ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16034diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cgroup.c linux-4.14/kernel/cgroup/cgroup.c
16035--- linux-4.14.orig/kernel/cgroup/cgroup.c 2018-09-05 11:03:22.000000000 +0200
16036+++ linux-4.14/kernel/cgroup/cgroup.c 2018-09-05 11:05:07.000000000 +0200
16037@@ -4508,10 +4508,10 @@
16038 queue_work(cgroup_destroy_wq, &css->destroy_work);
16039 }
16040
16041-static void css_release_work_fn(struct work_struct *work)
16042+static void css_release_work_fn(struct swork_event *sev)
1a6e0f06 16043 {
16044 struct cgroup_subsys_state *css =
16045- container_of(work, struct cgroup_subsys_state, destroy_work);
16046+ container_of(sev, struct cgroup_subsys_state, destroy_swork);
16047 struct cgroup_subsys *ss = css->ss;
16048 struct cgroup *cgrp = css->cgroup;
1a6e0f06 16049
16050@@ -4562,8 +4562,8 @@
16051 struct cgroup_subsys_state *css =
16052 container_of(ref, struct cgroup_subsys_state, refcnt);
1a6e0f06 16053
16054- INIT_WORK(&css->destroy_work, css_release_work_fn);
16055- queue_work(cgroup_destroy_wq, &css->destroy_work);
16056+ INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16057+ swork_queue(&css->destroy_swork);
16058 }
16059
16060 static void init_and_link_css(struct cgroup_subsys_state *css,
16061@@ -5269,6 +5269,7 @@
16062 */
16063 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16064 BUG_ON(!cgroup_destroy_wq);
16065+ BUG_ON(swork_get());
16066 return 0;
16067 }
16068 core_initcall(cgroup_wq_init);
16069diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cpuset.c linux-4.14/kernel/cgroup/cpuset.c
16070--- linux-4.14.orig/kernel/cgroup/cpuset.c 2017-11-12 19:46:13.000000000 +0100
16071+++ linux-4.14/kernel/cgroup/cpuset.c 2018-09-05 11:05:07.000000000 +0200
16072@@ -288,7 +288,7 @@
1a6e0f06 16073 */
1a6e0f06 16074
16075 static DEFINE_MUTEX(cpuset_mutex);
16076-static DEFINE_SPINLOCK(callback_lock);
16077+static DEFINE_RAW_SPINLOCK(callback_lock);
1a6e0f06 16078
e4b2b4a8 16079 static struct workqueue_struct *cpuset_migrate_mm_wq;
1a6e0f06 16080
16081@@ -926,9 +926,9 @@
16082 continue;
16083 rcu_read_unlock();
1a6e0f06 16084
16085- spin_lock_irq(&callback_lock);
16086+ raw_spin_lock_irq(&callback_lock);
16087 cpumask_copy(cp->effective_cpus, new_cpus);
16088- spin_unlock_irq(&callback_lock);
16089+ raw_spin_unlock_irq(&callback_lock);
1a6e0f06 16090
16091 WARN_ON(!is_in_v2_mode() &&
16092 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
16093@@ -993,9 +993,9 @@
16094 if (retval < 0)
16095 return retval;
16096
16097- spin_lock_irq(&callback_lock);
16098+ raw_spin_lock_irq(&callback_lock);
16099 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16100- spin_unlock_irq(&callback_lock);
16101+ raw_spin_unlock_irq(&callback_lock);
16102
16103 /* use trialcs->cpus_allowed as a temp variable */
16104 update_cpumasks_hier(cs, trialcs->cpus_allowed);
16105@@ -1179,9 +1179,9 @@
16106 continue;
16107 rcu_read_unlock();
16108
16109- spin_lock_irq(&callback_lock);
16110+ raw_spin_lock_irq(&callback_lock);
16111 cp->effective_mems = *new_mems;
16112- spin_unlock_irq(&callback_lock);
16113+ raw_spin_unlock_irq(&callback_lock);
16114
16115 WARN_ON(!is_in_v2_mode() &&
16116 !nodes_equal(cp->mems_allowed, cp->effective_mems));
16117@@ -1249,9 +1249,9 @@
16118 if (retval < 0)
16119 goto done;
16120
16121- spin_lock_irq(&callback_lock);
16122+ raw_spin_lock_irq(&callback_lock);
16123 cs->mems_allowed = trialcs->mems_allowed;
16124- spin_unlock_irq(&callback_lock);
16125+ raw_spin_unlock_irq(&callback_lock);
16126
16127 /* use trialcs->mems_allowed as a temp variable */
16128 update_nodemasks_hier(cs, &trialcs->mems_allowed);
16129@@ -1342,9 +1342,9 @@
16130 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16131 || (is_spread_page(cs) != is_spread_page(trialcs)));
16132
16133- spin_lock_irq(&callback_lock);
16134+ raw_spin_lock_irq(&callback_lock);
16135 cs->flags = trialcs->flags;
16136- spin_unlock_irq(&callback_lock);
16137+ raw_spin_unlock_irq(&callback_lock);
16138
16139 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16140 rebuild_sched_domains_locked();
16141@@ -1759,7 +1759,7 @@
16142 cpuset_filetype_t type = seq_cft(sf)->private;
16143 int ret = 0;
16144
16145- spin_lock_irq(&callback_lock);
16146+ raw_spin_lock_irq(&callback_lock);
16147
16148 switch (type) {
16149 case FILE_CPULIST:
16150@@ -1778,7 +1778,7 @@
16151 ret = -EINVAL;
16152 }
16153
16154- spin_unlock_irq(&callback_lock);
16155+ raw_spin_unlock_irq(&callback_lock);
16156 return ret;
16157 }
16158
16159@@ -1993,12 +1993,12 @@
16160
16161 cpuset_inc();
16162
16163- spin_lock_irq(&callback_lock);
16164+ raw_spin_lock_irq(&callback_lock);
16165 if (is_in_v2_mode()) {
16166 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16167 cs->effective_mems = parent->effective_mems;
16168 }
16169- spin_unlock_irq(&callback_lock);
16170+ raw_spin_unlock_irq(&callback_lock);
16171
16172 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16173 goto out_unlock;
16174@@ -2025,12 +2025,12 @@
16175 }
16176 rcu_read_unlock();
16177
16178- spin_lock_irq(&callback_lock);
16179+ raw_spin_lock_irq(&callback_lock);
16180 cs->mems_allowed = parent->mems_allowed;
16181 cs->effective_mems = parent->mems_allowed;
16182 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16183 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16184- spin_unlock_irq(&callback_lock);
16185+ raw_spin_unlock_irq(&callback_lock);
16186 out_unlock:
16187 mutex_unlock(&cpuset_mutex);
16188 return 0;
16189@@ -2069,7 +2069,7 @@
16190 static void cpuset_bind(struct cgroup_subsys_state *root_css)
16191 {
16192 mutex_lock(&cpuset_mutex);
16193- spin_lock_irq(&callback_lock);
16194+ raw_spin_lock_irq(&callback_lock);
16195
16196 if (is_in_v2_mode()) {
16197 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
16198@@ -2080,7 +2080,7 @@
16199 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16200 }
16201
16202- spin_unlock_irq(&callback_lock);
16203+ raw_spin_unlock_irq(&callback_lock);
16204 mutex_unlock(&cpuset_mutex);
16205 }
16206
16207@@ -2094,7 +2094,7 @@
16208 if (task_css_is_root(task, cpuset_cgrp_id))
16209 return;
16210
16211- set_cpus_allowed_ptr(task, &current->cpus_allowed);
16212+ set_cpus_allowed_ptr(task, current->cpus_ptr);
16213 task->mems_allowed = current->mems_allowed;
16214 }
16215
16216@@ -2178,12 +2178,12 @@
16217 {
16218 bool is_empty;
16219
16220- spin_lock_irq(&callback_lock);
16221+ raw_spin_lock_irq(&callback_lock);
16222 cpumask_copy(cs->cpus_allowed, new_cpus);
16223 cpumask_copy(cs->effective_cpus, new_cpus);
16224 cs->mems_allowed = *new_mems;
16225 cs->effective_mems = *new_mems;
16226- spin_unlock_irq(&callback_lock);
16227+ raw_spin_unlock_irq(&callback_lock);
16228
16229 /*
16230 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
16231@@ -2220,10 +2220,10 @@
16232 if (nodes_empty(*new_mems))
16233 *new_mems = parent_cs(cs)->effective_mems;
16234
16235- spin_lock_irq(&callback_lock);
16236+ raw_spin_lock_irq(&callback_lock);
16237 cpumask_copy(cs->effective_cpus, new_cpus);
16238 cs->effective_mems = *new_mems;
16239- spin_unlock_irq(&callback_lock);
16240+ raw_spin_unlock_irq(&callback_lock);
16241
16242 if (cpus_updated)
16243 update_tasks_cpumask(cs);
16244@@ -2316,21 +2316,21 @@
16245
16246 /* synchronize cpus_allowed to cpu_active_mask */
16247 if (cpus_updated) {
16248- spin_lock_irq(&callback_lock);
16249+ raw_spin_lock_irq(&callback_lock);
16250 if (!on_dfl)
16251 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16252 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16253- spin_unlock_irq(&callback_lock);
16254+ raw_spin_unlock_irq(&callback_lock);
16255 /* we don't mess with cpumasks of tasks in top_cpuset */
16256 }
16257
16258 /* synchronize mems_allowed to N_MEMORY */
16259 if (mems_updated) {
16260- spin_lock_irq(&callback_lock);
16261+ raw_spin_lock_irq(&callback_lock);
16262 if (!on_dfl)
16263 top_cpuset.mems_allowed = new_mems;
16264 top_cpuset.effective_mems = new_mems;
16265- spin_unlock_irq(&callback_lock);
16266+ raw_spin_unlock_irq(&callback_lock);
16267 update_tasks_nodemask(&top_cpuset);
16268 }
16269
16270@@ -2429,11 +2429,11 @@
16271 {
16272 unsigned long flags;
16273
16274- spin_lock_irqsave(&callback_lock, flags);
16275+ raw_spin_lock_irqsave(&callback_lock, flags);
16276 rcu_read_lock();
16277 guarantee_online_cpus(task_cs(tsk), pmask);
16278 rcu_read_unlock();
16279- spin_unlock_irqrestore(&callback_lock, flags);
16280+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16281 }
16282
16283 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
16284@@ -2481,11 +2481,11 @@
16285 nodemask_t mask;
16286 unsigned long flags;
16287
16288- spin_lock_irqsave(&callback_lock, flags);
16289+ raw_spin_lock_irqsave(&callback_lock, flags);
16290 rcu_read_lock();
16291 guarantee_online_mems(task_cs(tsk), &mask);
16292 rcu_read_unlock();
16293- spin_unlock_irqrestore(&callback_lock, flags);
16294+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16295
16296 return mask;
16297 }
16298@@ -2577,14 +2577,14 @@
16299 return true;
16300
16301 /* Not hardwall and node outside mems_allowed: scan up cpusets */
16302- spin_lock_irqsave(&callback_lock, flags);
16303+ raw_spin_lock_irqsave(&callback_lock, flags);
16304
16305 rcu_read_lock();
16306 cs = nearest_hardwall_ancestor(task_cs(current));
16307 allowed = node_isset(node, cs->mems_allowed);
16308 rcu_read_unlock();
16309
16310- spin_unlock_irqrestore(&callback_lock, flags);
16311+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16312 return allowed;
16313 }
16314
16315diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cpu.c linux-4.14/kernel/cpu.c
16316--- linux-4.14.orig/kernel/cpu.c 2018-09-05 11:03:22.000000000 +0200
16317+++ linux-4.14/kernel/cpu.c 2018-09-05 11:05:07.000000000 +0200
16318@@ -74,6 +74,11 @@
16319 .fail = CPUHP_INVALID,
16320 };
16321
16322+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16323+static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16324+ __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
16325+#endif
16326+
16327 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16328 static struct lockdep_map cpuhp_state_up_map =
16329 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
16330@@ -287,6 +292,55 @@
16331
16332 #ifdef CONFIG_HOTPLUG_CPU
16333
16334+/**
16335+ * pin_current_cpu - Prevent the current cpu from being unplugged
16336+ */
16337+void pin_current_cpu(void)
16338+{
16339+#ifdef CONFIG_PREEMPT_RT_FULL
16340+ struct rt_rw_lock *cpuhp_pin;
16341+ unsigned int cpu;
16342+ int ret;
16343+
16344+again:
16345+ cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16346+ ret = __read_rt_trylock(cpuhp_pin);
16347+ if (ret) {
16348+ current->pinned_on_cpu = smp_processor_id();
16349+ return;
16350+ }
16351+ cpu = smp_processor_id();
16352+ preempt_lazy_enable();
16353+ preempt_enable();
16354+
16355+ __read_rt_lock(cpuhp_pin);
16356+
16357+ preempt_disable();
16358+ preempt_lazy_disable();
16359+ if (cpu != smp_processor_id()) {
16360+ __read_rt_unlock(cpuhp_pin);
16361+ goto again;
16362+ }
16363+ current->pinned_on_cpu = cpu;
16364+#endif
16365+}
16366+
16367+/**
16368+ * unpin_current_cpu - Allow unplug of current cpu
16369+ */
16370+void unpin_current_cpu(void)
16371+{
16372+#ifdef CONFIG_PREEMPT_RT_FULL
16373+ struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16374+
16375+ if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
16376+ cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
16377+
16378+ current->pinned_on_cpu = -1;
16379+ __read_rt_unlock(cpuhp_pin);
16380+#endif
16381+}
16382+
16383 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
16384
16385 void cpus_read_lock(void)
16386@@ -843,6 +897,9 @@
16387
16388 static int takedown_cpu(unsigned int cpu)
16389 {
16390+#ifdef CONFIG_PREEMPT_RT_FULL
16391+ struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
16392+#endif
16393 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
16394 int err;
16395
16396@@ -855,11 +912,18 @@
16397 */
16398 irq_lock_sparse();
16399
16400+#ifdef CONFIG_PREEMPT_RT_FULL
16401+ __write_rt_lock(cpuhp_pin);
16402+#endif
16403+
16404 /*
16405 * So now all preempt/rcu users must observe !cpu_active().
16406 */
16407 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
16408 if (err) {
16409+#ifdef CONFIG_PREEMPT_RT_FULL
16410+ __write_rt_unlock(cpuhp_pin);
16411+#endif
16412 /* CPU refused to die */
16413 irq_unlock_sparse();
16414 /* Unpark the hotplug thread so we can rollback there */
16415@@ -878,6 +942,9 @@
16416 wait_for_ap_thread(st, false);
16417 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
16418
16419+#ifdef CONFIG_PREEMPT_RT_FULL
16420+ __write_rt_unlock(cpuhp_pin);
16421+#endif
16422 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
16423 irq_unlock_sparse();
16424
16425diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/debug/kdb/kdb_io.c linux-4.14/kernel/debug/kdb/kdb_io.c
16426--- linux-4.14.orig/kernel/debug/kdb/kdb_io.c 2018-09-05 11:03:22.000000000 +0200
16427+++ linux-4.14/kernel/debug/kdb/kdb_io.c 2018-09-05 11:05:07.000000000 +0200
16428@@ -854,9 +854,11 @@
16429 va_list ap;
16430 int r;
16431
16432+ kdb_trap_printk++;
16433 va_start(ap, fmt);
16434 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
16435 va_end(ap);
16436+ kdb_trap_printk--;
16437
16438 return r;
16439 }
16440diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/events/core.c linux-4.14/kernel/events/core.c
16441--- linux-4.14.orig/kernel/events/core.c 2018-09-05 11:03:22.000000000 +0200
16442+++ linux-4.14/kernel/events/core.c 2018-09-05 11:05:07.000000000 +0200
16443@@ -1065,7 +1065,7 @@
16444 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
16445
16446 raw_spin_lock_init(&cpuctx->hrtimer_lock);
16447- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
16448+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
16449 timer->function = perf_mux_hrtimer_handler;
16450 }
16451
16452@@ -8750,7 +8750,7 @@
16453 if (!is_sampling_event(event))
16454 return;
16455
16456- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16457+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
16458 hwc->hrtimer.function = perf_swevent_hrtimer;
16459
16460 /*
16461diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/exit.c linux-4.14/kernel/exit.c
16462--- linux-4.14.orig/kernel/exit.c 2018-09-05 11:03:22.000000000 +0200
16463+++ linux-4.14/kernel/exit.c 2018-09-05 11:05:07.000000000 +0200
16464@@ -159,7 +159,7 @@
16465 * Do this under ->siglock, we can race with another thread
16466 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
16467 */
16468- flush_sigqueue(&tsk->pending);
16469+ flush_task_sigqueue(tsk);
16470 tsk->sighand = NULL;
16471 spin_unlock(&sighand->siglock);
16472
16473diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/fork.c linux-4.14/kernel/fork.c
16474--- linux-4.14.orig/kernel/fork.c 2018-09-05 11:03:28.000000000 +0200
16475+++ linux-4.14/kernel/fork.c 2018-09-05 11:05:07.000000000 +0200
16476@@ -40,6 +40,7 @@
16477 #include <linux/hmm.h>
16478 #include <linux/fs.h>
16479 #include <linux/mm.h>
16480+#include <linux/kprobes.h>
16481 #include <linux/vmacache.h>
16482 #include <linux/nsproxy.h>
16483 #include <linux/capability.h>
16484@@ -407,13 +408,24 @@
16485 if (atomic_dec_and_test(&sig->sigcnt))
16486 free_signal_struct(sig);
16487 }
16488-
16489+#ifdef CONFIG_PREEMPT_RT_BASE
16490+static
16491+#endif
16492 void __put_task_struct(struct task_struct *tsk)
16493 {
16494 WARN_ON(!tsk->exit_state);
16495 WARN_ON(atomic_read(&tsk->usage));
16496 WARN_ON(tsk == current);
16497
16498+ /*
16499+ * Remove function-return probe instances associated with this
16500+ * task and put them back on the free list.
16501+ */
16502+ kprobe_flush_task(tsk);
16503+
16504+ /* Task is done with its stack. */
16505+ put_task_stack(tsk);
16506+
16507 cgroup_free(tsk);
16508 task_numa_free(tsk);
16509 security_task_free(tsk);
16510@@ -424,7 +436,18 @@
16511 if (!profile_handoff_task(tsk))
16512 free_task(tsk);
16513 }
16514+#ifndef CONFIG_PREEMPT_RT_BASE
16515 EXPORT_SYMBOL_GPL(__put_task_struct);
16516+#else
16517+void __put_task_struct_cb(struct rcu_head *rhp)
16518+{
16519+ struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
16520+
16521+ __put_task_struct(tsk);
16522+
16523+}
16524+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
16525+#endif
16526
16527 void __init __weak arch_task_cache_init(void) { }
16528
16529@@ -563,7 +586,8 @@
16530 #ifdef CONFIG_CC_STACKPROTECTOR
16531 tsk->stack_canary = get_random_canary();
16532 #endif
16533-
16534+ if (orig->cpus_ptr == &orig->cpus_mask)
16535+ tsk->cpus_ptr = &tsk->cpus_mask;
16536 /*
16537 * One for us, one for whoever does the "release_task()" (usually
16538 * parent)
16539@@ -575,6 +599,7 @@
16540 tsk->splice_pipe = NULL;
16541 tsk->task_frag.page = NULL;
16542 tsk->wake_q.next = NULL;
16543+ tsk->wake_q_sleeper.next = NULL;
16544
16545 account_kernel_stack(tsk, 1);
16546
16547@@ -915,6 +940,19 @@
16548 }
16549 EXPORT_SYMBOL_GPL(__mmdrop);
16550
16551+#ifdef CONFIG_PREEMPT_RT_BASE
16552+/*
16553+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
16554+ * want another facility to make this work.
16555+ */
16556+void __mmdrop_delayed(struct rcu_head *rhp)
16557+{
16558+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
16559+
16560+ __mmdrop(mm);
16561+}
16562+#endif
16563+
16564 static inline void __mmput(struct mm_struct *mm)
16565 {
16566 VM_BUG_ON(atomic_read(&mm->mm_users));
16567@@ -1494,6 +1532,9 @@
16568 */
16569 static void posix_cpu_timers_init(struct task_struct *tsk)
16570 {
16571+#ifdef CONFIG_PREEMPT_RT_BASE
16572+ tsk->posix_timer_list = NULL;
16573+#endif
16574 tsk->cputime_expires.prof_exp = 0;
16575 tsk->cputime_expires.virt_exp = 0;
16576 tsk->cputime_expires.sched_exp = 0;
16577@@ -1646,6 +1687,7 @@
16578 spin_lock_init(&p->alloc_lock);
16579
16580 init_sigpending(&p->pending);
16581+ p->sigqueue_cache = NULL;
16582
16583 p->utime = p->stime = p->gtime = 0;
16584 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
16585diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/futex.c linux-4.14/kernel/futex.c
16586--- linux-4.14.orig/kernel/futex.c 2018-09-05 11:03:22.000000000 +0200
16587+++ linux-4.14/kernel/futex.c 2018-09-05 11:05:07.000000000 +0200
16588@@ -936,7 +936,9 @@
16589 if (head->next != next) {
16590 /* retain curr->pi_lock for the loop invariant */
16591 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
16592+ raw_spin_unlock_irq(&curr->pi_lock);
16593 spin_unlock(&hb->lock);
16594+ raw_spin_lock_irq(&curr->pi_lock);
16595 put_pi_state(pi_state);
16596 continue;
16597 }
16598@@ -1430,6 +1432,7 @@
16599 struct task_struct *new_owner;
16600 bool postunlock = false;
16601 DEFINE_WAKE_Q(wake_q);
16602+ DEFINE_WAKE_Q(wake_sleeper_q);
16603 int ret = 0;
16604
16605 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
16606@@ -1491,13 +1494,13 @@
16607 pi_state->owner = new_owner;
16608 raw_spin_unlock(&new_owner->pi_lock);
16609
16610- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
16611-
16612+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
16613+ &wake_sleeper_q);
16614 out_unlock:
16615 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
16616
16617 if (postunlock)
16618- rt_mutex_postunlock(&wake_q);
16619+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
16620
16621 return ret;
16622 }
16623@@ -2104,6 +2107,16 @@
16624 requeue_pi_wake_futex(this, &key2, hb2);
16625 drop_count++;
16626 continue;
16627+ } else if (ret == -EAGAIN) {
16628+ /*
16629+ * Waiter was woken by timeout or
16630+ * signal and has set pi_blocked_on to
16631+ * PI_WAKEUP_INPROGRESS before we
16632+ * tried to enqueue it on the rtmutex.
16633+ */
16634+ this->pi_state = NULL;
16635+ put_pi_state(pi_state);
16636+ continue;
16637 } else if (ret) {
16638 /*
16639 * rt_mutex_start_proxy_lock() detected a
16640@@ -2642,10 +2655,9 @@
16641 if (abs_time) {
16642 to = &timeout;
16643
16644- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16645- CLOCK_REALTIME : CLOCK_MONOTONIC,
16646- HRTIMER_MODE_ABS);
16647- hrtimer_init_sleeper(to, current);
16648+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16649+ CLOCK_REALTIME : CLOCK_MONOTONIC,
16650+ HRTIMER_MODE_ABS, current);
16651 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16652 current->timer_slack_ns);
16653 }
16654@@ -2744,9 +2756,8 @@
16655
16656 if (time) {
16657 to = &timeout;
16658- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
16659- HRTIMER_MODE_ABS);
16660- hrtimer_init_sleeper(to, current);
16661+ hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
16662+ HRTIMER_MODE_ABS, current);
16663 hrtimer_set_expires(&to->timer, *time);
16664 }
16665
16666@@ -2801,7 +2812,7 @@
16667 goto no_block;
16668 }
16669
16670- rt_mutex_init_waiter(&rt_waiter);
16671+ rt_mutex_init_waiter(&rt_waiter, false);
16672
16673 /*
16674 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
16675@@ -2816,9 +2827,18 @@
16676 * lock handoff sequence.
16677 */
16678 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
16679+ /*
16680+ * the migrate_disable() here disables migration in the in_atomic() fast
16681+ * path which is enabled again in the following spin_unlock(). We have
16682+ * one migrate_disable() pending in the slow-path which is reversed
16683+ * after the raw_spin_unlock_irq() where we leave the atomic context.
16684+ */
16685+ migrate_disable();
16686+
16687 spin_unlock(q.lock_ptr);
16688 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
16689 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
16690+ migrate_enable();
16691
16692 if (ret) {
16693 if (ret == 1)
16694@@ -2965,11 +2985,21 @@
16695 * observed.
16696 */
16697 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
16698+ /*
16699+ * Magic trickery for now to make the RT migrate disable
16700+ * logic happy. The following spin_unlock() happens with
16701+ * interrupts disabled so the internal migrate_enable()
16702+ * won't undo the migrate_disable() which was issued when
16703+ * locking hb->lock.
16704+ */
16705+ migrate_disable();
16706 spin_unlock(&hb->lock);
16707
16708 /* drops pi_state->pi_mutex.wait_lock */
16709 ret = wake_futex_pi(uaddr, uval, pi_state);
16710
16711+ migrate_enable();
16712+
16713 put_pi_state(pi_state);
16714
16715 /*
16716@@ -3127,7 +3157,7 @@
16717 struct hrtimer_sleeper timeout, *to = NULL;
16718 struct futex_pi_state *pi_state = NULL;
16719 struct rt_mutex_waiter rt_waiter;
16720- struct futex_hash_bucket *hb;
16721+ struct futex_hash_bucket *hb, *hb2;
16722 union futex_key key2 = FUTEX_KEY_INIT;
16723 struct futex_q q = futex_q_init;
16724 int res, ret;
16725@@ -3143,10 +3173,9 @@
16726
16727 if (abs_time) {
16728 to = &timeout;
16729- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16730- CLOCK_REALTIME : CLOCK_MONOTONIC,
16731- HRTIMER_MODE_ABS);
16732- hrtimer_init_sleeper(to, current);
16733+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16734+ CLOCK_REALTIME : CLOCK_MONOTONIC,
16735+ HRTIMER_MODE_ABS, current);
16736 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16737 current->timer_slack_ns);
16738 }
16739@@ -3155,7 +3184,7 @@
16740 * The waiter is allocated on our stack, manipulated by the requeue
16741 * code while we sleep on uaddr.
16742 */
16743- rt_mutex_init_waiter(&rt_waiter);
16744+ rt_mutex_init_waiter(&rt_waiter, false);
16745
16746 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
16747 if (unlikely(ret != 0))
16748@@ -3186,20 +3215,55 @@
16749 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
16750 futex_wait_queue_me(hb, &q, to);
16751
16752- spin_lock(&hb->lock);
16753- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16754- spin_unlock(&hb->lock);
16755- if (ret)
16756- goto out_put_keys;
16757+ /*
16758+ * On RT we must avoid races with requeue and trying to block
16759+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
16760+ * serializing access to pi_blocked_on with pi_lock.
16761+ */
16762+ raw_spin_lock_irq(&current->pi_lock);
16763+ if (current->pi_blocked_on) {
16764+ /*
16765+ * We have been requeued or are in the process of
16766+ * being requeued.
16767+ */
16768+ raw_spin_unlock_irq(&current->pi_lock);
16769+ } else {
16770+ /*
16771+ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
16772+ * prevents a concurrent requeue from moving us to the
16773+ * uaddr2 rtmutex. After that we can safely acquire
16774+ * (and possibly block on) hb->lock.
16775+ */
16776+ current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
16777+ raw_spin_unlock_irq(&current->pi_lock);
16778+
16779+ spin_lock(&hb->lock);
16780+
16781+ /*
16782+ * Clean up pi_blocked_on. We might leak it otherwise
16783+ * when we succeeded with the hb->lock in the fast
16784+ * path.
16785+ */
16786+ raw_spin_lock_irq(&current->pi_lock);
16787+ current->pi_blocked_on = NULL;
16788+ raw_spin_unlock_irq(&current->pi_lock);
16789+
16790+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16791+ spin_unlock(&hb->lock);
16792+ if (ret)
16793+ goto out_put_keys;
16794+ }
16795
16796 /*
16797- * In order for us to be here, we know our q.key == key2, and since
16798- * we took the hb->lock above, we also know that futex_requeue() has
16799- * completed and we no longer have to concern ourselves with a wakeup
16800- * race with the atomic proxy lock acquisition by the requeue code. The
16801- * futex_requeue dropped our key1 reference and incremented our key2
16802- * reference count.
16803+ * In order to be here, we have either been requeued, are in
16804+ * the process of being requeued, or requeue successfully
16805+ * acquired uaddr2 on our behalf. If pi_blocked_on was
16806+ * non-null above, we may be racing with a requeue. Do not
16807+ * rely on q->lock_ptr to be hb2->lock until after blocking on
16808+ * hb->lock or hb2->lock. The futex_requeue dropped our key1
16809+ * reference and incremented our key2 reference count.
16810 */
16811+ hb2 = hash_futex(&key2);
16812
16813 /* Check if the requeue code acquired the second futex for us. */
16814 if (!q.rt_waiter) {
16815@@ -3208,7 +3272,8 @@
16816 * did a lock-steal - fix up the PI-state in that case.
16817 */
16818 if (q.pi_state && (q.pi_state->owner != current)) {
16819- spin_lock(q.lock_ptr);
16820+ spin_lock(&hb2->lock);
16821+ BUG_ON(&hb2->lock != q.lock_ptr);
16822 ret = fixup_pi_state_owner(uaddr2, &q, current);
16823 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
16824 pi_state = q.pi_state;
16825@@ -3219,7 +3284,7 @@
16826 * the requeue_pi() code acquired for us.
16827 */
16828 put_pi_state(q.pi_state);
16829- spin_unlock(q.lock_ptr);
16830+ spin_unlock(&hb2->lock);
16831 }
16832 } else {
16833 struct rt_mutex *pi_mutex;
16834@@ -3233,7 +3298,8 @@
16835 pi_mutex = &q.pi_state->pi_mutex;
16836 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
16837
16838- spin_lock(q.lock_ptr);
16839+ spin_lock(&hb2->lock);
16840+ BUG_ON(&hb2->lock != q.lock_ptr);
16841 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
16842 ret = 0;
16843
16844diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/handle.c linux-4.14/kernel/irq/handle.c
16845--- linux-4.14.orig/kernel/irq/handle.c 2017-11-12 19:46:13.000000000 +0100
16846+++ linux-4.14/kernel/irq/handle.c 2018-09-05 11:05:07.000000000 +0200
16847@@ -183,10 +183,16 @@
16848 {
16849 irqreturn_t retval;
16850 unsigned int flags = 0;
16851+ struct pt_regs *regs = get_irq_regs();
16852+ u64 ip = regs ? instruction_pointer(regs) : 0;
16853
16854 retval = __handle_irq_event_percpu(desc, &flags);
16855
16856- add_interrupt_randomness(desc->irq_data.irq, flags);
16857+#ifdef CONFIG_PREEMPT_RT_FULL
16858+ desc->random_ip = ip;
16859+#else
16860+ add_interrupt_randomness(desc->irq_data.irq, flags, ip);
16861+#endif
16862
16863 if (!noirqdebug)
16864 note_interrupt(desc, retval);
16865diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/manage.c linux-4.14/kernel/irq/manage.c
16866--- linux-4.14.orig/kernel/irq/manage.c 2018-09-05 11:03:22.000000000 +0200
16867+++ linux-4.14/kernel/irq/manage.c 2018-09-05 11:05:07.000000000 +0200
16868@@ -24,6 +24,7 @@
16869 #include "internals.h"
16870
16871 #ifdef CONFIG_IRQ_FORCED_THREADING
16872+# ifndef CONFIG_PREEMPT_RT_BASE
16873 __read_mostly bool force_irqthreads;
16874
16875 static int __init setup_forced_irqthreads(char *arg)
16876@@ -32,6 +33,7 @@
16877 return 0;
16878 }
16879 early_param("threadirqs", setup_forced_irqthreads);
16880+# endif
16881 #endif
16882
16883 static void __synchronize_hardirq(struct irq_desc *desc)
16884@@ -224,7 +226,12 @@
16885
16886 if (desc->affinity_notify) {
16887 kref_get(&desc->affinity_notify->kref);
16888+
16889+#ifdef CONFIG_PREEMPT_RT_BASE
16890+ swork_queue(&desc->affinity_notify->swork);
16891+#else
16892 schedule_work(&desc->affinity_notify->work);
16893+#endif
16894 }
16895 irqd_set(data, IRQD_AFFINITY_SET);
16896
16897@@ -262,10 +269,8 @@
16898 }
16899 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
16900
16901-static void irq_affinity_notify(struct work_struct *work)
16902+static void _irq_affinity_notify(struct irq_affinity_notify *notify)
16903 {
16904- struct irq_affinity_notify *notify =
16905- container_of(work, struct irq_affinity_notify, work);
16906 struct irq_desc *desc = irq_to_desc(notify->irq);
16907 cpumask_var_t cpumask;
16908 unsigned long flags;
16909@@ -287,6 +292,35 @@
16910 kref_put(&notify->kref, notify->release);
16911 }
16912
16913+#ifdef CONFIG_PREEMPT_RT_BASE
16914+static void init_helper_thread(void)
16915+{
16916+ static int init_sworker_once;
16917+
16918+ if (init_sworker_once)
16919+ return;
16920+ if (WARN_ON(swork_get()))
16921+ return;
16922+ init_sworker_once = 1;
16923+}
16924+
16925+static void irq_affinity_notify(struct swork_event *swork)
16926+{
16927+ struct irq_affinity_notify *notify =
16928+ container_of(swork, struct irq_affinity_notify, swork);
16929+ _irq_affinity_notify(notify);
16930+}
16931+
16932+#else
16933+
16934+static void irq_affinity_notify(struct work_struct *work)
16935+{
16936+ struct irq_affinity_notify *notify =
16937+ container_of(work, struct irq_affinity_notify, work);
16938+ _irq_affinity_notify(notify);
16939+}
16940+#endif
16941+
16942 /**
16943 * irq_set_affinity_notifier - control notification of IRQ affinity changes
16944 * @irq: Interrupt for which to enable/disable notification
16945@@ -315,7 +349,12 @@
16946 if (notify) {
16947 notify->irq = irq;
16948 kref_init(&notify->kref);
16949+#ifdef CONFIG_PREEMPT_RT_BASE
16950+ INIT_SWORK(&notify->swork, irq_affinity_notify);
16951+ init_helper_thread();
16952+#else
16953 INIT_WORK(&notify->work, irq_affinity_notify);
16954+#endif
16955 }
16956
16957 raw_spin_lock_irqsave(&desc->lock, flags);
16958@@ -883,7 +922,15 @@
16959 local_bh_disable();
16960 ret = action->thread_fn(action->irq, action->dev_id);
16961 irq_finalize_oneshot(desc, action);
16962- local_bh_enable();
16963+ /*
16964+ * Interrupts which have real time requirements can be set up
16965+ * to avoid softirq processing in the thread handler. This is
16966+ * safe as these interrupts do not raise soft interrupts.
16967+ */
16968+ if (irq_settings_no_softirq_call(desc))
16969+ _local_bh_enable();
16970+ else
16971+ local_bh_enable();
16972 return ret;
16973 }
16974
16975@@ -980,6 +1027,12 @@
16976 if (action_ret == IRQ_WAKE_THREAD)
16977 irq_wake_secondary(desc, action);
16978
16979+#ifdef CONFIG_PREEMPT_RT_FULL
16980+ migrate_disable();
16981+ add_interrupt_randomness(action->irq, 0,
16982+ desc->random_ip ^ (unsigned long) action);
16983+ migrate_enable();
16984+#endif
16985 wake_threads_waitq(desc);
16986 }
16987
16988@@ -1378,6 +1431,9 @@
16989 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
16990 }
16991
16992+ if (new->flags & IRQF_NO_SOFTIRQ_CALL)
16993+ irq_settings_set_no_softirq_call(desc);
16994+
16995 if (irq_settings_can_autoenable(desc)) {
16996 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
16997 } else {
16998@@ -2159,7 +2215,7 @@
16999 * This call sets the internal irqchip state of an interrupt,
17000 * depending on the value of @which.
17001 *
17002- * This function should be called with preemption disabled if the
17003+ * This function should be called with migration disabled if the
17004 * interrupt controller has per-cpu registers.
17005 */
17006 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
17007diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/settings.h linux-4.14/kernel/irq/settings.h
17008--- linux-4.14.orig/kernel/irq/settings.h 2017-11-12 19:46:13.000000000 +0100
17009+++ linux-4.14/kernel/irq/settings.h 2018-09-05 11:05:07.000000000 +0200
17010@@ -17,6 +17,7 @@
17011 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17012 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17013 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
17014+ _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
17015 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17016 };
17017
17018@@ -31,6 +32,7 @@
17019 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
17020 #define IRQ_IS_POLLED GOT_YOU_MORON
17021 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
17022+#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
17023 #undef IRQF_MODIFY_MASK
17024 #define IRQF_MODIFY_MASK GOT_YOU_MORON
17025
17026@@ -41,6 +43,16 @@
17027 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17028 }
17029
17030+static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17031+{
17032+ return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17033+}
17034+
17035+static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17036+{
17037+ desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17038+}
17039+
17040 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17041 {
17042 return desc->status_use_accessors & _IRQ_PER_CPU;
17043diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/spurious.c linux-4.14/kernel/irq/spurious.c
17044--- linux-4.14.orig/kernel/irq/spurious.c 2017-11-12 19:46:13.000000000 +0100
17045+++ linux-4.14/kernel/irq/spurious.c 2018-09-05 11:05:07.000000000 +0200
17046@@ -445,6 +445,10 @@
17047
17048 static int __init irqfixup_setup(char *str)
17049 {
17050+#ifdef CONFIG_PREEMPT_RT_BASE
17051+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17052+ return 1;
17053+#endif
17054 irqfixup = 1;
17055 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17056 printk(KERN_WARNING "This may impact system performance.\n");
17057@@ -457,6 +461,10 @@
17058
17059 static int __init irqpoll_setup(char *str)
17060 {
17061+#ifdef CONFIG_PREEMPT_RT_BASE
17062+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17063+ return 1;
17064+#endif
17065 irqfixup = 2;
17066 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17067 "enabled\n");
17068diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq_work.c linux-4.14/kernel/irq_work.c
17069--- linux-4.14.orig/kernel/irq_work.c 2017-11-12 19:46:13.000000000 +0100
17070+++ linux-4.14/kernel/irq_work.c 2018-09-05 11:05:07.000000000 +0200
17071@@ -17,6 +17,7 @@
17072 #include <linux/cpu.h>
17073 #include <linux/notifier.h>
17074 #include <linux/smp.h>
17075+#include <linux/interrupt.h>
17076 #include <asm/processor.h>
17077
17078
17079@@ -65,6 +66,8 @@
17080 */
17081 bool irq_work_queue_on(struct irq_work *work, int cpu)
17082 {
17083+ struct llist_head *list;
17084+
17085 /* All work should have been flushed before going offline */
17086 WARN_ON_ONCE(cpu_is_offline(cpu));
17087
17088@@ -75,7 +78,12 @@
17089 if (!irq_work_claim(work))
17090 return false;
17091
17092- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17093+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17094+ list = &per_cpu(lazy_list, cpu);
17095+ else
17096+ list = &per_cpu(raised_list, cpu);
17097+
17098+ if (llist_add(&work->llnode, list))
17099 arch_send_call_function_single_ipi(cpu);
17100
17101 return true;
17102@@ -86,6 +94,9 @@
17103 /* Enqueue the irq work @work on the current CPU */
17104 bool irq_work_queue(struct irq_work *work)
17105 {
17106+ struct llist_head *list;
17107+ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17108+
17109 /* Only queue if not already pending */
17110 if (!irq_work_claim(work))
17111 return false;
17112@@ -93,13 +104,15 @@
17113 /* Queue the entry and raise the IPI if needed. */
17114 preempt_disable();
17115
17116- /* If the work is "lazy", handle it from next tick if any */
17117- if (work->flags & IRQ_WORK_LAZY) {
17118- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17119- tick_nohz_tick_stopped())
17120- arch_irq_work_raise();
17121- } else {
17122- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17123+ lazy_work = work->flags & IRQ_WORK_LAZY;
17124+
17125+ if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17126+ list = this_cpu_ptr(&lazy_list);
17127+ else
17128+ list = this_cpu_ptr(&raised_list);
17129+
17130+ if (llist_add(&work->llnode, list)) {
17131+ if (!lazy_work || tick_nohz_tick_stopped())
17132 arch_irq_work_raise();
17133 }
17134
17135@@ -116,9 +129,8 @@
17136 raised = this_cpu_ptr(&raised_list);
17137 lazy = this_cpu_ptr(&lazy_list);
17138
17139- if (llist_empty(raised) || arch_irq_work_has_interrupt())
17140- if (llist_empty(lazy))
17141- return false;
17142+ if (llist_empty(raised) && llist_empty(lazy))
17143+ return false;
17144
17145 /* All work should have been flushed before going offline */
17146 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17147@@ -132,7 +144,7 @@
17148 struct irq_work *work;
17149 struct llist_node *llnode;
17150
17151- BUG_ON(!irqs_disabled());
17152+ BUG_ON_NONRT(!irqs_disabled());
17153
17154 if (llist_empty(list))
17155 return;
17156@@ -169,7 +181,16 @@
17157 void irq_work_run(void)
17158 {
17159 irq_work_run_list(this_cpu_ptr(&raised_list));
17160- irq_work_run_list(this_cpu_ptr(&lazy_list));
17161+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17162+ /*
17163+ * NOTE: we raise softirq via IPI for safety,
17164+ * and execute in irq_work_tick() to move the
17165+ * overhead from hard to soft irq context.
17166+ */
17167+ if (!llist_empty(this_cpu_ptr(&lazy_list)))
17168+ raise_softirq(TIMER_SOFTIRQ);
17169+ } else
17170+ irq_work_run_list(this_cpu_ptr(&lazy_list));
17171 }
17172 EXPORT_SYMBOL_GPL(irq_work_run);
17173
17174@@ -179,8 +200,17 @@
17175
17176 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17177 irq_work_run_list(raised);
17178+
17179+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17180+ irq_work_run_list(this_cpu_ptr(&lazy_list));
17181+}
17182+
17183+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17184+void irq_work_tick_soft(void)
17185+{
17186 irq_work_run_list(this_cpu_ptr(&lazy_list));
17187 }
17188+#endif
17189
17190 /*
17191 * Synchronize against the irq_work @entry, ensures the entry is not
17192diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.locks linux-4.14/kernel/Kconfig.locks
17193--- linux-4.14.orig/kernel/Kconfig.locks 2017-11-12 19:46:13.000000000 +0100
17194+++ linux-4.14/kernel/Kconfig.locks 2018-09-05 11:05:07.000000000 +0200
17195@@ -225,11 +225,11 @@
17196
17197 config MUTEX_SPIN_ON_OWNER
17198 def_bool y
17199- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
17200+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17201
17202 config RWSEM_SPIN_ON_OWNER
17203 def_bool y
17204- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
17205+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17206
17207 config LOCK_SPIN_ON_OWNER
17208 def_bool y
17209diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.preempt linux-4.14/kernel/Kconfig.preempt
17210--- linux-4.14.orig/kernel/Kconfig.preempt 2017-11-12 19:46:13.000000000 +0100
17211+++ linux-4.14/kernel/Kconfig.preempt 2018-09-05 11:05:07.000000000 +0200
17212@@ -1,3 +1,16 @@
17213+config PREEMPT
17214+ bool
17215+ select PREEMPT_COUNT
17216+
17217+config PREEMPT_RT_BASE
17218+ bool
17219+ select PREEMPT
17220+
17221+config HAVE_PREEMPT_LAZY
17222+ bool
17223+
17224+config PREEMPT_LAZY
17225+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
17226
17227 choice
17228 prompt "Preemption Model"
17229@@ -33,9 +46,9 @@
17230
17231 Select this if you are building a kernel for a desktop system.
17232
17233-config PREEMPT
17234+config PREEMPT__LL
17235 bool "Preemptible Kernel (Low-Latency Desktop)"
17236- select PREEMPT_COUNT
17237+ select PREEMPT
17238 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
17239 help
17240 This option reduces the latency of the kernel by making
17241@@ -52,6 +65,22 @@
17242 embedded system with latency requirements in the milliseconds
17243 range.
17244
17245+config PREEMPT_RTB
17246+ bool "Preemptible Kernel (Basic RT)"
17247+ select PREEMPT_RT_BASE
17248+ help
17249+ This option is basically the same as (Low-Latency Desktop) but
17250+ enables changes which are preliminary for the full preemptible
17251+ RT kernel.
17252+
17253+config PREEMPT_RT_FULL
17254+ bool "Fully Preemptible Kernel (RT)"
17255+ depends on IRQ_FORCED_THREADING
17256+ select PREEMPT_RT_BASE
17257+ select PREEMPT_RCU
17258+ help
17259+ All and everything
17260+
17261 endchoice
17262
17263 config PREEMPT_COUNT
17264diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ksysfs.c linux-4.14/kernel/ksysfs.c
17265--- linux-4.14.orig/kernel/ksysfs.c 2017-11-12 19:46:13.000000000 +0100
17266+++ linux-4.14/kernel/ksysfs.c 2018-09-05 11:05:07.000000000 +0200
17267@@ -140,6 +140,15 @@
17268
17269 #endif /* CONFIG_CRASH_CORE */
17270
17271+#if defined(CONFIG_PREEMPT_RT_FULL)
17272+static ssize_t realtime_show(struct kobject *kobj,
17273+ struct kobj_attribute *attr, char *buf)
17274+{
17275+ return sprintf(buf, "%d\n", 1);
17276+}
17277+KERNEL_ATTR_RO(realtime);
17278+#endif
17279+
17280 /* whether file capabilities are enabled */
17281 static ssize_t fscaps_show(struct kobject *kobj,
17282 struct kobj_attribute *attr, char *buf)
17283@@ -231,6 +240,9 @@
17284 &rcu_expedited_attr.attr,
17285 &rcu_normal_attr.attr,
17286 #endif
17287+#ifdef CONFIG_PREEMPT_RT_FULL
17288+ &realtime_attr.attr,
17289+#endif
17290 NULL
17291 };
17292
17293diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/lockdep.c linux-4.14/kernel/locking/lockdep.c
17294--- linux-4.14.orig/kernel/locking/lockdep.c 2018-09-05 11:03:29.000000000 +0200
17295+++ linux-4.14/kernel/locking/lockdep.c 2018-09-05 11:05:07.000000000 +0200
17296@@ -3916,6 +3916,7 @@
17297 }
17298 }
17299
17300+#ifndef CONFIG_PREEMPT_RT_FULL
17301 /*
17302 * We dont accurately track softirq state in e.g.
17303 * hardirq contexts (such as on 4KSTACKS), so only
17304@@ -3930,6 +3931,7 @@
17305 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17306 }
17307 }
17308+#endif
17309
17310 if (!debug_locks)
17311 print_irqtrace_events(current);
17312diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/locktorture.c linux-4.14/kernel/locking/locktorture.c
17313--- linux-4.14.orig/kernel/locking/locktorture.c 2018-09-05 11:03:22.000000000 +0200
17314+++ linux-4.14/kernel/locking/locktorture.c 2018-09-05 11:05:07.000000000 +0200
17315@@ -26,7 +26,6 @@
17316 #include <linux/kthread.h>
17317 #include <linux/sched/rt.h>
17318 #include <linux/spinlock.h>
17319-#include <linux/rwlock.h>
17320 #include <linux/mutex.h>
17321 #include <linux/rwsem.h>
17322 #include <linux/smp.h>
17323diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/Makefile linux-4.14/kernel/locking/Makefile
17324--- linux-4.14.orig/kernel/locking/Makefile 2017-11-12 19:46:13.000000000 +0100
17325+++ linux-4.14/kernel/locking/Makefile 2018-09-05 11:05:07.000000000 +0200
17326@@ -3,7 +3,7 @@
17327 # and is generally not a function of system call inputs.
17328 KCOV_INSTRUMENT := n
17329
17330-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17331+obj-y += semaphore.o percpu-rwsem.o
17332
17333 ifdef CONFIG_FUNCTION_TRACER
17334 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17335@@ -12,7 +12,11 @@
17336 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17337 endif
17338
17339+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17340+obj-y += mutex.o
17341 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17342+endif
17343+obj-y += rwsem.o
17344 obj-$(CONFIG_LOCKDEP) += lockdep.o
17345 ifeq ($(CONFIG_PROC_FS),y)
17346 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17347@@ -25,8 +29,11 @@
17348 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17349 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17350 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17351+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17352 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17353 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17354+endif
17355+obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17356 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17357 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17358 obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17359diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/mutex-rt.c linux-4.14/kernel/locking/mutex-rt.c
17360--- linux-4.14.orig/kernel/locking/mutex-rt.c 1970-01-01 01:00:00.000000000 +0100
17361+++ linux-4.14/kernel/locking/mutex-rt.c 2018-09-05 11:05:07.000000000 +0200
17362@@ -0,0 +1,223 @@
17363+/*
17364+ * kernel/rt.c
17365+ *
17366+ * Real-Time Preemption Support
17367+ *
17368+ * started by Ingo Molnar:
17369+ *
17370+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17371+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17372+ *
17373+ * historic credit for proving that Linux spinlocks can be implemented via
17374+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17375+ * and others) who prototyped it on 2.4 and did lots of comparative
17376+ * research and analysis; TimeSys, for proving that you can implement a
17377+ * fully preemptible kernel via the use of IRQ threading and mutexes;
17378+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
17379+ * right one; and to MontaVista, who ported pmutexes to 2.6.
17380+ *
17381+ * This code is a from-scratch implementation and is not based on pmutexes,
17382+ * but the idea of converting spinlocks to mutexes is used here too.
17383+ *
17384+ * lock debugging, locking tree, deadlock detection:
17385+ *
17386+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17387+ * Released under the General Public License (GPL).
17388+ *
17389+ * Includes portions of the generic R/W semaphore implementation from:
17390+ *
17391+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
17392+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17393+ * - Derived also from comments by Linus
17394+ *
17395+ * Pending ownership of locks and ownership stealing:
17396+ *
17397+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17398+ *
17399+ * (also by Steven Rostedt)
17400+ * - Converted single pi_lock to individual task locks.
17401+ *
17402+ * By Esben Nielsen:
17403+ * Doing priority inheritance with help of the scheduler.
17404+ *
17405+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17406+ * - major rework based on Esben Nielsens initial patch
17407+ * - replaced thread_info references by task_struct refs
17408+ * - removed task->pending_owner dependency
17409+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17410+ * in the scheduler return path as discussed with Steven Rostedt
17411+ *
17412+ * Copyright (C) 2006, Kihon Technologies Inc.
17413+ * Steven Rostedt <rostedt@goodmis.org>
17414+ * - debugged and patched Thomas Gleixner's rework.
17415+ * - added back the cmpxchg to the rework.
17416+ * - turned atomic require back on for SMP.
17417+ */
17418+
17419+#include <linux/spinlock.h>
17420+#include <linux/rtmutex.h>
17421+#include <linux/sched.h>
17422+#include <linux/delay.h>
17423+#include <linux/module.h>
17424+#include <linux/kallsyms.h>
17425+#include <linux/syscalls.h>
17426+#include <linux/interrupt.h>
17427+#include <linux/plist.h>
17428+#include <linux/fs.h>
17429+#include <linux/futex.h>
17430+#include <linux/hrtimer.h>
17431+
17432+#include "rtmutex_common.h"
17433+
17434+/*
17435+ * struct mutex functions
17436+ */
17437+void __mutex_do_init(struct mutex *mutex, const char *name,
17438+ struct lock_class_key *key)
17439+{
17440+#ifdef CONFIG_DEBUG_LOCK_ALLOC
17441+ /*
17442+ * Make sure we are not reinitializing a held lock:
17443+ */
17444+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
17445+ lockdep_init_map(&mutex->dep_map, name, key, 0);
17446+#endif
17447+ mutex->lock.save_state = 0;
17448+}
17449+EXPORT_SYMBOL(__mutex_do_init);
17450+
17451+void __lockfunc _mutex_lock(struct mutex *lock)
17452+{
17453+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17454+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17455+}
17456+EXPORT_SYMBOL(_mutex_lock);
17457+
17458+void __lockfunc _mutex_lock_io(struct mutex *lock)
17459+{
17460+ int token;
17461+
17462+ token = io_schedule_prepare();
17463+ _mutex_lock(lock);
17464+ io_schedule_finish(token);
17465+}
17466+EXPORT_SYMBOL_GPL(_mutex_lock_io);
17467+
17468+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
17469+{
17470+ int ret;
17471+
17472+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17473+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17474+ if (ret)
17475+ mutex_release(&lock->dep_map, 1, _RET_IP_);
17476+ return ret;
17477+}
17478+EXPORT_SYMBOL(_mutex_lock_interruptible);
17479+
17480+int __lockfunc _mutex_lock_killable(struct mutex *lock)
17481+{
17482+ int ret;
17483+
17484+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17485+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17486+ if (ret)
17487+ mutex_release(&lock->dep_map, 1, _RET_IP_);
17488+ return ret;
17489+}
17490+EXPORT_SYMBOL(_mutex_lock_killable);
17491+
17492+#ifdef CONFIG_DEBUG_LOCK_ALLOC
17493+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
17494+{
17495+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17496+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17497+}
17498+EXPORT_SYMBOL(_mutex_lock_nested);
17499+
17500+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
17501+{
17502+ int token;
17503+
17504+ token = io_schedule_prepare();
17505+
17506+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17507+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17508+
17509+ io_schedule_finish(token);
17510+}
17511+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
17512+
17513+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
17514+{
17515+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
17516+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17517+}
17518+EXPORT_SYMBOL(_mutex_lock_nest_lock);
17519+
17520+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
17521+{
17522+ int ret;
17523+
17524+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17525+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17526+ if (ret)
17527+ mutex_release(&lock->dep_map, 1, _RET_IP_);
17528+ return ret;
17529+}
17530+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
17531+
17532+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
17533+{
17534+ int ret;
17535+
17536+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17537+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17538+ if (ret)
17539+ mutex_release(&lock->dep_map, 1, _RET_IP_);
17540+ return ret;
17541+}
17542+EXPORT_SYMBOL(_mutex_lock_killable_nested);
17543+#endif
17544+
17545+int __lockfunc _mutex_trylock(struct mutex *lock)
17546+{
17547+ int ret = __rt_mutex_trylock(&lock->lock);
17548+
17549+ if (ret)
17550+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
17551+
17552+ return ret;
17553+}
17554+EXPORT_SYMBOL(_mutex_trylock);
17555+
17556+void __lockfunc _mutex_unlock(struct mutex *lock)
17557+{
17558+ mutex_release(&lock->dep_map, 1, _RET_IP_);
17559+ __rt_mutex_unlock(&lock->lock);
17560+}
17561+EXPORT_SYMBOL(_mutex_unlock);
17562+
17563+/**
17564+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
17565+ * @cnt: the atomic which we are to dec
17566+ * @lock: the mutex to return holding if we dec to 0
17567+ *
17568+ * return true and hold lock if we dec to 0, return false otherwise
17569+ */
17570+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
17571+{
17572+ /* dec if we can't possibly hit 0 */
17573+ if (atomic_add_unless(cnt, -1, 1))
17574+ return 0;
17575+ /* we might hit 0, so take the lock */
17576+ mutex_lock(lock);
17577+ if (!atomic_dec_and_test(cnt)) {
17578+ /* when we actually did the dec, we didn't hit 0 */
17579+ mutex_unlock(lock);
17580+ return 0;
17581+ }
17582+ /* we hit 0, and we hold the lock */
17583+ return 1;
17584+}
17585+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
17586diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex.c linux-4.14/kernel/locking/rtmutex.c
17587--- linux-4.14.orig/kernel/locking/rtmutex.c 2018-09-05 11:03:22.000000000 +0200
17588+++ linux-4.14/kernel/locking/rtmutex.c 2018-09-05 11:05:07.000000000 +0200
17589@@ -7,6 +7,11 @@
17590 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17591 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
17592 * Copyright (C) 2006 Esben Nielsen
17593+ * Adaptive Spinlocks:
17594+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
17595+ * and Peter Morreale,
17596+ * Adaptive Spinlocks simplification:
17597+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
17598 *
17599 * See Documentation/locking/rt-mutex-design.txt for details.
17600 */
17601@@ -18,6 +23,8 @@
17602 #include <linux/sched/wake_q.h>
17603 #include <linux/sched/debug.h>
17604 #include <linux/timer.h>
17605+#include <linux/ww_mutex.h>
17606+#include <linux/blkdev.h>
17607
17608 #include "rtmutex_common.h"
17609
17610@@ -135,6 +142,12 @@
17611 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
17612 }
17613
17614+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
17615+{
17616+ return waiter && waiter != PI_WAKEUP_INPROGRESS &&
17617+ waiter != PI_REQUEUE_INPROGRESS;
17618+}
17619+
17620 /*
17621 * We can speed up the acquire/release, if there's no debugging state to be
17622 * set up.
17623@@ -228,7 +241,7 @@
17624 * Only use with rt_mutex_waiter_{less,equal}()
17625 */
17626 #define task_to_waiter(p) \
17627- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
17628+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
17629
17630 static inline int
17631 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
17632@@ -268,6 +281,27 @@
17633 return 1;
17634 }
17635
17636+#define STEAL_NORMAL 0
17637+#define STEAL_LATERAL 1
17638+
17639+static inline int
17640+rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
17641+{
17642+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
17643+
17644+ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
17645+ return 1;
17646+
17647+ /*
17648+ * Note that RT tasks are excluded from lateral-steals
17649+ * to prevent the introduction of an unbounded latency.
17650+ */
17651+ if (mode == STEAL_NORMAL || rt_task(waiter->task))
17652+ return 0;
17653+
17654+ return rt_mutex_waiter_equal(waiter, top_waiter);
17655+}
17656+
17657 static void
17658 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
17659 {
17660@@ -372,6 +406,14 @@
17661 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
17662 }
17663
17664+static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
17665+{
17666+ if (waiter->savestate)
17667+ wake_up_lock_sleeper(waiter->task);
17668+ else
17669+ wake_up_process(waiter->task);
17670+}
17671+
17672 /*
17673 * Max number of times we'll walk the boosting chain:
17674 */
17675@@ -379,7 +421,8 @@
17676
17677 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
17678 {
17679- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
17680+ return rt_mutex_real_waiter(p->pi_blocked_on) ?
17681+ p->pi_blocked_on->lock : NULL;
17682 }
1a6e0f06 17683
e4b2b4a8
JK
17684 /*
17685@@ -515,7 +558,7 @@
17686 * reached or the state of the chain has changed while we
17687 * dropped the locks.
17688 */
17689- if (!waiter)
17690+ if (!rt_mutex_real_waiter(waiter))
17691 goto out_unlock_pi;
1a6e0f06 17692
e4b2b4a8
JK
17693 /*
17694@@ -696,13 +739,16 @@
17695 * follow here. This is the end of the chain we are walking.
17696 */
17697 if (!rt_mutex_owner(lock)) {
17698+ struct rt_mutex_waiter *lock_top_waiter;
1a6e0f06 17699+
e4b2b4a8
JK
17700 /*
17701 * If the requeue [7] above changed the top waiter,
17702 * then we need to wake the new top waiter up to try
17703 * to get the lock.
17704 */
17705- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
17706- wake_up_process(rt_mutex_top_waiter(lock)->task);
17707+ lock_top_waiter = rt_mutex_top_waiter(lock);
17708+ if (prerequeue_top_waiter != lock_top_waiter)
17709+ rt_mutex_wake_waiter(lock_top_waiter);
17710 raw_spin_unlock_irq(&lock->wait_lock);
17711 return 0;
17712 }
17713@@ -804,9 +850,11 @@
17714 * @task: The task which wants to acquire the lock
17715 * @waiter: The waiter that is queued to the lock's wait tree if the
17716 * callsite called task_blocked_on_lock(), otherwise NULL
17717+ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
17718 */
17719-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
17720- struct rt_mutex_waiter *waiter)
17721+static int __try_to_take_rt_mutex(struct rt_mutex *lock,
17722+ struct task_struct *task,
17723+ struct rt_mutex_waiter *waiter, int mode)
17724 {
17725 lockdep_assert_held(&lock->wait_lock);
1a6e0f06 17726
e4b2b4a8
JK
17727@@ -842,12 +890,11 @@
17728 */
17729 if (waiter) {
17730 /*
17731- * If waiter is not the highest priority waiter of
17732- * @lock, give up.
17733+ * If waiter is not the highest priority waiter of @lock,
17734+ * or its peer when lateral steal is allowed, give up.
17735 */
17736- if (waiter != rt_mutex_top_waiter(lock))
17737+ if (!rt_mutex_steal(lock, waiter, mode))
17738 return 0;
17739-
17740 /*
17741 * We can acquire the lock. Remove the waiter from the
17742 * lock waiters tree.
17743@@ -865,14 +912,12 @@
17744 */
17745 if (rt_mutex_has_waiters(lock)) {
17746 /*
17747- * If @task->prio is greater than or equal to
17748- * the top waiter priority (kernel view),
17749- * @task lost.
17750+ * If @task->prio is greater than the top waiter
17751+ * priority (kernel view), or equal to it when a
17752+ * lateral steal is forbidden, @task lost.
17753 */
17754- if (!rt_mutex_waiter_less(task_to_waiter(task),
17755- rt_mutex_top_waiter(lock)))
17756+ if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
17757 return 0;
17758-
17759 /*
17760 * The current top waiter stays enqueued. We
17761 * don't have to change anything in the lock
17762@@ -919,6 +964,351 @@
17763 return 1;
17764 }
1a6e0f06 17765
e4b2b4a8
JK
17766+#ifdef CONFIG_PREEMPT_RT_FULL
17767+/*
17768+ * preemptible spin_lock functions:
17769+ */
17770+static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
17771+ void (*slowfn)(struct rt_mutex *lock))
17772+{
17773+ might_sleep_no_state_check();
17774+
17775+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
17776+ return;
17777+ else
17778+ slowfn(lock);
17779+}
17780+
17781+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
17782+ void (*slowfn)(struct rt_mutex *lock))
17783+{
17784+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
17785+ return;
17786+ else
17787+ slowfn(lock);
17788+}
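
The two fast-path helpers above reduce the uncontended case to a single compare-and-swap on the owner field: NULL -> current on lock, current -> NULL on unlock, with the slow path taken only when that exchange fails. A rough standalone sketch of the same idea, using C11 atomics instead of the kernel's rt_mutex_cmpxchg helpers (all names here are invented for the illustration):

#include <stdatomic.h>
#include <stddef.h>

struct demo_lock { _Atomic(void *) owner; };

/* Uncontended lock: owner NULL -> me. Returns 1 on success, 0 -> slow path. */
static int demo_fast_trylock(struct demo_lock *l, void *me)
{
	void *expected = NULL;
	return atomic_compare_exchange_strong(&l->owner, &expected, me);
}

/* Uncontended unlock: owner me -> NULL. Returns 1 on success, 0 -> slow path. */
static int demo_fast_unlock(struct demo_lock *l, void *me)
{
	void *expected = me;
	return atomic_compare_exchange_strong(&l->owner, &expected, NULL);
}
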
17789+#ifdef CONFIG_SMP
17790+/*
17791+ * Note that owner is a speculative pointer and dereferencing relies
17792+ * on rcu_read_lock() and the check against the lock owner.
17793+ */
17794+static int adaptive_wait(struct rt_mutex *lock,
17795+ struct task_struct *owner)
17796+{
17797+ int res = 0;
17798+
17799+ rcu_read_lock();
17800+ for (;;) {
17801+ if (owner != rt_mutex_owner(lock))
17802+ break;
17803+ /*
17804+ * Ensure that owner->on_cpu is dereferenced _after_
17805+ * checking the above to be valid.
17806+ */
17807+ barrier();
17808+ if (!owner->on_cpu) {
17809+ res = 1;
17810+ break;
17811+ }
17812+ cpu_relax();
17813+ }
17814+ rcu_read_unlock();
17815+ return res;
17816+}
17817+#else
17818+static int adaptive_wait(struct rt_mutex *lock,
17819+ struct task_struct *orig_owner)
17820+{
17821+ return 1;
17822+}
1a6e0f06
JK
17823+#endif
17824+
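
adaptive_wait() above implements the adaptive part of the spinlock: keep spinning only while the current owner is still holding the lock and running on a CPU, and report back (return 1) as soon as the owner is preempted, at which point the caller sleeps instead of burning cycles. A loose userspace analogue of that policy, with C11 atomics and invented names:

#include <stdatomic.h>
#include <stdbool.h>

struct demo_owner { _Atomic bool on_cpu; };

/*
 * Spin while @owner still holds the lock and is on a CPU; return true when
 * the caller should go to sleep, false when ownership changed and the lock
 * should simply be retried. (The kernel uses cpu_relax() in the loop body.)
 */
static bool demo_adaptive_wait(_Atomic(struct demo_owner *) *lock_owner,
			       struct demo_owner *owner)
{
	while (atomic_load(lock_owner) == owner) {
		if (!atomic_load(&owner->on_cpu))
			return true;	/* owner is off CPU: sleeping is cheaper */
		/* busy-wait */
	}
	return false;			/* ownership changed: retry the lock */
}
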
e4b2b4a8
JK
17825+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
17826+ struct rt_mutex_waiter *waiter,
17827+ struct task_struct *task,
17828+ enum rtmutex_chainwalk chwalk);
17829+/*
17830+ * Slow path lock function spin_lock style: this variant is very
17831+ * careful not to miss any non-lock wakeups.
17832+ *
17833+ * We store the current state under p->pi_lock in p->saved_state and
17834+ * the try_to_wake_up() code handles this accordingly.
17835+ */
17836+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
17837+ struct rt_mutex_waiter *waiter,
17838+ unsigned long flags)
17839+{
17840+ struct task_struct *lock_owner, *self = current;
17841+ struct rt_mutex_waiter *top_waiter;
17842+ int ret;
1a6e0f06 17843+
e4b2b4a8
JK
17844+ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
17845+ return;
1a6e0f06 17846+
e4b2b4a8 17847+ BUG_ON(rt_mutex_owner(lock) == self);
1a6e0f06 17848+
e4b2b4a8
JK
17849+ /*
17850+ * We save whatever state the task is in and we'll restore it
17851+ * after acquiring the lock taking real wakeups into account
17852+ * as well. We are serialized via pi_lock against wakeups. See
17853+ * try_to_wake_up().
17854+ */
17855+ raw_spin_lock(&self->pi_lock);
17856+ self->saved_state = self->state;
17857+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17858+ raw_spin_unlock(&self->pi_lock);
1a6e0f06 17859+
e4b2b4a8
JK
17860+ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
17861+ BUG_ON(ret);
1a6e0f06 17862+
e4b2b4a8
JK
17863+ for (;;) {
17864+ /* Try to acquire the lock again. */
17865+ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
17866+ break;
1a6e0f06 17867+
e4b2b4a8
JK
17868+ top_waiter = rt_mutex_top_waiter(lock);
17869+ lock_owner = rt_mutex_owner(lock);
1a6e0f06 17870+
e4b2b4a8 17871+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 17872+
e4b2b4a8 17873+ debug_rt_mutex_print_deadlock(waiter);
1a6e0f06 17874+
e4b2b4a8
JK
17875+ if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
17876+ schedule();
1a6e0f06 17877+
e4b2b4a8 17878+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
1a6e0f06 17879+
e4b2b4a8
JK
17880+ raw_spin_lock(&self->pi_lock);
17881+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17882+ raw_spin_unlock(&self->pi_lock);
17883+ }
1a6e0f06 17884+
e4b2b4a8
JK
17885+ /*
17886+ * Restore the task state to current->saved_state. We set it
17887+ * to the original state above and the try_to_wake_up() code
17888+ * has possibly updated it when a real (non-rtmutex) wakeup
17889+ * happened while we were blocked. Clear saved_state so
17890+ * try_to_wakeup() does not get confused.
17891+ */
17892+ raw_spin_lock(&self->pi_lock);
17893+ __set_current_state_no_track(self->saved_state);
17894+ self->saved_state = TASK_RUNNING;
17895+ raw_spin_unlock(&self->pi_lock);
1a6e0f06 17896+
e4b2b4a8
JK
17897+ /*
17898+ * try_to_take_rt_mutex() sets the waiter bit
17899+ * unconditionally. We might have to fix that up:
17900+ */
17901+ fixup_rt_mutex_waiters(lock);
1a6e0f06 17902+
e4b2b4a8
JK
17903+ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
17904+ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
17905+}
1a6e0f06 17906+
e4b2b4a8
JK
17907+static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
17908+{
17909+ struct rt_mutex_waiter waiter;
17910+ unsigned long flags;
1a6e0f06 17911+
e4b2b4a8 17912+ rt_mutex_init_waiter(&waiter, true);
1a6e0f06 17913+
e4b2b4a8
JK
17914+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
17915+ rt_spin_lock_slowlock_locked(lock, &waiter, flags);
17916+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17917+ debug_rt_mutex_free_waiter(&waiter);
17918+}
1a6e0f06 17919+
e4b2b4a8
JK
17920+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
17921+ struct wake_q_head *wake_q,
17922+ struct wake_q_head *wq_sleeper);
17923+/*
17924+ * Slow path to release a rt_mutex spin_lock style
17925+ */
17926+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
17927+{
17928+ unsigned long flags;
17929+ DEFINE_WAKE_Q(wake_q);
17930+ DEFINE_WAKE_Q(wake_sleeper_q);
17931+ bool postunlock;
1a6e0f06 17932+
e4b2b4a8
JK
17933+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
17934+ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
17935+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 17936+
e4b2b4a8
JK
17937+ if (postunlock)
17938+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17939+}
1a6e0f06 17940+
e4b2b4a8
JK
17941+void __lockfunc rt_spin_lock(spinlock_t *lock)
17942+{
17943+ sleeping_lock_inc();
17944+ migrate_disable();
17945+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17946+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17947+}
17948+EXPORT_SYMBOL(rt_spin_lock);
1a6e0f06 17949+
e4b2b4a8
JK
17950+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
17951+{
17952+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
17953+}
1a6e0f06 17954+
e4b2b4a8
JK
17955+#ifdef CONFIG_DEBUG_LOCK_ALLOC
17956+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
17957+{
17958+ sleeping_lock_inc();
17959+ migrate_disable();
17960+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17961+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17962+}
17963+EXPORT_SYMBOL(rt_spin_lock_nested);
17964+#endif
1a6e0f06 17965+
e4b2b4a8
JK
17966+void __lockfunc rt_spin_unlock(spinlock_t *lock)
17967+{
17968+ /* NOTE: we always pass in '1' for nested, for simplicity */
17969+ spin_release(&lock->dep_map, 1, _RET_IP_);
17970+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
17971+ migrate_enable();
17972+ sleeping_lock_dec();
17973+}
17974+EXPORT_SYMBOL(rt_spin_unlock);
1a6e0f06 17975+
e4b2b4a8
JK
17976+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
17977+{
17978+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
17979+}
17980+EXPORT_SYMBOL(__rt_spin_unlock);
17981+
17982+/*
17983+ * Wait for the lock to get unlocked: instead of polling for an unlock
17984+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
17985+ * schedule if there's contention:
17986+ */
17987+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
17988+{
17989+ spin_lock(lock);
17990+ spin_unlock(lock);
17991+}
17992+EXPORT_SYMBOL(rt_spin_unlock_wait);
17993+
17994+int __lockfunc rt_spin_trylock(spinlock_t *lock)
17995+{
17996+ int ret;
17997+
17998+ sleeping_lock_inc();
17999+ migrate_disable();
18000+ ret = __rt_mutex_trylock(&lock->lock);
18001+ if (ret) {
18002+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18003+ } else {
18004+ migrate_enable();
18005+ sleeping_lock_dec();
18006+ }
18007+ return ret;
18008+}
18009+EXPORT_SYMBOL(rt_spin_trylock);
18010+
18011+int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18012+{
18013+ int ret;
18014+
18015+ local_bh_disable();
18016+ ret = __rt_mutex_trylock(&lock->lock);
18017+ if (ret) {
18018+ sleeping_lock_inc();
18019+ migrate_disable();
18020+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18021+ } else
18022+ local_bh_enable();
18023+ return ret;
18024+}
18025+EXPORT_SYMBOL(rt_spin_trylock_bh);
18026+
18027+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18028+{
18029+ int ret;
1a6e0f06 18030+
e4b2b4a8
JK
18031+ *flags = 0;
18032+ ret = __rt_mutex_trylock(&lock->lock);
18033+ if (ret) {
18034+ sleeping_lock_inc();
18035+ migrate_disable();
18036+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18037+ }
18038+ return ret;
18039+}
18040+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
1a6e0f06 18041+
e4b2b4a8
JK
18042+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18043+{
18044+ /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
18045+ if (atomic_add_unless(atomic, -1, 1))
18046+ return 0;
18047+ rt_spin_lock(lock);
18048+ if (atomic_dec_and_test(atomic))
18049+ return 1;
18050+ rt_spin_unlock(lock);
18051+ return 0;
18052+}
18053+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
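
atomic_dec_and_spin_lock() mirrors the classic atomic_dec_and_lock() pattern: the reference count is dropped without the lock in the common case, and the lock is only taken for the final reference so the object can be unlinked and freed under it. A hedged usage sketch; the object, the cache list and the put routine are invented for illustration:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Illustrative refcounted object living on some cache list. */
struct demo_obj {
	atomic_t refcnt;
	struct list_head node;
};

static void demo_obj_put(struct demo_obj *obj, spinlock_t *cache_lock)
{
	/* Nonzero only when we dropped the last reference *and* hold the lock. */
	if (atomic_dec_and_spin_lock(&obj->refcnt, cache_lock)) {
		list_del(&obj->node);
		rt_spin_unlock(cache_lock);
		kfree(obj);
	}
}
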
1a6e0f06 18054+
e4b2b4a8
JK
18055+void
18056+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18057+{
1a6e0f06 18058+#ifdef CONFIG_DEBUG_LOCK_ALLOC
e4b2b4a8
JK
18059+ /*
18060+ * Make sure we are not reinitializing a held lock:
18061+ */
18062+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18063+ lockdep_init_map(&lock->dep_map, name, key, 0);
1a6e0f06 18064+#endif
e4b2b4a8
JK
18065+}
18066+EXPORT_SYMBOL(__rt_spin_lock_init);
1a6e0f06 18067+
e4b2b4a8 18068+#endif /* PREEMPT_RT_FULL */
1a6e0f06 18069+
e4b2b4a8
JK
18070+#ifdef CONFIG_PREEMPT_RT_FULL
18071+ static inline int __sched
18072+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18073+{
18074+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18075+ struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
1a6e0f06 18076+
e4b2b4a8
JK
18077+ if (!hold_ctx)
18078+ return 0;
1a6e0f06 18079+
e4b2b4a8
JK
18080+ if (unlikely(ctx == hold_ctx))
18081+ return -EALREADY;
1a6e0f06 18082+
e4b2b4a8
JK
18083+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18084+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18085+#ifdef CONFIG_DEBUG_MUTEXES
18086+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18087+ ctx->contending_lock = ww;
18088+#endif
18089+ return -EDEADLK;
18090+ }
1a6e0f06 18091+
e4b2b4a8
JK
18092+ return 0;
18093+}
18094+#else
18095+ static inline int __sched
18096+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18097+{
18098+ BUG();
18099+ return 0;
18100+}
1a6e0f06 18101+
1a6e0f06
JK
18102+#endif
18103+
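
The stamp test in __mutex_lock_check_stamp() above is the usual wrap-safe ordering check on an unsigned counter: @ctx loses (gets -EDEADLK) when its stamp was taken after the stamp of the context already holding the ww_mutex, with the pointer comparison only breaking ties for equal stamps. A minimal standalone sketch of the core comparison, ignoring the tie-break:

#include <limits.h>
#include <stdbool.h>

/* Wrap-safe "was stamp a taken after stamp b?" for a free-running counter. */
static bool demo_stamp_after(unsigned long a, unsigned long b)
{
	return a != b && (a - b) <= LONG_MAX;
}
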
e4b2b4a8
JK
18104+static inline int
18105+try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18106+ struct rt_mutex_waiter *waiter)
18107+{
18108+ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18109+}
1a6e0f06 18110+
e4b2b4a8
JK
18111 /*
18112 * Task blocks on lock.
18113 *
18114@@ -951,6 +1341,22 @@
18115 return -EDEADLK;
18116
18117 raw_spin_lock(&task->pi_lock);
18118+ /*
18119+ * In the case of futex requeue PI, this will be a proxy
18120+	 * lock. The task will wake unaware that it is enqueued on
18121+ * this lock. Avoid blocking on two locks and corrupting
18122+ * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18123+ * flag. futex_wait_requeue_pi() sets this when it wakes up
18124+ * before requeue (due to a signal or timeout). Do not enqueue
18125+ * the task if PI_WAKEUP_INPROGRESS is set.
18126+ */
18127+ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18128+ raw_spin_unlock(&task->pi_lock);
18129+ return -EAGAIN;
18130+ }
1a6e0f06 18131+
e4b2b4a8 18132+ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
1a6e0f06 18133+
e4b2b4a8
JK
18134 waiter->task = task;
18135 waiter->lock = lock;
18136 waiter->prio = task->prio;
18137@@ -974,7 +1380,7 @@
18138 rt_mutex_enqueue_pi(owner, waiter);
18139
18140 rt_mutex_adjust_prio(owner);
18141- if (owner->pi_blocked_on)
18142+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18143 chain_walk = 1;
18144 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18145 chain_walk = 1;
18146@@ -1016,6 +1422,7 @@
18147 * Called with lock->wait_lock held and interrupts disabled.
18148 */
18149 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18150+ struct wake_q_head *wake_sleeper_q,
18151 struct rt_mutex *lock)
18152 {
18153 struct rt_mutex_waiter *waiter;
18154@@ -1055,7 +1462,10 @@
18155 * Pairs with preempt_enable() in rt_mutex_postunlock();
18156 */
18157 preempt_disable();
18158- wake_q_add(wake_q, waiter->task);
18159+ if (waiter->savestate)
18160+ wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18161+ else
18162+ wake_q_add(wake_q, waiter->task);
18163 raw_spin_unlock(&current->pi_lock);
18164 }
18165
18166@@ -1070,7 +1480,7 @@
18167 {
18168 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18169 struct task_struct *owner = rt_mutex_owner(lock);
18170- struct rt_mutex *next_lock;
18171+ struct rt_mutex *next_lock = NULL;
18172
18173 lockdep_assert_held(&lock->wait_lock);
18174
18175@@ -1096,7 +1506,8 @@
18176 rt_mutex_adjust_prio(owner);
18177
18178 /* Store the lock on which owner is blocked or NULL */
18179- next_lock = task_blocked_on_lock(owner);
18180+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18181+ next_lock = task_blocked_on_lock(owner);
18182
18183 raw_spin_unlock(&owner->pi_lock);
18184
18185@@ -1132,26 +1543,28 @@
18186 raw_spin_lock_irqsave(&task->pi_lock, flags);
18187
18188 waiter = task->pi_blocked_on;
18189- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18190+ if (!rt_mutex_real_waiter(waiter) ||
18191+ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18192 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18193 return;
18194 }
18195 next_lock = waiter->lock;
18196- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18197
18198 /* gets dropped in rt_mutex_adjust_prio_chain()! */
18199 get_task_struct(task);
18200
18201+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18202 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18203 next_lock, NULL, task);
18204 }
18205
18206-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18207+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18208 {
18209 debug_rt_mutex_init_waiter(waiter);
18210 RB_CLEAR_NODE(&waiter->pi_tree_entry);
18211 RB_CLEAR_NODE(&waiter->tree_entry);
18212 waiter->task = NULL;
18213+ waiter->savestate = savestate;
18214 }
18215
18216 /**
18217@@ -1167,7 +1580,8 @@
18218 static int __sched
18219 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18220 struct hrtimer_sleeper *timeout,
18221- struct rt_mutex_waiter *waiter)
18222+ struct rt_mutex_waiter *waiter,
18223+ struct ww_acquire_ctx *ww_ctx)
18224 {
18225 int ret = 0;
18226
18227@@ -1176,16 +1590,17 @@
18228 if (try_to_take_rt_mutex(lock, current, waiter))
18229 break;
18230
18231- /*
18232- * TASK_INTERRUPTIBLE checks for signals and
18233- * timeout. Ignored otherwise.
18234- */
18235- if (likely(state == TASK_INTERRUPTIBLE)) {
18236- /* Signal pending? */
18237- if (signal_pending(current))
18238- ret = -EINTR;
18239- if (timeout && !timeout->task)
18240- ret = -ETIMEDOUT;
18241+ if (timeout && !timeout->task) {
18242+ ret = -ETIMEDOUT;
18243+ break;
18244+ }
18245+ if (signal_pending_state(state, current)) {
18246+ ret = -EINTR;
18247+ break;
18248+ }
1a6e0f06 18249+
e4b2b4a8
JK
18250+ if (ww_ctx && ww_ctx->acquired > 0) {
18251+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18252 if (ret)
18253 break;
18254 }
18255@@ -1224,33 +1639,104 @@
18256 }
18257 }
18258
18259-/*
18260- * Slow path lock function:
18261- */
18262-static int __sched
18263-rt_mutex_slowlock(struct rt_mutex *lock, int state,
18264- struct hrtimer_sleeper *timeout,
18265- enum rtmutex_chainwalk chwalk)
18266+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18267+ struct ww_acquire_ctx *ww_ctx)
18268 {
18269- struct rt_mutex_waiter waiter;
18270- unsigned long flags;
18271- int ret = 0;
18272+#ifdef CONFIG_DEBUG_MUTEXES
18273+ /*
18274+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18275+ * but released with a normal mutex_unlock in this call.
18276+ *
18277+ * This should never happen, always use ww_mutex_unlock.
18278+ */
18279+ DEBUG_LOCKS_WARN_ON(ww->ctx);
18280
18281- rt_mutex_init_waiter(&waiter);
18282+ /*
18283+ * Not quite done after calling ww_acquire_done() ?
18284+ */
18285+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
1a6e0f06 18286+
e4b2b4a8
JK
18287+ if (ww_ctx->contending_lock) {
18288+ /*
18289+ * After -EDEADLK you tried to
18290+ * acquire a different ww_mutex? Bad!
18291+ */
18292+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
1a6e0f06 18293+
e4b2b4a8
JK
18294+ /*
18295+ * You called ww_mutex_lock after receiving -EDEADLK,
18296+ * but 'forgot' to unlock everything else first?
18297+ */
18298+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18299+ ww_ctx->contending_lock = NULL;
18300+ }
18301
18302 /*
18303- * Technically we could use raw_spin_[un]lock_irq() here, but this can
18304- * be called in early boot if the cmpxchg() fast path is disabled
18305- * (debug, no architecture support). In this case we will acquire the
18306- * rtmutex with lock->wait_lock held. But we cannot unconditionally
18307- * enable interrupts in that early boot case. So we need to use the
18308- * irqsave/restore variants.
18309+ * Naughty, using a different class will lead to undefined behavior!
18310 */
18311- raw_spin_lock_irqsave(&lock->wait_lock, flags);
18312+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18313+#endif
18314+ ww_ctx->acquired++;
18315+}
1a6e0f06 18316+
e4b2b4a8
JK
18317+#ifdef CONFIG_PREEMPT_RT_FULL
18318+static void ww_mutex_account_lock(struct rt_mutex *lock,
18319+ struct ww_acquire_ctx *ww_ctx)
18320+{
18321+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18322+ struct rt_mutex_waiter *waiter, *n;
1a6e0f06 18323+
e4b2b4a8
JK
18324+ /*
18325+ * This branch gets optimized out for the common case,
18326+ * and is only important for ww_mutex_lock.
18327+ */
18328+ ww_mutex_lock_acquired(ww, ww_ctx);
18329+ ww->ctx = ww_ctx;
1a6e0f06 18330+
e4b2b4a8
JK
18331+ /*
18332+ * Give any possible sleeping processes the chance to wake up,
18333+ * so they can recheck if they have to back off.
18334+ */
18335+ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18336+ tree_entry) {
18337+ /* XXX debug rt mutex waiter wakeup */
1a6e0f06 18338+
e4b2b4a8
JK
18339+ BUG_ON(waiter->lock != lock);
18340+ rt_mutex_wake_waiter(waiter);
18341+ }
1a6e0f06
JK
18342+}
18343+
e4b2b4a8 18344+#else
1a6e0f06 18345+
e4b2b4a8
JK
18346+static void ww_mutex_account_lock(struct rt_mutex *lock,
18347+ struct ww_acquire_ctx *ww_ctx)
1a6e0f06 18348+{
e4b2b4a8 18349+ BUG();
1a6e0f06 18350+}
e4b2b4a8 18351+#endif
1a6e0f06 18352+
e4b2b4a8
JK
18353+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18354+ struct hrtimer_sleeper *timeout,
18355+ enum rtmutex_chainwalk chwalk,
18356+ struct ww_acquire_ctx *ww_ctx,
18357+ struct rt_mutex_waiter *waiter)
1a6e0f06 18358+{
e4b2b4a8 18359+ int ret;
1a6e0f06 18360+
e4b2b4a8
JK
18361+#ifdef CONFIG_PREEMPT_RT_FULL
18362+ if (ww_ctx) {
18363+ struct ww_mutex *ww;
1a6e0f06 18364+
e4b2b4a8
JK
18365+ ww = container_of(lock, struct ww_mutex, base.lock);
18366+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18367+ return -EALREADY;
18368+ }
18369+#endif
18370
18371 /* Try to acquire the lock again: */
18372 if (try_to_take_rt_mutex(lock, current, NULL)) {
18373- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18374+ if (ww_ctx)
18375+ ww_mutex_account_lock(lock, ww_ctx);
18376 return 0;
18377 }
18378
18379@@ -1260,17 +1746,27 @@
18380 if (unlikely(timeout))
18381 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18382
18383- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18384+ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18385
18386- if (likely(!ret))
18387+ if (likely(!ret)) {
18388 /* sleep on the mutex */
18389- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18390+ ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18391+ ww_ctx);
18392+ } else if (ww_ctx) {
18393+ /* ww_mutex received EDEADLK, let it become EALREADY */
18394+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18395+ BUG_ON(!ret);
18396+ }
18397
18398 if (unlikely(ret)) {
18399 __set_current_state(TASK_RUNNING);
18400 if (rt_mutex_has_waiters(lock))
18401- remove_waiter(lock, &waiter);
18402- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18403+ remove_waiter(lock, waiter);
18404+ /* ww_mutex want to report EDEADLK/EALREADY, let them */
18405+ if (!ww_ctx)
18406+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
18407+ } else if (ww_ctx) {
18408+ ww_mutex_account_lock(lock, ww_ctx);
18409 }
18410
18411 /*
18412@@ -1278,6 +1774,36 @@
18413 * unconditionally. We might have to fix that up.
18414 */
18415 fixup_rt_mutex_waiters(lock);
18416+ return ret;
1a6e0f06
JK
18417+}
18418+
e4b2b4a8
JK
18419+/*
18420+ * Slow path lock function:
18421+ */
18422+static int __sched
18423+rt_mutex_slowlock(struct rt_mutex *lock, int state,
18424+ struct hrtimer_sleeper *timeout,
18425+ enum rtmutex_chainwalk chwalk,
18426+ struct ww_acquire_ctx *ww_ctx)
1a6e0f06 18427+{
e4b2b4a8
JK
18428+ struct rt_mutex_waiter waiter;
18429+ unsigned long flags;
18430+ int ret = 0;
1a6e0f06 18431+
e4b2b4a8 18432+ rt_mutex_init_waiter(&waiter, false);
1a6e0f06 18433+
e4b2b4a8
JK
18434+ /*
18435+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
18436+ * be called in early boot if the cmpxchg() fast path is disabled
18437+ * (debug, no architecture support). In this case we will acquire the
18438+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
18439+ * enable interrupts in that early boot case. So we need to use the
18440+ * irqsave/restore variants.
18441+ */
18442+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
1a6e0f06 18443+
e4b2b4a8
JK
18444+ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
18445+ &waiter);
18446
18447 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18448
18449@@ -1338,7 +1864,8 @@
18450 * Return whether the current task needs to call rt_mutex_postunlock().
18451 */
18452 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
18453- struct wake_q_head *wake_q)
18454+ struct wake_q_head *wake_q,
18455+ struct wake_q_head *wake_sleeper_q)
18456 {
18457 unsigned long flags;
18458
18459@@ -1392,7 +1919,7 @@
18460 *
18461 * Queue the next waiter for wakeup once we release the wait_lock.
18462 */
18463- mark_wakeup_next_waiter(wake_q, lock);
18464+ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
18465 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18466
18467 return true; /* call rt_mutex_postunlock() */
18468@@ -1406,29 +1933,45 @@
18469 */
18470 static inline int
18471 rt_mutex_fastlock(struct rt_mutex *lock, int state,
18472+ struct ww_acquire_ctx *ww_ctx,
18473 int (*slowfn)(struct rt_mutex *lock, int state,
18474 struct hrtimer_sleeper *timeout,
18475- enum rtmutex_chainwalk chwalk))
18476+ enum rtmutex_chainwalk chwalk,
18477+ struct ww_acquire_ctx *ww_ctx))
18478 {
18479 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18480 return 0;
18481
18482- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
18483+ /*
18484+ * If rt_mutex blocks, the function sched_submit_work will not call
18485+ * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
18486+	 * We must call blk_schedule_flush_plug here; if we don't call it,
18487+ * a deadlock in device mapper may happen.
18488+ */
18489+ if (unlikely(blk_needs_flush_plug(current)))
18490+ blk_schedule_flush_plug(current);
1a6e0f06 18491+
e4b2b4a8
JK
18492+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
18493 }
18494
18495 static inline int
18496 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
18497 struct hrtimer_sleeper *timeout,
18498 enum rtmutex_chainwalk chwalk,
18499+ struct ww_acquire_ctx *ww_ctx,
18500 int (*slowfn)(struct rt_mutex *lock, int state,
18501 struct hrtimer_sleeper *timeout,
18502- enum rtmutex_chainwalk chwalk))
18503+ enum rtmutex_chainwalk chwalk,
18504+ struct ww_acquire_ctx *ww_ctx))
18505 {
18506 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
18507 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18508 return 0;
18509
18510- return slowfn(lock, state, timeout, chwalk);
18511+ if (unlikely(blk_needs_flush_plug(current)))
18512+ blk_schedule_flush_plug(current);
1a6e0f06 18513+
e4b2b4a8
JK
18514+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
18515 }
18516
18517 static inline int
18518@@ -1444,9 +1987,11 @@
18519 /*
18520 * Performs the wakeup of the top-waiter and re-enables preemption.
18521 */
18522-void rt_mutex_postunlock(struct wake_q_head *wake_q)
18523+void rt_mutex_postunlock(struct wake_q_head *wake_q,
18524+ struct wake_q_head *wake_sleeper_q)
18525 {
18526 wake_up_q(wake_q);
18527+ wake_up_q_sleeper(wake_sleeper_q);
18528
18529 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
18530 preempt_enable();
18531@@ -1455,15 +2000,40 @@
18532 static inline void
18533 rt_mutex_fastunlock(struct rt_mutex *lock,
18534 bool (*slowfn)(struct rt_mutex *lock,
18535- struct wake_q_head *wqh))
18536+ struct wake_q_head *wqh,
18537+ struct wake_q_head *wq_sleeper))
18538 {
18539 DEFINE_WAKE_Q(wake_q);
18540+ DEFINE_WAKE_Q(wake_sleeper_q);
18541
18542 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18543 return;
18544
18545- if (slowfn(lock, &wake_q))
18546- rt_mutex_postunlock(&wake_q);
18547+ if (slowfn(lock, &wake_q, &wake_sleeper_q))
18548+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
1a6e0f06
JK
18549+}
18550+
e4b2b4a8 18551+int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
1a6e0f06 18552+{
e4b2b4a8
JK
18553+ might_sleep();
18554+ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
1a6e0f06
JK
18555+}
18556+
e4b2b4a8
JK
18557+/**
18558+ * rt_mutex_lock_state - lock a rt_mutex with a given state
18559+ *
18560+ * @lock: The rt_mutex to be locked
18561+ * @state: The state to set when blocking on the rt_mutex
18562+ */
18563+static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
1a6e0f06 18564+{
e4b2b4a8 18565+ int ret;
1a6e0f06 18566+
e4b2b4a8
JK
18567+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18568+ ret = __rt_mutex_lock_state(lock, state);
18569+ if (ret)
18570+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18571+ return ret;
18572 }
18573
18574 /**
18575@@ -1473,10 +2043,7 @@
18576 */
18577 void __sched rt_mutex_lock(struct rt_mutex *lock)
18578 {
18579- might_sleep();
18580-
18581- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18582- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
18583+ rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
18584 }
18585 EXPORT_SYMBOL_GPL(rt_mutex_lock);
18586
18587@@ -1491,16 +2058,7 @@
18588 */
18589 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
18590 {
18591- int ret;
18592-
18593- might_sleep();
18594-
18595- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18596- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
18597- if (ret)
18598- mutex_release(&lock->dep_map, 1, _RET_IP_);
18599-
18600- return ret;
18601+ return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
18602 }
18603 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
18604
18605@@ -1518,6 +2076,22 @@
18606 }
18607
18608 /**
18609+ * rt_mutex_lock_killable - lock a rt_mutex killable
18610+ *
18611+ * @lock: the rt_mutex to be locked
18613+ *
18614+ * Returns:
18615+ * 0 on success
18616+ * -EINTR when interrupted by a fatal signal
18617+ */
18618+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
1a6e0f06 18619+{
e4b2b4a8 18620+ return rt_mutex_lock_state(lock, TASK_KILLABLE);
1a6e0f06 18621+}
e4b2b4a8 18622+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
1a6e0f06 18623+
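
Because rt_mutex_lock_killable() blocks in TASK_KILLABLE, a caller has to be prepared for -EINTR when a fatal signal arrives while it waits. A hedged usage sketch; the surrounding function and its error path are invented:

#include <linux/rtmutex.h>

/* Illustrative caller: give up cleanly if the task is killed while waiting. */
static int demo_do_locked_work(struct rt_mutex *lock)
{
	int ret;

	ret = rt_mutex_lock_killable(lock);
	if (ret)		/* -EINTR: fatal signal while blocked */
		return ret;

	/* ... critical section ... */

	rt_mutex_unlock(lock);
	return 0;
}
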
e4b2b4a8
JK
18624+/**
18625 * rt_mutex_timed_lock - lock a rt_mutex interruptible
18626 * the timeout structure is provided
18627 * by the caller
18628@@ -1540,6 +2114,7 @@
18629 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18630 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
18631 RT_MUTEX_MIN_CHAINWALK,
18632+ NULL,
18633 rt_mutex_slowlock);
18634 if (ret)
18635 mutex_release(&lock->dep_map, 1, _RET_IP_);
18636@@ -1548,6 +2123,18 @@
18637 }
18638 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
18639
18640+int __sched __rt_mutex_trylock(struct rt_mutex *lock)
1a6e0f06 18641+{
e4b2b4a8
JK
18642+#ifdef CONFIG_PREEMPT_RT_FULL
18643+ if (WARN_ON_ONCE(in_irq() || in_nmi()))
1a6e0f06 18644+#else
e4b2b4a8
JK
18645+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18646+#endif
18647+ return 0;
1a6e0f06 18648+
e4b2b4a8 18649+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
1a6e0f06 18650+}
e4b2b4a8
JK
18651+
18652 /**
18653 * rt_mutex_trylock - try to lock a rt_mutex
18654 *
18655@@ -1563,10 +2150,7 @@
18656 {
18657 int ret;
1a6e0f06 18658
e4b2b4a8
JK
18659- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18660- return 0;
18661-
18662- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
18663+ ret = __rt_mutex_trylock(lock);
18664 if (ret)
18665 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
1a6e0f06 18666
e4b2b4a8
JK
18667@@ -1574,6 +2158,11 @@
18668 }
18669 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
1a6e0f06 18670
e4b2b4a8
JK
18671+void __sched __rt_mutex_unlock(struct rt_mutex *lock)
18672+{
18673+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18674+}
1a6e0f06 18675+
e4b2b4a8
JK
18676 /**
18677 * rt_mutex_unlock - unlock a rt_mutex
18678 *
18679@@ -1582,16 +2171,13 @@
18680 void __sched rt_mutex_unlock(struct rt_mutex *lock)
18681 {
18682 mutex_release(&lock->dep_map, 1, _RET_IP_);
18683- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18684+ __rt_mutex_unlock(lock);
18685 }
18686 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
18687
18688-/**
18689- * Futex variant, that since futex variants do not use the fast-path, can be
18690- * simple and will not need to retry.
18691- */
18692-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18693- struct wake_q_head *wake_q)
18694+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18695+ struct wake_q_head *wake_q,
18696+ struct wake_q_head *wq_sleeper)
18697 {
18698 lockdep_assert_held(&lock->wait_lock);
18699
18700@@ -1608,22 +2194,35 @@
18701 * avoid inversion prior to the wakeup. preempt_disable()
18702 * therein pairs with rt_mutex_postunlock().
18703 */
18704- mark_wakeup_next_waiter(wake_q, lock);
18705+ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
1a6e0f06 18706
e4b2b4a8
JK
18707 return true; /* call postunlock() */
18708 }
1a6e0f06 18709
e4b2b4a8
JK
18710+/**
18711+ * Futex variant, that since futex variants do not use the fast-path, can be
18712+ * simple and will not need to retry.
18713+ */
18714+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18715+ struct wake_q_head *wake_q,
18716+ struct wake_q_head *wq_sleeper)
1a6e0f06 18717+{
e4b2b4a8 18718+ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
1a6e0f06
JK
18719+}
18720+
e4b2b4a8
JK
18721 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
18722 {
18723 DEFINE_WAKE_Q(wake_q);
18724+ DEFINE_WAKE_Q(wake_sleeper_q);
18725+ unsigned long flags;
18726 bool postunlock;
1a6e0f06 18727
e4b2b4a8
JK
18728- raw_spin_lock_irq(&lock->wait_lock);
18729- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
18730- raw_spin_unlock_irq(&lock->wait_lock);
18731+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18732+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
18733+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 18734
e4b2b4a8
JK
18735 if (postunlock)
18736- rt_mutex_postunlock(&wake_q);
18737+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18738 }
1a6e0f06 18739
e4b2b4a8
JK
18740 /**
18741@@ -1662,7 +2261,7 @@
18742 if (name && key)
18743 debug_rt_mutex_init(lock, name, key);
18744 }
18745-EXPORT_SYMBOL_GPL(__rt_mutex_init);
18746+EXPORT_SYMBOL(__rt_mutex_init);
1a6e0f06 18747
e4b2b4a8
JK
18748 /**
18749 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
18750@@ -1682,6 +2281,14 @@
18751 struct task_struct *proxy_owner)
18752 {
18753 __rt_mutex_init(lock, NULL, NULL);
18754+#ifdef CONFIG_DEBUG_SPINLOCK
18755+ /*
18756+	 * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI are
18757+ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
18758+ * lock.
18759+ */
18760+ raw_spin_lock_init(&lock->wait_lock);
1a6e0f06 18761+#endif
e4b2b4a8
JK
18762 debug_rt_mutex_proxy_lock(lock, proxy_owner);
18763 rt_mutex_set_owner(lock, proxy_owner);
18764 }
18765@@ -1714,6 +2321,34 @@
18766 if (try_to_take_rt_mutex(lock, task, NULL))
18767 return 1;
1a6e0f06 18768
1a6e0f06 18769+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8
JK
18770+ /*
18771+ * In PREEMPT_RT there's an added race.
18772+	 * If the task that we are about to requeue times out,
18773+ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
18774+ * to skip this task. But right after the task sets
18775+ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
18776+ * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
18777+ * This will replace the PI_WAKEUP_INPROGRESS with the actual
18778+ * lock that it blocks on. We *must not* place this task
18779+ * on this proxy lock in that case.
18780+ *
18781+ * To prevent this race, we first take the task's pi_lock
18782+ * and check if it has updated its pi_blocked_on. If it has,
18783+ * we assume that it woke up and we return -EAGAIN.
18784+ * Otherwise, we set the task's pi_blocked_on to
18785+ * PI_REQUEUE_INPROGRESS, so that if the task is waking up
18786+ * it will know that we are in the process of requeuing it.
18787+ */
18788+ raw_spin_lock(&task->pi_lock);
18789+ if (task->pi_blocked_on) {
18790+ raw_spin_unlock(&task->pi_lock);
18791+ return -EAGAIN;
18792+ }
18793+ task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
18794+ raw_spin_unlock(&task->pi_lock);
1a6e0f06 18795+#endif
1a6e0f06 18796+
e4b2b4a8
JK
18797 /* We enforce deadlock detection for futexes */
18798 ret = task_blocks_on_rt_mutex(lock, waiter, task,
18799 RT_MUTEX_FULL_CHAINWALK);
18800@@ -1728,7 +2363,7 @@
18801 ret = 0;
18802 }
1a6e0f06 18803
e4b2b4a8
JK
18804- if (unlikely(ret))
18805+ if (ret && rt_mutex_has_waiters(lock))
18806 remove_waiter(lock, waiter);
1a6e0f06 18807
e4b2b4a8
JK
18808 debug_rt_mutex_print_deadlock(waiter);
18809@@ -1803,17 +2438,36 @@
18810 struct hrtimer_sleeper *to,
18811 struct rt_mutex_waiter *waiter)
18812 {
18813+ struct task_struct *tsk = current;
18814 int ret;
18815
18816 raw_spin_lock_irq(&lock->wait_lock);
18817 /* sleep on the mutex */
18818 set_current_state(TASK_INTERRUPTIBLE);
18819- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
18820+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
18821 /*
18822 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
18823 * have to fix that up.
18824 */
18825 fixup_rt_mutex_waiters(lock);
18826+ /*
18827+ * RT has a problem here when the wait got interrupted by a timeout
18828+ * or a signal. task->pi_blocked_on is still set. The task must
18829+ * acquire the hash bucket lock when returning from this function.
18830+ *
18831+ * If the hash bucket lock is contended then the
18832+ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
18833+ * task_blocks_on_rt_mutex() will trigger. This can be avoided by
18834+ * clearing task->pi_blocked_on which removes the task from the
18835+ * boosting chain of the rtmutex. That's correct because the task
18836+ * is not longer blocked on it.
18837+ */
18838+ if (ret) {
18839+ raw_spin_lock(&tsk->pi_lock);
18840+ tsk->pi_blocked_on = NULL;
18841+ raw_spin_unlock(&tsk->pi_lock);
18842+ }
1a6e0f06 18843+
e4b2b4a8 18844 raw_spin_unlock_irq(&lock->wait_lock);
1a6e0f06 18845
e4b2b4a8
JK
18846 return ret;
18847@@ -1874,3 +2528,99 @@
1a6e0f06 18848
e4b2b4a8 18849 return cleanup;
1a6e0f06 18850 }
e4b2b4a8
JK
18851+
18852+static inline int
18853+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
1a6e0f06 18854+{
e4b2b4a8
JK
18855+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
18856+ unsigned tmp;
18857+
18858+ if (ctx->deadlock_inject_countdown-- == 0) {
18859+ tmp = ctx->deadlock_inject_interval;
18860+ if (tmp > UINT_MAX/4)
18861+ tmp = UINT_MAX;
18862+ else
18863+ tmp = tmp*2 + tmp + tmp/2;
18864+
18865+ ctx->deadlock_inject_interval = tmp;
18866+ ctx->deadlock_inject_countdown = tmp;
18867+ ctx->contending_lock = lock;
18868+
18869+ ww_mutex_unlock(lock);
18870+
18871+ return -EDEADLK;
18872+ }
1a6e0f06
JK
18873+#endif
18874+
e4b2b4a8 18875+ return 0;
1a6e0f06
JK
18876+}
18877+
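
With CONFIG_DEBUG_WW_MUTEX_SLOWPATH enabled, ww_mutex_deadlock_injection() above fails one acquisition on purpose whenever the per-context countdown expires, then grows the interval by roughly 3.5x (tmp*2 + tmp + tmp/2), saturating at UINT_MAX. A small standalone sketch of just that arithmetic; the starting value of 1 is only an example:

#include <limits.h>
#include <stdio.h>

/* Next injection interval, same arithmetic as ww_mutex_deadlock_injection(). */
static unsigned int demo_next_interval(unsigned int tmp)
{
	if (tmp > UINT_MAX / 4)
		return UINT_MAX;
	return tmp * 2 + tmp + tmp / 2;		/* ~3.5x growth */
}

int main(void)
{
	unsigned int v = 1;
	int i;

	for (i = 0; i < 8; i++) {
		printf("%u\n", v);		/* 1, 3, 10, 35, 122, 427, ... */
		v = demo_next_interval(v);
	}
	return 0;
}
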
e4b2b4a8
JK
18878+#ifdef CONFIG_PREEMPT_RT_FULL
18879+int __sched
18880+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
1a6e0f06 18881+{
e4b2b4a8 18882+ int ret;
1a6e0f06 18883+
e4b2b4a8 18884+ might_sleep();
1a6e0f06 18885+
e4b2b4a8
JK
18886+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18887+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
18888+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
18889+ ctx);
18890+ if (ret)
18891+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18892+ else if (!ret && ctx && ctx->acquired > 1)
18893+ return ww_mutex_deadlock_injection(lock, ctx);
1a6e0f06 18894+
e4b2b4a8 18895+ return ret;
1a6e0f06 18896+}
e4b2b4a8 18897+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
1a6e0f06 18898+
e4b2b4a8
JK
18899+int __sched
18900+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
1a6e0f06 18901+{
e4b2b4a8 18902+ int ret;
1a6e0f06 18903+
e4b2b4a8 18904+ might_sleep();
1a6e0f06 18905+
e4b2b4a8
JK
18906+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18907+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
18908+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
18909+ ctx);
18910+ if (ret)
18911+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18912+ else if (!ret && ctx && ctx->acquired > 1)
18913+ return ww_mutex_deadlock_injection(lock, ctx);
18914+
18915+ return ret;
1a6e0f06 18916+}
e4b2b4a8 18917+EXPORT_SYMBOL_GPL(ww_mutex_lock);
1a6e0f06 18918+
e4b2b4a8 18919+void __sched ww_mutex_unlock(struct ww_mutex *lock)
1a6e0f06 18920+{
e4b2b4a8 18921+ int nest = !!lock->ctx;
1a6e0f06 18922+
e4b2b4a8
JK
18923+ /*
18924+ * The unlocking fastpath is the 0->1 transition from 'locked'
18925+ * into 'unlocked' state:
18926+ */
18927+ if (nest) {
18928+#ifdef CONFIG_DEBUG_MUTEXES
18929+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
1a6e0f06 18930+#endif
e4b2b4a8
JK
18931+ if (lock->ctx->acquired > 0)
18932+ lock->ctx->acquired--;
18933+ lock->ctx = NULL;
18934+ }
18935+
18936+ mutex_release(&lock->base.dep_map, nest, _RET_IP_);
18937+ __rt_mutex_unlock(&lock->base.lock);
1a6e0f06 18938+}
e4b2b4a8 18939+EXPORT_SYMBOL(ww_mutex_unlock);
1a6e0f06 18940+
e4b2b4a8 18941+int __rt_mutex_owner_current(struct rt_mutex *lock)
1a6e0f06 18942+{
e4b2b4a8 18943+ return rt_mutex_owner(lock) == current;
1a6e0f06 18944+}
e4b2b4a8 18945+EXPORT_SYMBOL(__rt_mutex_owner_current);
1a6e0f06 18946+#endif
e4b2b4a8
JK
18947diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex_common.h linux-4.14/kernel/locking/rtmutex_common.h
18948--- linux-4.14.orig/kernel/locking/rtmutex_common.h 2018-09-05 11:03:22.000000000 +0200
18949+++ linux-4.14/kernel/locking/rtmutex_common.h 2018-09-05 11:05:07.000000000 +0200
18950@@ -15,6 +15,7 @@
1a6e0f06 18951
e4b2b4a8
JK
18952 #include <linux/rtmutex.h>
18953 #include <linux/sched/wake_q.h>
18954+#include <linux/sched/debug.h>
1a6e0f06 18955
e4b2b4a8
JK
18956 /*
18957 * This is the control structure for tasks blocked on a rt_mutex,
18958@@ -29,6 +30,7 @@
18959 struct rb_node pi_tree_entry;
18960 struct task_struct *task;
18961 struct rt_mutex *lock;
18962+ bool savestate;
18963 #ifdef CONFIG_DEBUG_RT_MUTEXES
18964 unsigned long ip;
18965 struct pid *deadlock_task_pid;
18966@@ -129,12 +131,15 @@
18967 /*
18968 * PI-futex support (proxy locking functions, etc.):
18969 */
18970+#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
18971+#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
18972+
18973 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
18974 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
18975 struct task_struct *proxy_owner);
18976 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
18977 struct task_struct *proxy_owner);
18978-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
18979+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
18980 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
18981 struct rt_mutex_waiter *waiter,
18982 struct task_struct *task);
18983@@ -152,9 +157,27 @@
18984
18985 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
18986 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
18987- struct wake_q_head *wqh);
18988+ struct wake_q_head *wqh,
18989+ struct wake_q_head *wq_sleeper);
18990
18991-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
18992+extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
18993+ struct wake_q_head *wake_sleeper_q);
18994+
18995+/* RW semaphore special interface */
18996+struct ww_acquire_ctx;
18997+
18998+extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
18999+extern int __rt_mutex_trylock(struct rt_mutex *lock);
19000+extern void __rt_mutex_unlock(struct rt_mutex *lock);
19001+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19002+ struct hrtimer_sleeper *timeout,
19003+ enum rtmutex_chainwalk chwalk,
19004+ struct ww_acquire_ctx *ww_ctx,
19005+ struct rt_mutex_waiter *waiter);
19006+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19007+ struct rt_mutex_waiter *waiter,
19008+ unsigned long flags);
19009+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
1a6e0f06 19010
e4b2b4a8
JK
19011 #ifdef CONFIG_DEBUG_RT_MUTEXES
19012 # include "rtmutex-debug.h"
19013diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwlock-rt.c linux-4.14/kernel/locking/rwlock-rt.c
19014--- linux-4.14.orig/kernel/locking/rwlock-rt.c 1970-01-01 01:00:00.000000000 +0100
19015+++ linux-4.14/kernel/locking/rwlock-rt.c 2018-09-05 11:05:07.000000000 +0200
19016@@ -0,0 +1,378 @@
19017+/*
19018+ */
19019+#include <linux/sched/debug.h>
19020+#include <linux/export.h>
19021+
19022+#include "rtmutex_common.h"
19023+#include <linux/rwlock_types_rt.h>
19024+
19025+/*
19026+ * RT-specific reader/writer locks
19027+ *
19028+ * write_lock()
19029+ * 1) Lock lock->rtmutex
19030+ * 2) Remove the reader BIAS to force readers into the slow path
19031+ * 3) Wait until all readers have left the critical region
19032+ * 4) Mark it write locked
19033+ *
19034+ * write_unlock()
19035+ * 1) Remove the write locked marker
19036+ * 2) Set the reader BIAS so readers can use the fast path again
19037+ * 3) Unlock lock->rtmutex to release blocked readers
19038+ *
19039+ * read_lock()
19040+ * 1) Try fast path acquisition (reader BIAS is set)
19041+ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19042+ * 3) If !writelocked, acquire it for read
19043+ * 4) If writelocked, block on lock->rtmutex
19044+ * 5) unlock lock->rtmutex, goto 1)
19045+ *
19046+ * read_unlock()
19047+ * 1) Try fast path release (reader count != 1)
19048+ * 2) Wake the writer waiting in write_lock()#3
19049+ *
19050+ * read_lock()#3 has the consequence that rw locks on RT are not writer
19051+ * fair, but writers, which should be avoided in RT tasks (think tasklist
19052+ * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19053+ *
19054+ * It's possible to make the rw locks writer fair by keeping a list of
19055+ * active readers. A blocked writer would force all newly incoming readers
19056+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
19057+ * for one reader after the other. We can't use multi-reader inheritance
19058+ * because there is no way to support that with
19059+ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19060+ * mechanism is a major surgery for a very dubious value.
19061+ *
19062+ * The risk of writer starvation is there, but the pathological use cases
19063+ * which trigger it are not necessarily the typical RT workloads.
19064+ */
19065+
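
The read/write ordering described above is carried entirely by the single lock->readers counter. A toy model of its states; the actual READER_BIAS/WRITER_BIAS values come from the rwlock/rwsem headers and are not spelled out here:

#include <stdbool.h>

/*
 * lock->readers, as used by the functions below:
 *
 *   READER_BIAS + n   reader biased, n active readers, read fast path open
 *   n      (n >= 0)   a writer removed the bias and waits for n readers to leave
 *   WRITER_BIAS       write locked, new readers block on the rtmutex
 *
 * The read fast path only cares about the sign of the counter, exactly as
 * rt_read_can_lock() does further down.
 */
static bool demo_read_fastpath_open(int readers)
{
	return readers < 0;	/* READER_BIAS keeps the counter negative */
}
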
19066+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19067+ struct lock_class_key *key)
1a6e0f06 19068+{
e4b2b4a8
JK
19069+#ifdef CONFIG_DEBUG_LOCK_ALLOC
19070+ /*
19071+	 * Make sure we are not reinitializing a held lock:
19072+ */
19073+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19074+ lockdep_init_map(&lock->dep_map, name, key, 0);
1a6e0f06 19075+#endif
e4b2b4a8
JK
19076+ atomic_set(&lock->readers, READER_BIAS);
19077+ rt_mutex_init(&lock->rtmutex);
19078+ lock->rtmutex.save_state = 1;
1a6e0f06
JK
19079+}
19080+
e4b2b4a8 19081+int __read_rt_trylock(struct rt_rw_lock *lock)
1a6e0f06 19082+{
e4b2b4a8 19083+ int r, old;
1a6e0f06 19084+
e4b2b4a8
JK
19085+ /*
19086+ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19087+ * set.
19088+ */
19089+ for (r = atomic_read(&lock->readers); r < 0;) {
19090+ old = atomic_cmpxchg(&lock->readers, r, r + 1);
19091+ if (likely(old == r))
19092+ return 1;
19093+ r = old;
19094+ }
19095+ return 0;
1a6e0f06
JK
19096+}
19097+
e4b2b4a8 19098+void __sched __read_rt_lock(struct rt_rw_lock *lock)
1a6e0f06 19099+{
e4b2b4a8
JK
19100+ struct rt_mutex *m = &lock->rtmutex;
19101+ struct rt_mutex_waiter waiter;
19102+ unsigned long flags;
1a6e0f06 19103+
e4b2b4a8
JK
19104+ if (__read_rt_trylock(lock))
19105+ return;
19106+
19107+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19108+ /*
19109+ * Allow readers as long as the writer has not completely
19110+ * acquired the semaphore for write.
19111+ */
19112+ if (atomic_read(&lock->readers) != WRITER_BIAS) {
19113+ atomic_inc(&lock->readers);
19114+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19115+ return;
19116+ }
19117+
19118+ /*
19119+ * Call into the slow lock path with the rtmutex->wait_lock
19120+ * held, so this can't result in the following race:
19121+ *
19122+ * Reader1 Reader2 Writer
19123+ * read_lock()
19124+ * write_lock()
19125+ * rtmutex_lock(m)
19126+ * swait()
19127+ * read_lock()
19128+ * unlock(m->wait_lock)
19129+ * read_unlock()
19130+ * swake()
19131+ * lock(m->wait_lock)
19132+ * lock->writelocked=true
19133+ * unlock(m->wait_lock)
19134+ *
19135+ * write_unlock()
19136+ * lock->writelocked=false
19137+ * rtmutex_unlock(m)
19138+ * read_lock()
19139+ * write_lock()
19140+ * rtmutex_lock(m)
19141+ * swait()
19142+ * rtmutex_lock(m)
19143+ *
19144+ * That would put Reader1 behind the writer waiting on
19145+ * Reader2 to call read_unlock() which might be unbound.
19146+ */
19147+ rt_mutex_init_waiter(&waiter, false);
19148+ rt_spin_lock_slowlock_locked(m, &waiter, flags);
19149+ /*
19150+	 * The slowlock() above is guaranteed to return with the rtmutex
19151+ * now held, so there can't be a writer active. Increment the reader
19152+ * count and immediately drop the rtmutex again.
19153+ */
19154+ atomic_inc(&lock->readers);
19155+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19156+ rt_spin_lock_slowunlock(m);
19157+
19158+ debug_rt_mutex_free_waiter(&waiter);
1a6e0f06
JK
19159+}
19160+
e4b2b4a8 19161+void __read_rt_unlock(struct rt_rw_lock *lock)
1a6e0f06 19162+{
e4b2b4a8
JK
19163+ struct rt_mutex *m = &lock->rtmutex;
19164+ struct task_struct *tsk;
19165+
19166+ /*
19167+	 * lock->readers can only hit 0 when a writer is waiting for the
19168+ * active readers to leave the critical region.
19169+ */
19170+ if (!atomic_dec_and_test(&lock->readers))
19171+ return;
19172+
19173+ raw_spin_lock_irq(&m->wait_lock);
19174+ /*
19175+ * Wake the writer, i.e. the rtmutex owner. It might release the
19176+ * rtmutex concurrently in the fast path, but to clean up the rw
19177+ * lock it needs to acquire m->wait_lock. The worst case which can
19178+ * happen is a spurious wakeup.
19179+ */
19180+ tsk = rt_mutex_owner(m);
19181+ if (tsk)
19182+ wake_up_process(tsk);
19183+
19184+ raw_spin_unlock_irq(&m->wait_lock);
1a6e0f06
JK
19185+}
19186+
e4b2b4a8
JK
19187+static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19188+ unsigned long flags)
1a6e0f06 19189+{
e4b2b4a8 19190+ struct rt_mutex *m = &lock->rtmutex;
1a6e0f06 19191+
e4b2b4a8
JK
19192+ atomic_add(READER_BIAS - bias, &lock->readers);
19193+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19194+ rt_spin_lock_slowunlock(m);
1a6e0f06
JK
19195+}
19196+
e4b2b4a8 19197+void __sched __write_rt_lock(struct rt_rw_lock *lock)
1a6e0f06 19198+{
e4b2b4a8
JK
19199+ struct rt_mutex *m = &lock->rtmutex;
19200+ struct task_struct *self = current;
19201+ unsigned long flags;
19202+
19203+ /* Take the rtmutex as a first step */
19204+ __rt_spin_lock(m);
19205+
19206+ /* Force readers into slow path */
19207+ atomic_sub(READER_BIAS, &lock->readers);
19208+
19209+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19210+
19211+ raw_spin_lock(&self->pi_lock);
19212+ self->saved_state = self->state;
19213+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19214+ raw_spin_unlock(&self->pi_lock);
19215+
19216+ for (;;) {
19217+ /* Have all readers left the critical region? */
19218+ if (!atomic_read(&lock->readers)) {
19219+ atomic_set(&lock->readers, WRITER_BIAS);
19220+ raw_spin_lock(&self->pi_lock);
19221+ __set_current_state_no_track(self->saved_state);
19222+ self->saved_state = TASK_RUNNING;
19223+ raw_spin_unlock(&self->pi_lock);
19224+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19225+ return;
19226+ }
19227+
19228+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19229+
19230+ if (atomic_read(&lock->readers) != 0)
19231+ schedule();
19232+
19233+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19234+
19235+ raw_spin_lock(&self->pi_lock);
19236+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19237+ raw_spin_unlock(&self->pi_lock);
19238+ }
19239+}
19240+
19241+int __write_rt_trylock(struct rt_rw_lock *lock)
1a6e0f06 19242+{
e4b2b4a8
JK
19243+ struct rt_mutex *m = &lock->rtmutex;
19244+ unsigned long flags;
19245+
19246+ if (!__rt_mutex_trylock(m))
19247+ return 0;
19248+
19249+ atomic_sub(READER_BIAS, &lock->readers);
19250+
19251+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19252+ if (!atomic_read(&lock->readers)) {
19253+ atomic_set(&lock->readers, WRITER_BIAS);
19254+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19255+ return 1;
19256+ }
19257+ __write_unlock_common(lock, 0, flags);
19258+ return 0;
1a6e0f06
JK
19259+}
19260+
e4b2b4a8
JK
19261+void __write_rt_unlock(struct rt_rw_lock *lock)
19262+{
19263+ struct rt_mutex *m = &lock->rtmutex;
19264+ unsigned long flags;
1a6e0f06 19265+
e4b2b4a8
JK
19266+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19267+ __write_unlock_common(lock, WRITER_BIAS, flags);
19268+}
1a6e0f06 19269+
e4b2b4a8
JK
19270+/* Map the reader biased implementation */
19271+static inline int do_read_rt_trylock(rwlock_t *rwlock)
1a6e0f06 19272+{
e4b2b4a8 19273+ return __read_rt_trylock(rwlock);
1a6e0f06
JK
19274+}
19275+
e4b2b4a8
JK
19276+static inline int do_write_rt_trylock(rwlock_t *rwlock)
19277+{
19278+ return __write_rt_trylock(rwlock);
19279+}
1a6e0f06 19280+
e4b2b4a8
JK
19281+static inline void do_read_rt_lock(rwlock_t *rwlock)
19282+{
19283+ __read_rt_lock(rwlock);
19284+}
1a6e0f06 19285+
e4b2b4a8
JK
19286+static inline void do_write_rt_lock(rwlock_t *rwlock)
19287+{
19288+ __write_rt_lock(rwlock);
19289+}
1a6e0f06 19290+
e4b2b4a8
JK
19291+static inline void do_read_rt_unlock(rwlock_t *rwlock)
19292+{
19293+ __read_rt_unlock(rwlock);
19294+}
1a6e0f06 19295+
e4b2b4a8
JK
19296+static inline void do_write_rt_unlock(rwlock_t *rwlock)
19297+{
19298+ __write_rt_unlock(rwlock);
19299+}
1a6e0f06 19300+
e4b2b4a8
JK
19301+static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19302+ struct lock_class_key *key)
19303+{
19304+ __rwlock_biased_rt_init(rwlock, name, key);
19305+}
1a6e0f06 19306+
e4b2b4a8
JK
19307+int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19308+{
19309+ return atomic_read(&rwlock->readers) < 0;
19310+}
1a6e0f06 19311+
e4b2b4a8
JK
19312+int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19313+{
19314+ return atomic_read(&rwlock->readers) == READER_BIAS;
19315+}
1a6e0f06
JK
19316+
19317+/*
e4b2b4a8 19318+ * The common functions which get wrapped into the rwlock API.
1a6e0f06 19319+ */
e4b2b4a8
JK
19320+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19321+{
19322+ int ret;
1a6e0f06 19323+
e4b2b4a8
JK
19324+ sleeping_lock_inc();
19325+ migrate_disable();
19326+ ret = do_read_rt_trylock(rwlock);
19327+ if (ret) {
19328+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19329+ } else {
19330+ migrate_enable();
19331+ sleeping_lock_dec();
19332+ }
19333+ return ret;
19334+}
19335+EXPORT_SYMBOL(rt_read_trylock);
1a6e0f06 19336+
e4b2b4a8
JK
19337+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19338+{
19339+ int ret;
1a6e0f06 19340+
e4b2b4a8
JK
19341+ sleeping_lock_inc();
19342+ migrate_disable();
19343+ ret = do_write_rt_trylock(rwlock);
19344+ if (ret) {
19345+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19346+ } else {
19347+ migrate_enable();
19348+ sleeping_lock_dec();
19349+ }
19350+ return ret;
19351+}
19352+EXPORT_SYMBOL(rt_write_trylock);
19353+
19354+void __lockfunc rt_read_lock(rwlock_t *rwlock)
19355+{
19356+ sleeping_lock_inc();
19357+ migrate_disable();
19358+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19359+ do_read_rt_lock(rwlock);
19360+}
19361+EXPORT_SYMBOL(rt_read_lock);
19362+
19363+void __lockfunc rt_write_lock(rwlock_t *rwlock)
19364+{
19365+ sleeping_lock_inc();
19366+ migrate_disable();
19367+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19368+ do_write_rt_lock(rwlock);
19369+}
19370+EXPORT_SYMBOL(rt_write_lock);
19371+
19372+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19373+{
19374+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19375+ do_read_rt_unlock(rwlock);
19376+ migrate_enable();
19377+ sleeping_lock_dec();
19378+}
19379+EXPORT_SYMBOL(rt_read_unlock);
19380+
19381+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19382+{
19383+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19384+ do_write_rt_unlock(rwlock);
19385+ migrate_enable();
19386+ sleeping_lock_dec();
19387+}
19388+EXPORT_SYMBOL(rt_write_unlock);
19389+
19390+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19391+{
19392+ do_rwlock_rt_init(rwlock, name, key);
19393+}
19394+EXPORT_SYMBOL(__rt_rwlock_init);
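
[Editor's note — illustration only, not part of the patch. The rt_read_can_lock()/rt_write_can_lock() helpers above encode the reader-biased counter states. The sketch below exercises those predicates in plain userspace C; MODEL_READER_BIAS/MODEL_WRITER_BIAS are stand-in values chosen for the model, the real constants are defined elsewhere in the patch.]

/* Three observable states of the reader-biased counter: a negative count
 * means "no writer, readers may enter"; the exact bias value means fully
 * idle; a positive value marks the lock as write locked. */
#include <stdio.h>

#define MODEL_READER_BIAS  (-(1 << 16))	/* stand-in for READER_BIAS */
#define MODEL_WRITER_BIAS  (1 << 16)	/* stand-in for WRITER_BIAS */

static int model_read_can_lock(int readers)  { return readers < 0; }
static int model_write_can_lock(int readers) { return readers == MODEL_READER_BIAS; }

int main(void)
{
	int idle = MODEL_READER_BIAS;			/* nobody holds the lock */
	int two_readers = MODEL_READER_BIAS + 2;	/* two readers inside */
	int write_locked = MODEL_WRITER_BIAS;		/* writer owns it */

	printf("idle:        read %d write %d\n",
	       model_read_can_lock(idle), model_write_can_lock(idle));
	printf("two readers: read %d write %d\n",
	       model_read_can_lock(two_readers), model_write_can_lock(two_readers));
	printf("write lock:  read %d write %d\n",
	       model_read_can_lock(write_locked), model_write_can_lock(write_locked));
	return 0;
}
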
19395diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwsem-rt.c linux-4.14/kernel/locking/rwsem-rt.c
19396--- linux-4.14.orig/kernel/locking/rwsem-rt.c 1970-01-01 01:00:00.000000000 +0100
19397+++ linux-4.14/kernel/locking/rwsem-rt.c 2018-09-05 11:05:07.000000000 +0200
19398@@ -0,0 +1,269 @@
19399+/*
19400+ */
19401+#include <linux/rwsem.h>
19402+#include <linux/sched/debug.h>
19403+#include <linux/sched/signal.h>
19404+#include <linux/export.h>
19405+
19406+#include "rtmutex_common.h"
19407+
19408+/*
19409+ * RT-specific reader/writer semaphores
19410+ *
19411+ * down_write()
19412+ * 1) Lock sem->rtmutex
19413+ * 2) Remove the reader BIAS to force readers into the slow path
19414+ * 3) Wait until all readers have left the critical region
19415+ * 4) Mark it write locked
19416+ *
19417+ * up_write()
19418+ * 1) Remove the write locked marker
19419+ * 2) Set the reader BIAS so readers can use the fast path again
19420+ * 3) Unlock sem->rtmutex to release blocked readers
19421+ *
19422+ * down_read()
19423+ * 1) Try fast path acquisition (reader BIAS is set)
19424+ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
19425+ * 3) If !writelocked, acquire it for read
19426+ * 4) If writelocked, block on sem->rtmutex
19427+ * 5) unlock sem->rtmutex, goto 1)
19428+ *
19429+ * up_read()
19430+ * 1) Try fast path release (reader count != 1)
19431+ * 2) Wake the writer waiting in down_write()#3
19432+ *
19433+ * down_read()#3 has the consequence that rw semaphores on RT are not writer
19434+ * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
19435+ * are subject to the rtmutex priority/DL inheritance mechanism.
19436+ *
19437+ * It's possible to make the rw semaphores writer fair by keeping a list of
19438+ * active readers. A blocked writer would force all newly incoming readers to
19439+ * block on the rtmutex, but the rtmutex would have to be proxy locked for one
19440+ * reader after the other. We can't use multi-reader inheritance because there
19441+ * is no way to support that with SCHED_DEADLINE. Implementing the one by one
19442+ * reader boosting/handover mechanism is a major surgery for a very dubious
19443+ * value.
19444+ *
19445+ * The risk of writer starvation is there, but the pathological use cases
19446+ * which trigger it are not necessarily the typical RT workloads.
19447+ */
19448+
19449+void __rwsem_init(struct rw_semaphore *sem, const char *name,
19450+ struct lock_class_key *key)
19451+{
19452+#ifdef CONFIG_DEBUG_LOCK_ALLOC
19453+ /*
19454+ * Make sure we are not reinitializing a held semaphore:
19455+ */
19456+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
19457+ lockdep_init_map(&sem->dep_map, name, key, 0);
19458+#endif
19459+ atomic_set(&sem->readers, READER_BIAS);
19460+}
19461+EXPORT_SYMBOL(__rwsem_init);
19462+
19463+int __down_read_trylock(struct rw_semaphore *sem)
19464+{
19465+ int r, old;
19466+
19467+ /*
19468+ * Increment the reader count if sem->readers < 0, i.e. READER_BIAS is
19469+ * set.
19470+ */
19471+ for (r = atomic_read(&sem->readers); r < 0;) {
19472+ old = atomic_cmpxchg(&sem->readers, r, r + 1);
19473+ if (likely(old == r))
19474+ return 1;
19475+ r = old;
19476+ }
19477+ return 0;
19478+}
19479+
19480+void __sched __down_read(struct rw_semaphore *sem)
19481+{
19482+ struct rt_mutex *m = &sem->rtmutex;
19483+ struct rt_mutex_waiter waiter;
19484+
19485+ if (__down_read_trylock(sem))
19486+ return;
19487+
19488+ might_sleep();
19489+ raw_spin_lock_irq(&m->wait_lock);
19490+ /*
19491+ * Allow readers as long as the writer has not completely
19492+ * acquired the semaphore for write.
19493+ */
19494+ if (atomic_read(&sem->readers) != WRITER_BIAS) {
19495+ atomic_inc(&sem->readers);
19496+ raw_spin_unlock_irq(&m->wait_lock);
19497+ return;
19498+ }
19499+
19500+ /*
19501+ * Call into the slow lock path with the rtmutex->wait_lock
19502+ * held, so this can't result in the following race:
19503+ *
19504+ * Reader1 Reader2 Writer
19505+ * down_read()
19506+ * down_write()
19507+ * rtmutex_lock(m)
19508+ * swait()
19509+ * down_read()
19510+ * unlock(m->wait_lock)
19511+ * up_read()
19512+ * swake()
19513+ * lock(m->wait_lock)
19514+ * sem->writelocked=true
19515+ * unlock(m->wait_lock)
19516+ *
19517+ * up_write()
19518+ * sem->writelocked=false
19519+ * rtmutex_unlock(m)
19520+ * down_read()
19521+ * down_write()
19522+ * rtmutex_lock(m)
19523+ * swait()
19524+ * rtmutex_lock(m)
19525+ *
19526+ * That would put Reader1 behind the writer waiting on
19527+ * Reader2 to call up_read() which might be unbound.
19528+ */
19529+ rt_mutex_init_waiter(&waiter, false);
19530+ rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
19531+ RT_MUTEX_MIN_CHAINWALK, NULL,
19532+ &waiter);
19533+ /*
19534+ * The slowlock() above is guaranteed to return with the rtmutex
19535+ * now held, so there can't be a writer active. Increment the reader
19536+ * count and immediately drop the rtmutex again.
19537+ */
19538+ atomic_inc(&sem->readers);
19539+ raw_spin_unlock_irq(&m->wait_lock);
19540+ __rt_mutex_unlock(m);
19541+
19542+ debug_rt_mutex_free_waiter(&waiter);
19543+}
19544+
19545+void __up_read(struct rw_semaphore *sem)
19546+{
19547+ struct rt_mutex *m = &sem->rtmutex;
19548+ struct task_struct *tsk;
19549+
19550+ /*
19551+ * sem->readers can only hit 0 when a writer is waiting for the
19552+ * active readers to leave the critical region.
19553+ */
19554+ if (!atomic_dec_and_test(&sem->readers))
19555+ return;
19556+
19557+ might_sleep();
19558+ raw_spin_lock_irq(&m->wait_lock);
19559+ /*
19560+ * Wake the writer, i.e. the rtmutex owner. It might release the
19561+ * rtmutex concurrently in the fast path (due to a signal), but to
19562+ * clean up the rwsem it needs to acquire m->wait_lock. The worst
19563+ * case which can happen is a spurious wakeup.
19564+ */
19565+ tsk = rt_mutex_owner(m);
19566+ if (tsk)
19567+ wake_up_process(tsk);
19568+
19569+ raw_spin_unlock_irq(&m->wait_lock);
19570+}
19571+
19572+static void __up_write_unlock(struct rw_semaphore *sem, int bias,
19573+ unsigned long flags)
19574+{
19575+ struct rt_mutex *m = &sem->rtmutex;
19576+
19577+ atomic_add(READER_BIAS - bias, &sem->readers);
19578+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19579+ __rt_mutex_unlock(m);
19580+}
19581+
19582+static int __sched __down_write_common(struct rw_semaphore *sem, int state)
19583+{
19584+ struct rt_mutex *m = &sem->rtmutex;
19585+ unsigned long flags;
19586+
19587+ /* Take the rtmutex as a first step */
19588+ if (__rt_mutex_lock_state(m, state))
19589+ return -EINTR;
19590+
19591+ /* Force readers into slow path */
19592+ atomic_sub(READER_BIAS, &sem->readers);
19593+ might_sleep();
19594+
19595+ set_current_state(state);
19596+ for (;;) {
19597+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19598+ /* Have all readers left the critical region? */
19599+ if (!atomic_read(&sem->readers)) {
19600+ atomic_set(&sem->readers, WRITER_BIAS);
19601+ __set_current_state(TASK_RUNNING);
19602+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19603+ return 0;
19604+ }
19605+
19606+ if (signal_pending_state(state, current)) {
19607+ __set_current_state(TASK_RUNNING);
19608+ __up_write_unlock(sem, 0, flags);
19609+ return -EINTR;
19610+ }
19611+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19612+
19613+ if (atomic_read(&sem->readers) != 0) {
19614+ schedule();
19615+ set_current_state(state);
19616+ }
19617+ }
19618+}
19619+
19620+void __sched __down_write(struct rw_semaphore *sem)
19621+{
19622+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
19623+}
19624+
19625+int __sched __down_write_killable(struct rw_semaphore *sem)
19626+{
19627+ return __down_write_common(sem, TASK_KILLABLE);
19628+}
19629+
19630+int __down_write_trylock(struct rw_semaphore *sem)
19631+{
19632+ struct rt_mutex *m = &sem->rtmutex;
19633+ unsigned long flags;
19634+
19635+ if (!__rt_mutex_trylock(m))
19636+ return 0;
19637+
19638+ atomic_sub(READER_BIAS, &sem->readers);
19639+
19640+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19641+ if (!atomic_read(&sem->readers)) {
19642+ atomic_set(&sem->readers, WRITER_BIAS);
19643+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19644+ return 1;
19645+ }
19646+ __up_write_unlock(sem, 0, flags);
19647+ return 0;
19648+}
19649+
19650+void __up_write(struct rw_semaphore *sem)
19651+{
19652+ struct rt_mutex *m = &sem->rtmutex;
19653+ unsigned long flags;
19654+
19655+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19656+ __up_write_unlock(sem, WRITER_BIAS, flags);
19657+}
19658+
19659+void __downgrade_write(struct rw_semaphore *sem)
19660+{
19661+ struct rt_mutex *m = &sem->rtmutex;
19662+ unsigned long flags;
19663+
19664+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19665+ /* Release it and account current as reader */
19666+ __up_write_unlock(sem, WRITER_BIAS - 1, flags);
19667+}
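
[Editor's note — illustration only, not part of the patch. The reader-bias protocol described in the comment at the top of rwsem-rt.c above can be modelled in userspace C. The rtmutex slow path is replaced by a busy-wait, the bias constants are stand-ins, and only the counter arithmetic of the fast paths shown in the diff is modelled.]

#include <stdatomic.h>
#include <stdio.h>

#define MODEL_READER_BIAS  (-(1 << 16))
#define MODEL_WRITER_BIAS  (1 << 16)

static atomic_int readers = MODEL_READER_BIAS;	/* cf. __rwsem_init() */

static int model_down_read_trylock(void)	/* cf. __down_read_trylock() */
{
	int r = atomic_load(&readers);

	while (r < 0) {				/* bias present, no writer */
		if (atomic_compare_exchange_weak(&readers, &r, r + 1))
			return 1;
	}
	return 0;				/* writer removed the bias */
}

static void model_up_read(void)			/* cf. __up_read() fast path */
{
	atomic_fetch_sub(&readers, 1);
}

static void model_down_write(void)		/* cf. __down_write_common() */
{
	atomic_fetch_sub(&readers, MODEL_READER_BIAS);	/* force readers slow */
	while (atomic_load(&readers) != 0)
		;				/* kernel: schedule() and retry */
	atomic_store(&readers, MODEL_WRITER_BIAS);	/* mark write locked */
}

static void model_up_write(void)		/* cf. __up_write_unlock() */
{
	atomic_fetch_add(&readers, MODEL_READER_BIAS - MODEL_WRITER_BIAS);
}

int main(void)
{
	if (model_down_read_trylock())
		model_up_read();
	model_down_write();
	model_up_write();
	printf("back to idle: %d\n", atomic_load(&readers) == MODEL_READER_BIAS);
	return 0;
}
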
19668diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock.c linux-4.14/kernel/locking/spinlock.c
19669--- linux-4.14.orig/kernel/locking/spinlock.c 2017-11-12 19:46:13.000000000 +0100
19670+++ linux-4.14/kernel/locking/spinlock.c 2018-09-05 11:05:07.000000000 +0200
19671@@ -125,8 +125,11 @@
19672 * __[spin|read|write]_lock_bh()
19673 */
19674 BUILD_LOCK_OPS(spin, raw_spinlock);
19675+
19676+#ifndef CONFIG_PREEMPT_RT_FULL
19677 BUILD_LOCK_OPS(read, rwlock);
19678 BUILD_LOCK_OPS(write, rwlock);
19679+#endif
19680
19681 #endif
19682
19683@@ -210,6 +213,8 @@
19684 EXPORT_SYMBOL(_raw_spin_unlock_bh);
19685 #endif
19686
19687+#ifndef CONFIG_PREEMPT_RT_FULL
19688+
19689 #ifndef CONFIG_INLINE_READ_TRYLOCK
19690 int __lockfunc _raw_read_trylock(rwlock_t *lock)
19691 {
19692@@ -354,6 +359,8 @@
19693 EXPORT_SYMBOL(_raw_write_unlock_bh);
19694 #endif
19695
19696+#endif /* !PREEMPT_RT_FULL */
19697+
19698 #ifdef CONFIG_DEBUG_LOCK_ALLOC
19699
19700 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
19701diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock_debug.c linux-4.14/kernel/locking/spinlock_debug.c
19702--- linux-4.14.orig/kernel/locking/spinlock_debug.c 2017-11-12 19:46:13.000000000 +0100
19703+++ linux-4.14/kernel/locking/spinlock_debug.c 2018-09-05 11:05:07.000000000 +0200
19704@@ -31,6 +31,7 @@
19705
19706 EXPORT_SYMBOL(__raw_spin_lock_init);
19707
19708+#ifndef CONFIG_PREEMPT_RT_FULL
19709 void __rwlock_init(rwlock_t *lock, const char *name,
19710 struct lock_class_key *key)
19711 {
19712@@ -48,6 +49,7 @@
19713 }
19714
19715 EXPORT_SYMBOL(__rwlock_init);
19716+#endif
19717
19718 static void spin_dump(raw_spinlock_t *lock, const char *msg)
19719 {
19720@@ -135,6 +137,7 @@
19721 arch_spin_unlock(&lock->raw_lock);
19722 }
19723
19724+#ifndef CONFIG_PREEMPT_RT_FULL
19725 static void rwlock_bug(rwlock_t *lock, const char *msg)
19726 {
19727 if (!debug_locks_off())
19728@@ -224,3 +227,5 @@
19729 debug_write_unlock(lock);
19730 arch_write_unlock(&lock->raw_lock);
19731 }
19732+
19733+#endif
19734diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/panic.c linux-4.14/kernel/panic.c
19735--- linux-4.14.orig/kernel/panic.c 2017-11-12 19:46:13.000000000 +0100
19736+++ linux-4.14/kernel/panic.c 2018-09-05 11:05:07.000000000 +0200
19737@@ -482,9 +482,11 @@
19738
19739 static int init_oops_id(void)
19740 {
19741+#ifndef CONFIG_PREEMPT_RT_FULL
19742 if (!oops_id)
19743 get_random_bytes(&oops_id, sizeof(oops_id));
19744 else
19745+#endif
19746 oops_id++;
19747
19748 return 0;
19749diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/hibernate.c linux-4.14/kernel/power/hibernate.c
19750--- linux-4.14.orig/kernel/power/hibernate.c 2017-11-12 19:46:13.000000000 +0100
19751+++ linux-4.14/kernel/power/hibernate.c 2018-09-05 11:05:07.000000000 +0200
19752@@ -287,6 +287,8 @@
19753
19754 local_irq_disable();
19755
19756+ system_state = SYSTEM_SUSPEND;
19757+
19758 error = syscore_suspend();
19759 if (error) {
19760 pr_err("Some system devices failed to power down, aborting hibernation\n");
19761@@ -317,6 +319,7 @@
19762 syscore_resume();
19763
19764 Enable_irqs:
19765+ system_state = SYSTEM_RUNNING;
19766 local_irq_enable();
19767
19768 Enable_cpus:
19769@@ -445,6 +448,7 @@
19770 goto Enable_cpus;
19771
19772 local_irq_disable();
19773+ system_state = SYSTEM_SUSPEND;
19774
19775 error = syscore_suspend();
19776 if (error)
19777@@ -478,6 +482,7 @@
19778 syscore_resume();
19779
19780 Enable_irqs:
19781+ system_state = SYSTEM_RUNNING;
19782 local_irq_enable();
19783
19784 Enable_cpus:
19785@@ -563,6 +568,7 @@
19786 goto Enable_cpus;
19787
19788 local_irq_disable();
19789+ system_state = SYSTEM_SUSPEND;
19790 syscore_suspend();
19791 if (pm_wakeup_pending()) {
19792 error = -EAGAIN;
19793@@ -575,6 +581,7 @@
19794
19795 Power_up:
19796 syscore_resume();
19797+ system_state = SYSTEM_RUNNING;
19798 local_irq_enable();
19799
19800 Enable_cpus:
19801@@ -672,6 +679,10 @@
19802 return error;
19803 }
19804
19805+#ifndef CONFIG_SUSPEND
19806+bool pm_in_action;
19807+#endif
19808+
19809 /**
19810 * hibernate - Carry out system hibernation, including saving the image.
19811 */
19812@@ -685,6 +696,8 @@
19813 return -EPERM;
19814 }
19815
19816+ pm_in_action = true;
19817+
19818 lock_system_sleep();
19819 /* The snapshot device should not be opened while we're running */
19820 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
19821@@ -763,6 +776,7 @@
19822 atomic_inc(&snapshot_device_available);
19823 Unlock:
19824 unlock_system_sleep();
19825+ pm_in_action = false;
19826 pr_info("hibernation exit\n");
19827
19828 return error;
19829diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/suspend.c linux-4.14/kernel/power/suspend.c
19830--- linux-4.14.orig/kernel/power/suspend.c 2018-09-05 11:03:22.000000000 +0200
19831+++ linux-4.14/kernel/power/suspend.c 2018-09-05 11:05:07.000000000 +0200
19832@@ -428,6 +428,8 @@
19833 arch_suspend_disable_irqs();
19834 BUG_ON(!irqs_disabled());
19835
19836+ system_state = SYSTEM_SUSPEND;
19837+
19838 error = syscore_suspend();
19839 if (!error) {
19840 *wakeup = pm_wakeup_pending();
19841@@ -443,6 +445,8 @@
19842 syscore_resume();
19843 }
19844
19845+ system_state = SYSTEM_RUNNING;
19846+
19847 arch_suspend_enable_irqs();
19848 BUG_ON(irqs_disabled());
19849
19850@@ -589,6 +593,8 @@
19851 return error;
19852 }
19853
19854+bool pm_in_action;
19855+
19856 /**
19857 * pm_suspend - Externally visible function for suspending the system.
19858 * @state: System sleep state to enter.
19859@@ -603,6 +609,7 @@
19860 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
19861 return -EINVAL;
19862
19863+ pm_in_action = true;
19864 pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
19865 error = enter_state(state);
19866 if (error) {
19867@@ -612,6 +619,7 @@
19868 suspend_stats.success++;
19869 }
19870 pr_info("suspend exit\n");
19871+ pm_in_action = false;
19872 return error;
19873 }
19874 EXPORT_SYMBOL(pm_suspend);
19875diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/printk/printk.c linux-4.14/kernel/printk/printk.c
19876--- linux-4.14.orig/kernel/printk/printk.c 2017-11-12 19:46:13.000000000 +0100
19877+++ linux-4.14/kernel/printk/printk.c 2018-09-05 11:05:07.000000000 +0200
19878@@ -400,6 +400,65 @@
19879 printk_safe_exit_irqrestore(flags); \
19880 } while (0)
19881
19882+#ifdef CONFIG_EARLY_PRINTK
19883+struct console *early_console;
19884+
19885+static void early_vprintk(const char *fmt, va_list ap)
19886+{
19887+ if (early_console) {
19888+ char buf[512];
19889+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
19890+
19891+ early_console->write(early_console, buf, n);
19892+ }
19893+}
19894+
19895+asmlinkage void early_printk(const char *fmt, ...)
19896+{
19897+ va_list ap;
19898+
19899+ va_start(ap, fmt);
19900+ early_vprintk(fmt, ap);
19901+ va_end(ap);
19902+}
19903+
19904+/*
19905+ * This is independent of any log levels - a global
19906+ * kill switch that turns off all of printk.
19907+ *
19908+ * Used by the NMI watchdog if early-printk is enabled.
19909+ */
19910+static bool __read_mostly printk_killswitch;
19911+
19912+static int __init force_early_printk_setup(char *str)
19913+{
19914+ printk_killswitch = true;
19915+ return 0;
19916+}
19917+early_param("force_early_printk", force_early_printk_setup);
19918+
19919+void printk_kill(void)
19920+{
19921+ printk_killswitch = true;
19922+}
19923+
19924+#ifdef CONFIG_PRINTK
19925+static int forced_early_printk(const char *fmt, va_list ap)
19926+{
19927+ if (!printk_killswitch)
19928+ return 0;
19929+ early_vprintk(fmt, ap);
19930+ return 1;
19931+}
19932+#endif
19933+
19934+#else
19935+static inline int forced_early_printk(const char *fmt, va_list ap)
19936+{
19937+ return 0;
19938+}
19939+#endif
19940+
19941 #ifdef CONFIG_PRINTK
19942 DECLARE_WAIT_QUEUE_HEAD(log_wait);
19943 /* the next printk record to read by syslog(READ) or /proc/kmsg */
19944@@ -1348,6 +1407,8 @@
19945 {
19946 char *text;
19947 int len = 0;
19948+ int attempts = 0;
19949+ int num_msg;
19950
19951 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
19952 if (!text)
19953@@ -1359,6 +1420,14 @@
19954 u64 seq;
19955 u32 idx;
19956
19957+try_again:
19958+ attempts++;
19959+ if (attempts > 10) {
19960+ len = -EBUSY;
19961+ goto out;
19962+ }
19963+ num_msg = 0;
19964+
19965 /*
19966 * Find first record that fits, including all following records,
19967 * into the user-provided buffer for this dump.
19968@@ -1371,6 +1440,14 @@
19969 len += msg_print_text(msg, true, NULL, 0);
19970 idx = log_next(idx);
19971 seq++;
19972+ num_msg++;
19973+ if (num_msg > 5) {
19974+ num_msg = 0;
19975+ logbuf_unlock_irq();
19976+ logbuf_lock_irq();
19977+ if (clear_seq < log_first_seq)
19978+ goto try_again;
19979+ }
19980 }
19981
19982 /* move first record forward until length fits into the buffer */
19983@@ -1382,6 +1459,14 @@
19984 len -= msg_print_text(msg, true, NULL, 0);
19985 idx = log_next(idx);
19986 seq++;
19987+ num_msg++;
19988+ if (num_msg > 5) {
19989+ num_msg = 0;
19990+ logbuf_unlock_irq();
19991+ logbuf_lock_irq();
19992+ if (clear_seq < log_first_seq)
19993+ goto try_again;
19994+ }
19995 }
19996
19997 /* last message fitting into this dump */
19998@@ -1420,6 +1505,7 @@
19999 clear_seq = log_next_seq;
20000 clear_idx = log_next_idx;
20001 }
20002+out:
20003 logbuf_unlock_irq();
20004
20005 kfree(text);
20006@@ -1558,6 +1644,12 @@
20007 if (!console_drivers)
20008 return;
20009
20010+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20011+ if (in_irq() || in_nmi())
20012+ return;
20013+ }
20014+
20015+ migrate_disable();
20016 for_each_console(con) {
20017 if (exclusive_console && con != exclusive_console)
20018 continue;
20019@@ -1573,6 +1665,7 @@
20020 else
20021 con->write(con, text, len);
20022 }
20023+ migrate_enable();
20024 }
20025
20026 int printk_delay_msec __read_mostly;
20027@@ -1692,6 +1785,13 @@
20028 int printed_len;
20029 bool in_sched = false;
20030
20031+ /*
20032+ * Fall back to early_printk if a debugging subsystem has
20033+ * killed printk output
20034+ */
20035+ if (unlikely(forced_early_printk(fmt, args)))
20036+ return 1;
20037+
20038 if (level == LOGLEVEL_SCHED) {
20039 level = LOGLEVEL_DEFAULT;
20040 in_sched = true;
20041@@ -1748,12 +1848,22 @@
20042
20043 /* If called from the scheduler, we can not call up(). */
20044 if (!in_sched) {
20045+ int may_trylock = 1;
20046+
20047+#ifdef CONFIG_PREEMPT_RT_FULL
20048+ /*
20049+ * we can't take a sleeping lock with IRQs or preemption disabled
20050+ * so we can't print in these contexts
20051+ */
20052+ if (!(preempt_count() == 0 && !irqs_disabled()))
20053+ may_trylock = 0;
20054+#endif
20055 /*
20056 * Try to acquire and then immediately release the console
20057 * semaphore. The release will print out buffers and wake up
20058 * /dev/kmsg and syslog() users.
20059 */
20060- if (console_trylock())
20061+ if (may_trylock && console_trylock())
20062 console_unlock();
20063 }
20064
20065@@ -1863,26 +1973,6 @@
20066
20067 #endif /* CONFIG_PRINTK */
20068
20069-#ifdef CONFIG_EARLY_PRINTK
20070-struct console *early_console;
20071-
20072-asmlinkage __visible void early_printk(const char *fmt, ...)
20073-{
20074- va_list ap;
20075- char buf[512];
20076- int n;
20077-
20078- if (!early_console)
20079- return;
20080-
20081- va_start(ap, fmt);
20082- n = vscnprintf(buf, sizeof(buf), fmt, ap);
20083- va_end(ap);
20084-
20085- early_console->write(early_console, buf, n);
20086-}
20087-#endif
20088-
20089 static int __add_preferred_console(char *name, int idx, char *options,
20090 char *brl_options)
20091 {
20092@@ -2229,10 +2319,15 @@
20093 console_seq++;
20094 raw_spin_unlock(&logbuf_lock);
20095
20096+#ifdef CONFIG_PREEMPT_RT_FULL
20097+ printk_safe_exit_irqrestore(flags);
20098+ call_console_drivers(ext_text, ext_len, text, len);
20099+#else
20100 stop_critical_timings(); /* don't trace print latency */
20101 call_console_drivers(ext_text, ext_len, text, len);
20102 start_critical_timings();
20103 printk_safe_exit_irqrestore(flags);
20104+#endif
20105
20106 if (do_cond_resched)
20107 cond_resched();
20108@@ -2286,6 +2381,11 @@
20109 {
20110 struct console *c;
20111
20112+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20113+ if (in_irq() || in_nmi())
20114+ return;
20115+ }
20116+
20117 /*
20118 * console_unblank can no longer be called in interrupt context unless
20119 * oops_in_progress is set to 1..
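
[Editor's note — illustration only, not part of the patch. The kill-switch logic added to printk.c above (printk_killswitch, forced_early_printk()) amounts to: once a debugging subsystem flips the switch, every printk is redirected to the raw early console and the normal buffered path is bypassed. A minimal userspace model, with stderr standing in for the early console:]

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static bool printk_killswitch;		/* set by printk_kill() in the patch */

static void early_vprintk(const char *fmt, va_list ap)
{
	vfprintf(stderr, fmt, ap);	/* kernel: early_console->write() */
}

static int forced_early_printk(const char *fmt, va_list ap)
{
	if (!printk_killswitch)
		return 0;
	early_vprintk(fmt, ap);
	return 1;			/* consumed, skip the normal path */
}

static int model_printk(const char *fmt, ...)
{
	va_list ap;
	int handled;

	va_start(ap, fmt);
	handled = forced_early_printk(fmt, ap);
	va_end(ap);
	if (handled)
		return 1;
	/* normal path: log buffer, console semaphore, wakeups ... */
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
	return 1;
}

int main(void)
{
	model_printk("normal path\n");
	printk_killswitch = true;	/* cf. printk_kill() */
	model_printk("early console path\n");
	return 0;
}
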
20120diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ptrace.c linux-4.14/kernel/ptrace.c
20121--- linux-4.14.orig/kernel/ptrace.c 2017-11-12 19:46:13.000000000 +0100
20122+++ linux-4.14/kernel/ptrace.c 2018-09-05 11:05:07.000000000 +0200
20123@@ -175,7 +175,14 @@
20124
20125 spin_lock_irq(&task->sighand->siglock);
20126 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20127- task->state = __TASK_TRACED;
20128+ unsigned long flags;
20129+
20130+ raw_spin_lock_irqsave(&task->pi_lock, flags);
20131+ if (task->state & __TASK_TRACED)
20132+ task->state = __TASK_TRACED;
20133+ else
20134+ task->saved_state = __TASK_TRACED;
20135+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20136 ret = true;
20137 }
20138 spin_unlock_irq(&task->sighand->siglock);
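
[Editor's note — illustration only, not part of the patch. The ptrace.c hunk above writes __TASK_TRACED into either ->state or ->saved_state because, on PREEMPT_RT, a task sleeping on a converted spinlock parks its original state in ->saved_state; the traced state therefore has to land in whichever field currently holds it. A hedged userspace model of that decision, with plain ints standing in for the kernel fields and the flag values as found in the 4.14 headers:]

#include <stdio.h>

#define TASK_UNINTERRUPTIBLE	0x0002
#define __TASK_TRACED		0x0008

struct model_task {
	int state;		/* what the scheduler currently sees */
	int saved_state;	/* RT: real state while blocked on an rt lock */
};

static void model_set_traced(struct model_task *t)
{
	/* kernel: done under ->pi_lock */
	if (t->state & __TASK_TRACED)
		t->state = __TASK_TRACED;
	else
		t->saved_state = __TASK_TRACED;
}

int main(void)
{
	struct model_task a = { .state = __TASK_TRACED, .saved_state = 0 };
	struct model_task b = { .state = TASK_UNINTERRUPTIBLE,
				.saved_state = __TASK_TRACED };

	model_set_traced(&a);	/* normal case: ->state updated directly */
	model_set_traced(&b);	/* blocked on an rt lock: ->saved_state used */
	printf("a.state=%#x b.saved_state=%#x\n", a.state, b.saved_state);
	return 0;
}
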
20139diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/Kconfig linux-4.14/kernel/rcu/Kconfig
20140--- linux-4.14.orig/kernel/rcu/Kconfig 2017-11-12 19:46:13.000000000 +0100
20141+++ linux-4.14/kernel/rcu/Kconfig 2018-09-05 11:05:07.000000000 +0200
20142@@ -36,7 +36,7 @@
20143
20144 config RCU_EXPERT
20145 bool "Make expert-level adjustments to RCU configuration"
20146- default n
20147+ default y if PREEMPT_RT_FULL
20148 help
20149 This option needs to be enabled if you wish to make
20150 expert-level adjustments to RCU configuration. By default,
20151@@ -172,7 +172,7 @@
20152
20153 config RCU_FAST_NO_HZ
20154 bool "Accelerate last non-dyntick-idle CPU's grace periods"
20155- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20156+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20157 default n
20158 help
20159 This option permits CPUs to enter dynticks-idle state even if
20160@@ -191,7 +191,7 @@
20161 config RCU_BOOST
20162 bool "Enable RCU priority boosting"
20163 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20164- default n
20165+ default y if PREEMPT_RT_FULL
20166 help
20167 This option boosts the priority of preempted RCU readers that
20168 block the current preemptible RCU grace period for too long.
20169diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu.h linux-4.14/kernel/rcu/rcu.h
20170--- linux-4.14.orig/kernel/rcu/rcu.h 2017-11-12 19:46:13.000000000 +0100
20171+++ linux-4.14/kernel/rcu/rcu.h 2018-09-05 11:05:07.000000000 +0200
20172@@ -462,18 +462,26 @@
20173 extern unsigned long rcutorture_testseq;
20174 extern unsigned long rcutorture_vernum;
20175 unsigned long rcu_batches_started(void);
20176-unsigned long rcu_batches_started_bh(void);
20177 unsigned long rcu_batches_started_sched(void);
20178 unsigned long rcu_batches_completed(void);
20179-unsigned long rcu_batches_completed_bh(void);
20180 unsigned long rcu_batches_completed_sched(void);
20181 unsigned long rcu_exp_batches_completed(void);
20182 unsigned long rcu_exp_batches_completed_sched(void);
20183 unsigned long srcu_batches_completed(struct srcu_struct *sp);
20184 void show_rcu_gp_kthreads(void);
20185 void rcu_force_quiescent_state(void);
20186-void rcu_bh_force_quiescent_state(void);
20187 void rcu_sched_force_quiescent_state(void);
20188+
20189+#ifndef CONFIG_PREEMPT_RT_FULL
20190+void rcu_bh_force_quiescent_state(void);
20191+unsigned long rcu_batches_started_bh(void);
20192+unsigned long rcu_batches_completed_bh(void);
20193+#else
20194+# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
20195+# define rcu_batches_completed_bh rcu_batches_completed
20196+# define rcu_batches_started_bh rcu_batches_completed
20197+#endif
20198+
20199 #endif /* #else #ifdef CONFIG_TINY_RCU */
20200
20201 #ifdef CONFIG_RCU_NOCB_CPU
20202diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu_segcblist.c linux-4.14/kernel/rcu/rcu_segcblist.c
20203--- linux-4.14.orig/kernel/rcu/rcu_segcblist.c 2017-11-12 19:46:13.000000000 +0100
20204+++ linux-4.14/kernel/rcu/rcu_segcblist.c 2018-09-05 11:05:07.000000000 +0200
20205@@ -23,6 +23,7 @@
20206 #include <linux/types.h>
20207 #include <linux/kernel.h>
20208 #include <linux/interrupt.h>
20209+#include <linux/rcupdate.h>
20210
20211 #include "rcu_segcblist.h"
20212
20213diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcutorture.c linux-4.14/kernel/rcu/rcutorture.c
20214--- linux-4.14.orig/kernel/rcu/rcutorture.c 2017-11-12 19:46:13.000000000 +0100
20215+++ linux-4.14/kernel/rcu/rcutorture.c 2018-09-05 11:05:07.000000000 +0200
20216@@ -417,6 +417,7 @@
20217 .name = "rcu"
20218 };
20219
20220+#ifndef CONFIG_PREEMPT_RT_FULL
20221 /*
20222 * Definitions for rcu_bh torture testing.
20223 */
20224@@ -456,6 +457,12 @@
20225 .name = "rcu_bh"
20226 };
20227
20228+#else
20229+static struct rcu_torture_ops rcu_bh_ops = {
20230+ .ttype = INVALID_RCU_FLAVOR,
20231+};
20232+#endif
20233+
20234 /*
20235 * Don't even think about trying any of these in real life!!!
20236 * The names includes "busted", and they really means it!
20237diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/srcutree.c linux-4.14/kernel/rcu/srcutree.c
20238--- linux-4.14.orig/kernel/rcu/srcutree.c 2017-11-12 19:46:13.000000000 +0100
20239+++ linux-4.14/kernel/rcu/srcutree.c 2018-09-05 11:05:07.000000000 +0200
20240@@ -36,6 +36,8 @@
20241 #include <linux/delay.h>
20242 #include <linux/module.h>
20243 #include <linux/srcu.h>
20244+#include <linux/cpu.h>
20245+#include <linux/locallock.h>
20246
20247 #include "rcu.h"
20248 #include "rcu_segcblist.h"
20249@@ -53,6 +55,33 @@
20250 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20251 static void process_srcu(struct work_struct *work);
20252
20253+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20254+#define spin_lock_rcu_node(p) \
20255+do { \
20256+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
20257+ smp_mb__after_unlock_lock(); \
20258+} while (0)
20259+
20260+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20261+
20262+#define spin_lock_irq_rcu_node(p) \
20263+do { \
20264+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
20265+ smp_mb__after_unlock_lock(); \
20266+} while (0)
20267+
20268+#define spin_unlock_irq_rcu_node(p) \
20269+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20270+
20271+#define spin_lock_irqsave_rcu_node(p, flags) \
20272+do { \
20273+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
20274+ smp_mb__after_unlock_lock(); \
20275+} while (0)
20276+
20277+#define spin_unlock_irqrestore_rcu_node(p, flags) \
20278+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20279+
20280 /*
20281 * Initialize SRCU combining tree. Note that statically allocated
20282 * srcu_struct structures might already have srcu_read_lock() and
20283@@ -77,7 +106,7 @@
20284
20285 /* Each pass through this loop initializes one srcu_node structure. */
20286 rcu_for_each_node_breadth_first(sp, snp) {
20287- raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20288+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20289 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20290 ARRAY_SIZE(snp->srcu_data_have_cbs));
20291 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
20292@@ -111,7 +140,7 @@
20293 snp_first = sp->level[level];
20294 for_each_possible_cpu(cpu) {
20295 sdp = per_cpu_ptr(sp->sda, cpu);
20296- raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20297+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20298 rcu_segcblist_init(&sdp->srcu_cblist);
20299 sdp->srcu_cblist_invoking = false;
20300 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
20301@@ -170,7 +199,7 @@
20302 /* Don't re-initialize a lock while it is held. */
20303 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20304 lockdep_init_map(&sp->dep_map, name, key, 0);
20305- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20306+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20307 return init_srcu_struct_fields(sp, false);
20308 }
20309 EXPORT_SYMBOL_GPL(__init_srcu_struct);
20310@@ -187,7 +216,7 @@
20311 */
20312 int init_srcu_struct(struct srcu_struct *sp)
20313 {
20314- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20315+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20316 return init_srcu_struct_fields(sp, false);
20317 }
20318 EXPORT_SYMBOL_GPL(init_srcu_struct);
20319@@ -210,13 +239,13 @@
20320 /* The smp_load_acquire() pairs with the smp_store_release(). */
20321 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20322 return; /* Already initialized. */
20323- raw_spin_lock_irqsave_rcu_node(sp, flags);
20324+ spin_lock_irqsave_rcu_node(sp, flags);
20325 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20326- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20327+ spin_unlock_irqrestore_rcu_node(sp, flags);
20328 return;
20329 }
20330 init_srcu_struct_fields(sp, true);
20331- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20332+ spin_unlock_irqrestore_rcu_node(sp, flags);
20333 }
20334
20335 /*
20336@@ -425,21 +454,6 @@
20337 }
20338
20339 /*
20340- * Track online CPUs to guide callback workqueue placement.
20341- */
20342-DEFINE_PER_CPU(bool, srcu_online);
20343-
20344-void srcu_online_cpu(unsigned int cpu)
20345-{
20346- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
20347-}
20348-
20349-void srcu_offline_cpu(unsigned int cpu)
20350-{
20351- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
20352-}
20353-
20354-/*
20355 * Place the workqueue handler on the specified CPU if online, otherwise
20356 * just run it whereever. This is useful for placing workqueue handlers
20357 * that are to invoke the specified CPU's callbacks.
20358@@ -450,12 +464,12 @@
20359 {
20360 bool ret;
20361
20362- preempt_disable();
20363- if (READ_ONCE(per_cpu(srcu_online, cpu)))
20364+ cpus_read_lock();
20365+ if (cpu_online(cpu))
20366 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
20367 else
20368 ret = queue_delayed_work(wq, dwork, delay);
20369- preempt_enable();
20370+ cpus_read_unlock();
20371 return ret;
1a6e0f06
JK
20372 }
20373
e4b2b4a8
JK
20374@@ -513,7 +527,7 @@
20375 mutex_lock(&sp->srcu_cb_mutex);
20376
20377 /* End the current grace period. */
20378- raw_spin_lock_irq_rcu_node(sp);
20379+ spin_lock_irq_rcu_node(sp);
20380 idx = rcu_seq_state(sp->srcu_gp_seq);
20381 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
20382 cbdelay = srcu_get_delay(sp);
20383@@ -522,7 +536,7 @@
20384 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20385 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
20386 sp->srcu_gp_seq_needed_exp = gpseq;
20387- raw_spin_unlock_irq_rcu_node(sp);
20388+ spin_unlock_irq_rcu_node(sp);
20389 mutex_unlock(&sp->srcu_gp_mutex);
20390 /* A new grace period can start at this point. But only one. */
20391
20392@@ -530,7 +544,7 @@
20393 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
20394 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
20395 rcu_for_each_node_breadth_first(sp, snp) {
20396- raw_spin_lock_irq_rcu_node(snp);
20397+ spin_lock_irq_rcu_node(snp);
20398 cbs = false;
20399 if (snp >= sp->level[rcu_num_lvls - 1])
20400 cbs = snp->srcu_have_cbs[idx] == gpseq;
20401@@ -540,7 +554,7 @@
20402 snp->srcu_gp_seq_needed_exp = gpseq;
20403 mask = snp->srcu_data_have_cbs[idx];
20404 snp->srcu_data_have_cbs[idx] = 0;
20405- raw_spin_unlock_irq_rcu_node(snp);
20406+ spin_unlock_irq_rcu_node(snp);
20407 if (cbs)
20408 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
20409
20410@@ -548,11 +562,11 @@
20411 if (!(gpseq & counter_wrap_check))
20412 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
20413 sdp = per_cpu_ptr(sp->sda, cpu);
20414- raw_spin_lock_irqsave_rcu_node(sdp, flags);
20415+ spin_lock_irqsave_rcu_node(sdp, flags);
20416 if (ULONG_CMP_GE(gpseq,
20417 sdp->srcu_gp_seq_needed + 100))
20418 sdp->srcu_gp_seq_needed = gpseq;
20419- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20420+ spin_unlock_irqrestore_rcu_node(sdp, flags);
20421 }
20422 }
20423
20424@@ -560,17 +574,17 @@
20425 mutex_unlock(&sp->srcu_cb_mutex);
20426
20427 /* Start a new grace period if needed. */
20428- raw_spin_lock_irq_rcu_node(sp);
20429+ spin_lock_irq_rcu_node(sp);
20430 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20431 if (!rcu_seq_state(gpseq) &&
20432 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
20433 srcu_gp_start(sp);
20434- raw_spin_unlock_irq_rcu_node(sp);
20435+ spin_unlock_irq_rcu_node(sp);
20436 /* Throttle expedited grace periods: Should be rare! */
20437 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
20438 ? 0 : SRCU_INTERVAL);
20439 } else {
20440- raw_spin_unlock_irq_rcu_node(sp);
20441+ spin_unlock_irq_rcu_node(sp);
20442 }
20443 }
20444
20445@@ -590,18 +604,18 @@
20446 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
20447 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
20448 return;
20449- raw_spin_lock_irqsave_rcu_node(snp, flags);
20450+ spin_lock_irqsave_rcu_node(snp, flags);
20451 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
20452- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20453+ spin_unlock_irqrestore_rcu_node(snp, flags);
20454 return;
20455 }
20456 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
20457- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20458+ spin_unlock_irqrestore_rcu_node(snp, flags);
20459 }
20460- raw_spin_lock_irqsave_rcu_node(sp, flags);
20461+ spin_lock_irqsave_rcu_node(sp, flags);
20462 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
20463 sp->srcu_gp_seq_needed_exp = s;
20464- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20465+ spin_unlock_irqrestore_rcu_node(sp, flags);
20466 }
20467
20468 /*
20469@@ -623,12 +637,12 @@
20470 for (; snp != NULL; snp = snp->srcu_parent) {
20471 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
20472 return; /* GP already done and CBs recorded. */
20473- raw_spin_lock_irqsave_rcu_node(snp, flags);
20474+ spin_lock_irqsave_rcu_node(snp, flags);
20475 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
20476 snp_seq = snp->srcu_have_cbs[idx];
20477 if (snp == sdp->mynode && snp_seq == s)
20478 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20479- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20480+ spin_unlock_irqrestore_rcu_node(snp, flags);
20481 if (snp == sdp->mynode && snp_seq != s) {
20482 srcu_schedule_cbs_sdp(sdp, do_norm
20483 ? SRCU_INTERVAL
20484@@ -644,11 +658,11 @@
20485 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20486 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
20487 snp->srcu_gp_seq_needed_exp = s;
20488- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20489+ spin_unlock_irqrestore_rcu_node(snp, flags);
20490 }
20491
20492 /* Top of tree, must ensure the grace period will be started. */
20493- raw_spin_lock_irqsave_rcu_node(sp, flags);
20494+ spin_lock_irqsave_rcu_node(sp, flags);
20495 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
20496 /*
20497 * Record need for grace period s. Pair with load
20498@@ -667,7 +681,7 @@
20499 queue_delayed_work(system_power_efficient_wq, &sp->work,
20500 srcu_get_delay(sp));
20501 }
20502- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20503+ spin_unlock_irqrestore_rcu_node(sp, flags);
20504 }
20505
20506 /*
20507@@ -736,6 +750,8 @@
20508 * negligible when amoritized over that time period, and the extra latency
20509 * of a needlessly non-expedited grace period is similarly negligible.
20510 */
20511+static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
20512+
20513 static bool srcu_might_be_idle(struct srcu_struct *sp)
20514 {
20515 unsigned long curseq;
20516@@ -744,13 +760,13 @@
20517 unsigned long t;
20518
20519 /* If the local srcu_data structure has callbacks, not idle. */
20520- local_irq_save(flags);
20521+ local_lock_irqsave(sp_llock, flags);
20522 sdp = this_cpu_ptr(sp->sda);
20523 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
20524- local_irq_restore(flags);
20525+ local_unlock_irqrestore(sp_llock, flags);
20526 return false; /* Callbacks already present, so not idle. */
20527 }
20528- local_irq_restore(flags);
20529+ local_unlock_irqrestore(sp_llock, flags);
20530
20531 /*
20532 * No local callbacks, so probabalistically probe global state.
20533@@ -828,9 +844,9 @@
20534 return;
20535 }
20536 rhp->func = func;
20537- local_irq_save(flags);
20538+ local_lock_irqsave(sp_llock, flags);
20539 sdp = this_cpu_ptr(sp->sda);
20540- raw_spin_lock_rcu_node(sdp);
20541+ spin_lock_rcu_node(sdp);
20542 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
20543 rcu_segcblist_advance(&sdp->srcu_cblist,
20544 rcu_seq_current(&sp->srcu_gp_seq));
20545@@ -844,7 +860,8 @@
20546 sdp->srcu_gp_seq_needed_exp = s;
20547 needexp = true;
20548 }
20549- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20550+ spin_unlock_rcu_node(sdp);
20551+ local_unlock_irqrestore(sp_llock, flags);
20552 if (needgp)
20553 srcu_funnel_gp_start(sp, sdp, s, do_norm);
20554 else if (needexp)
20555@@ -900,7 +917,7 @@
20556
20557 /*
20558 * Make sure that later code is ordered after the SRCU grace
20559- * period. This pairs with the raw_spin_lock_irq_rcu_node()
20560+ * period. This pairs with the spin_lock_irq_rcu_node()
20561 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
20562 * because the current CPU might have been totally uninvolved with
20563 * (and thus unordered against) that grace period.
20564@@ -1024,7 +1041,7 @@
20565 */
20566 for_each_possible_cpu(cpu) {
20567 sdp = per_cpu_ptr(sp->sda, cpu);
20568- raw_spin_lock_irq_rcu_node(sdp);
20569+ spin_lock_irq_rcu_node(sdp);
20570 atomic_inc(&sp->srcu_barrier_cpu_cnt);
20571 sdp->srcu_barrier_head.func = srcu_barrier_cb;
20572 debug_rcu_head_queue(&sdp->srcu_barrier_head);
20573@@ -1033,7 +1050,7 @@
20574 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
20575 atomic_dec(&sp->srcu_barrier_cpu_cnt);
20576 }
20577- raw_spin_unlock_irq_rcu_node(sdp);
20578+ spin_unlock_irq_rcu_node(sdp);
20579 }
20580
20581 /* Remove the initial count, at which point reaching zero can happen. */
20582@@ -1082,17 +1099,17 @@
20583 */
20584 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
20585 if (idx == SRCU_STATE_IDLE) {
20586- raw_spin_lock_irq_rcu_node(sp);
20587+ spin_lock_irq_rcu_node(sp);
20588 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20589 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
20590- raw_spin_unlock_irq_rcu_node(sp);
20591+ spin_unlock_irq_rcu_node(sp);
20592 mutex_unlock(&sp->srcu_gp_mutex);
20593 return;
20594 }
20595 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
20596 if (idx == SRCU_STATE_IDLE)
20597 srcu_gp_start(sp);
20598- raw_spin_unlock_irq_rcu_node(sp);
20599+ spin_unlock_irq_rcu_node(sp);
20600 if (idx != SRCU_STATE_IDLE) {
20601 mutex_unlock(&sp->srcu_gp_mutex);
20602 return; /* Someone else started the grace period. */
20603@@ -1141,19 +1158,19 @@
20604 sdp = container_of(work, struct srcu_data, work.work);
20605 sp = sdp->sp;
20606 rcu_cblist_init(&ready_cbs);
20607- raw_spin_lock_irq_rcu_node(sdp);
20608+ spin_lock_irq_rcu_node(sdp);
20609 rcu_segcblist_advance(&sdp->srcu_cblist,
20610 rcu_seq_current(&sp->srcu_gp_seq));
20611 if (sdp->srcu_cblist_invoking ||
20612 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
20613- raw_spin_unlock_irq_rcu_node(sdp);
20614+ spin_unlock_irq_rcu_node(sdp);
20615 return; /* Someone else on the job or nothing to do. */
20616 }
20617
20618 /* We are on the job! Extract and invoke ready callbacks. */
20619 sdp->srcu_cblist_invoking = true;
20620 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
20621- raw_spin_unlock_irq_rcu_node(sdp);
20622+ spin_unlock_irq_rcu_node(sdp);
20623 rhp = rcu_cblist_dequeue(&ready_cbs);
20624 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
20625 debug_rcu_head_unqueue(rhp);
20626@@ -1166,13 +1183,13 @@
20627 * Update counts, accelerate new callbacks, and if needed,
20628 * schedule another round of callback invocation.
20629 */
20630- raw_spin_lock_irq_rcu_node(sdp);
20631+ spin_lock_irq_rcu_node(sdp);
20632 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
20633 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
20634 rcu_seq_snap(&sp->srcu_gp_seq));
20635 sdp->srcu_cblist_invoking = false;
20636 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
20637- raw_spin_unlock_irq_rcu_node(sdp);
20638+ spin_unlock_irq_rcu_node(sdp);
20639 if (more)
20640 srcu_schedule_cbs_sdp(sdp, 0);
20641 }
20642@@ -1185,7 +1202,7 @@
20643 {
20644 bool pushgp = true;
20645
20646- raw_spin_lock_irq_rcu_node(sp);
20647+ spin_lock_irq_rcu_node(sp);
20648 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20649 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
20650 /* All requests fulfilled, time to go idle. */
20651@@ -1195,7 +1212,7 @@
20652 /* Outstanding request and no GP. Start one. */
20653 srcu_gp_start(sp);
20654 }
20655- raw_spin_unlock_irq_rcu_node(sp);
20656+ spin_unlock_irq_rcu_node(sp);
20657
20658 if (pushgp)
20659 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
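
[Editor's note — illustration only, not part of the patch. The srcutree.c changes above replace local_irq_save()/local_irq_restore() around the per-CPU srcu_data accesses with local_lock_irqsave(sp_llock, ...)/local_unlock_irqrestore(): the per-CPU state is then protected by a lock that may sleep on PREEMPT_RT instead of by disabling interrupts. A hedged userspace stand-in for that shape, with a pthread mutex playing the role of the local lock:]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sp_llock_model = PTHREAD_MUTEX_INITIALIZER;
static int srcu_pending_cbs;		/* stand-in for the per-CPU sdp state */

static void model_enqueue_cb(void)
{
	/* was: local_irq_save(flags);  now: local_lock_irqsave(sp_llock, flags) */
	pthread_mutex_lock(&sp_llock_model);
	srcu_pending_cbs++;		/* touch the "per-CPU" data */
	pthread_mutex_unlock(&sp_llock_model);
	/* was: local_irq_restore(flags);  now: local_unlock_irqrestore(...) */
}

int main(void)
{
	model_enqueue_cb();
	printf("pending callbacks: %d\n", srcu_pending_cbs);
	return 0;
}
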
20660diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.c linux-4.14/kernel/rcu/tree.c
20661--- linux-4.14.orig/kernel/rcu/tree.c 2017-11-12 19:46:13.000000000 +0100
20662+++ linux-4.14/kernel/rcu/tree.c 2018-09-05 11:05:07.000000000 +0200
20663@@ -58,6 +58,11 @@
20664 #include <linux/trace_events.h>
20665 #include <linux/suspend.h>
20666 #include <linux/ftrace.h>
20667+#include <linux/delay.h>
20668+#include <linux/gfp.h>
20669+#include <linux/oom.h>
20670+#include <linux/smpboot.h>
20671+#include "../time/tick-internal.h"
20672
20673 #include "tree.h"
20674 #include "rcu.h"
20675@@ -243,6 +248,19 @@
20676 this_cpu_ptr(&rcu_sched_data), true);
20677 }
20678
20679+#ifdef CONFIG_PREEMPT_RT_FULL
20680+static void rcu_preempt_qs(void);
20681+
20682+void rcu_bh_qs(void)
20683+{
20684+ unsigned long flags;
20685+
20686+ /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
20687+ local_irq_save(flags);
20688+ rcu_preempt_qs();
20689+ local_irq_restore(flags);
20690+}
20691+#else
20692 void rcu_bh_qs(void)
20693 {
20694 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
20695@@ -253,6 +271,7 @@
20696 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20697 }
20698 }
20699+#endif
20700
20701 /*
20702 * Steal a bit from the bottom of ->dynticks for idle entry/exit
20703@@ -564,11 +583,13 @@
20704 /*
20705 * Return the number of RCU BH batches started thus far for debug & stats.
20706 */
20707+#ifndef CONFIG_PREEMPT_RT_FULL
20708 unsigned long rcu_batches_started_bh(void)
20709 {
20710 return rcu_bh_state.gpnum;
20711 }
20712 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20713+#endif
20714
20715 /*
20716 * Return the number of RCU batches completed thus far for debug & stats.
20717@@ -588,6 +609,7 @@
20718 }
20719 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20720
20721+#ifndef CONFIG_PREEMPT_RT_FULL
20722 /*
20723 * Return the number of RCU BH batches completed thus far for debug & stats.
20724 */
20725@@ -596,6 +618,7 @@
20726 return rcu_bh_state.completed;
20727 }
20728 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
20729+#endif
20730
20731 /*
20732 * Return the number of RCU expedited batches completed thus far for
20733@@ -619,6 +642,7 @@
20734 }
20735 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
20736
20737+#ifndef CONFIG_PREEMPT_RT_FULL
20738 /*
20739 * Force a quiescent state.
20740 */
20741@@ -637,6 +661,13 @@
1a6e0f06 20742 }
e4b2b4a8 20743 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
1a6e0f06 20744
1a6e0f06 20745+#else
e4b2b4a8
JK
20746+void rcu_force_quiescent_state(void)
20747+{
20748+}
20749+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1a6e0f06 20750+#endif
e4b2b4a8
JK
20751+
20752 /*
20753 * Force a quiescent state for RCU-sched.
20754 */
20755@@ -687,9 +718,11 @@
20756 case RCU_FLAVOR:
20757 rsp = rcu_state_p;
20758 break;
20759+#ifndef CONFIG_PREEMPT_RT_FULL
20760 case RCU_BH_FLAVOR:
20761 rsp = &rcu_bh_state;
20762 break;
20763+#endif
20764 case RCU_SCHED_FLAVOR:
20765 rsp = &rcu_sched_state;
20766 break;
20767@@ -2918,18 +2951,17 @@
20768 /*
20769 * Do RCU core processing for the current CPU.
20770 */
20771-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
20772+static __latent_entropy void rcu_process_callbacks(void)
20773 {
20774 struct rcu_state *rsp;
1a6e0f06 20775
e4b2b4a8
JK
20776 if (cpu_is_offline(smp_processor_id()))
20777 return;
20778- trace_rcu_utilization(TPS("Start RCU core"));
20779 for_each_rcu_flavor(rsp)
20780 __rcu_process_callbacks(rsp);
20781- trace_rcu_utilization(TPS("End RCU core"));
1a6e0f06
JK
20782 }
20783
e4b2b4a8
JK
20784+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
20785 /*
20786 * Schedule RCU callback invocation. If the specified type of RCU
20787 * does not support RCU priority boosting, just do a direct call,
20788@@ -2941,18 +2973,105 @@
1a6e0f06 20789 {
e4b2b4a8
JK
20790 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
20791 return;
20792- if (likely(!rsp->boost)) {
20793- rcu_do_batch(rsp, rdp);
20794+ rcu_do_batch(rsp, rdp);
20795+}
1a6e0f06 20796+
e4b2b4a8
JK
20797+static void rcu_wake_cond(struct task_struct *t, int status)
20798+{
20799+ /*
20800+ * If the thread is yielding, only wake it when this
20801+ * is invoked from idle
20802+ */
20803+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
20804+ wake_up_process(t);
20805+}
1a6e0f06 20806+
e4b2b4a8
JK
20807+/*
20808+ * Wake up this CPU's rcuc kthread to do RCU core processing.
20809+ */
20810+static void invoke_rcu_core(void)
20811+{
20812+ unsigned long flags;
20813+ struct task_struct *t;
1a6e0f06 20814+
e4b2b4a8
JK
20815+ if (!cpu_online(smp_processor_id()))
20816 return;
20817+ local_irq_save(flags);
20818+ __this_cpu_write(rcu_cpu_has_work, 1);
20819+ t = __this_cpu_read(rcu_cpu_kthread_task);
20820+ if (t != NULL && current != t)
20821+ rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
20822+ local_irq_restore(flags);
20823+}
1a6e0f06 20824+
e4b2b4a8
JK
20825+static void rcu_cpu_kthread_park(unsigned int cpu)
20826+{
20827+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
20828+}
1a6e0f06 20829+
e4b2b4a8 20830+static int rcu_cpu_kthread_should_run(unsigned int cpu)
1a6e0f06 20831+{
e4b2b4a8 20832+ return __this_cpu_read(rcu_cpu_has_work);
1a6e0f06
JK
20833+}
20834+
e4b2b4a8
JK
20835+/*
20836+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
20837+ * RCU softirq used in flavors and configurations of RCU that do not
20838+ * support RCU priority boosting.
20839+ */
20840+static void rcu_cpu_kthread(unsigned int cpu)
20841+{
20842+ unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
20843+ char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
20844+ int spincnt;
20845+
20846+ for (spincnt = 0; spincnt < 10; spincnt++) {
20847+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
20848+ local_bh_disable();
20849+ *statusp = RCU_KTHREAD_RUNNING;
20850+ this_cpu_inc(rcu_cpu_kthread_loops);
20851+ local_irq_disable();
20852+ work = *workp;
20853+ *workp = 0;
20854+ local_irq_enable();
20855+ if (work)
20856+ rcu_process_callbacks();
20857+ local_bh_enable();
20858+ if (*workp == 0) {
20859+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
20860+ *statusp = RCU_KTHREAD_WAITING;
20861+ return;
20862+ }
20863 }
20864- invoke_rcu_callbacks_kthread();
20865+ *statusp = RCU_KTHREAD_YIELDING;
20866+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
20867+ schedule_timeout_interruptible(2);
20868+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
20869+ *statusp = RCU_KTHREAD_WAITING;
20870 }
20871
20872-static void invoke_rcu_core(void)
20873+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
20874+ .store = &rcu_cpu_kthread_task,
20875+ .thread_should_run = rcu_cpu_kthread_should_run,
20876+ .thread_fn = rcu_cpu_kthread,
20877+ .thread_comm = "rcuc/%u",
20878+ .setup = rcu_cpu_kthread_setup,
20879+ .park = rcu_cpu_kthread_park,
20880+};
20881+
20882+/*
20883+ * Spawn per-CPU RCU core processing kthreads.
20884+ */
20885+static int __init rcu_spawn_core_kthreads(void)
20886 {
20887- if (cpu_online(smp_processor_id()))
20888- raise_softirq(RCU_SOFTIRQ);
20889+ int cpu;
20890+
20891+ for_each_possible_cpu(cpu)
20892+ per_cpu(rcu_cpu_has_work, cpu) = 0;
20893+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
20894+ return 0;
20895 }
20896+early_initcall(rcu_spawn_core_kthreads);
1a6e0f06 20897
e4b2b4a8
JK
20898 /*
20899 * Handle any core-RCU processing required by a call_rcu() invocation.
20900@@ -3113,6 +3232,7 @@
20901 }
20902 EXPORT_SYMBOL_GPL(call_rcu_sched);
20903
20904+#ifndef CONFIG_PREEMPT_RT_FULL
20905 /**
20906 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
20907 * @head: structure to be used for queueing the RCU updates.
20908@@ -3140,6 +3260,7 @@
20909 __call_rcu(head, func, &rcu_bh_state, -1, 0);
20910 }
20911 EXPORT_SYMBOL_GPL(call_rcu_bh);
20912+#endif
20913
20914 /*
20915 * Queue an RCU callback for lazy invocation after a grace period.
20916@@ -3225,6 +3346,7 @@
20917 }
20918 EXPORT_SYMBOL_GPL(synchronize_sched);
20919
20920+#ifndef CONFIG_PREEMPT_RT_FULL
20921 /**
20922 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
20923 *
20924@@ -3251,6 +3373,7 @@
20925 wait_rcu_gp(call_rcu_bh);
20926 }
20927 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
20928+#endif
20929
20930 /**
20931 * get_state_synchronize_rcu - Snapshot current RCU state
20932@@ -3601,6 +3724,7 @@
20933 mutex_unlock(&rsp->barrier_mutex);
20934 }
20935
20936+#ifndef CONFIG_PREEMPT_RT_FULL
20937 /**
20938 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
20939 */
20940@@ -3609,6 +3733,7 @@
20941 _rcu_barrier(&rcu_bh_state);
20942 }
20943 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
20944+#endif
20945
20946 /**
20947 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
20948@@ -3741,8 +3866,6 @@
20949 {
20950 sync_sched_exp_online_cleanup(cpu);
20951 rcutree_affinity_setting(cpu, -1);
20952- if (IS_ENABLED(CONFIG_TREE_SRCU))
20953- srcu_online_cpu(cpu);
20954 return 0;
20955 }
20956
20957@@ -3753,8 +3876,6 @@
20958 int rcutree_offline_cpu(unsigned int cpu)
20959 {
20960 rcutree_affinity_setting(cpu, cpu);
20961- if (IS_ENABLED(CONFIG_TREE_SRCU))
20962- srcu_offline_cpu(cpu);
20963 return 0;
20964 }
20965
20966@@ -4184,12 +4305,13 @@
20967
20968 rcu_bootup_announce();
20969 rcu_init_geometry();
20970+#ifndef CONFIG_PREEMPT_RT_FULL
20971 rcu_init_one(&rcu_bh_state);
20972+#endif
20973 rcu_init_one(&rcu_sched_state);
20974 if (dump_tree)
20975 rcu_dump_rcu_node_tree(&rcu_sched_state);
20976 __rcu_init_preempt();
20977- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
20978
20979 /*
20980 * We don't need protection against CPU-hotplug here because
20981@@ -4200,8 +4322,6 @@
20982 for_each_online_cpu(cpu) {
20983 rcutree_prepare_cpu(cpu);
20984 rcu_cpu_starting(cpu);
20985- if (IS_ENABLED(CONFIG_TREE_SRCU))
20986- srcu_online_cpu(cpu);
20987 }
20988 }
20989
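
[Editor's note — illustration only, not part of the patch. The rcu/tree.c hunks above replace the RCU softirq with per-CPU "rcuc" kthreads registered through smpboot_register_percpu_thread(), driven by rcu_cpu_has_work and woken from invoke_rcu_core(). The userspace sketch below models that worker pattern with pthreads and a condition variable standing in for the smpboot machinery; all names with a _model suffix are made up for the sketch.]

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool has_work;			/* cf. rcu_cpu_has_work */
static bool stop;

static void process_callbacks(void)	/* cf. rcu_process_callbacks() */
{
	printf("processing RCU callbacks\n");
}

static void *rcu_cpu_kthread_model(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		while (!has_work && !stop)	/* cf. thread_should_run() */
			pthread_cond_wait(&wake, &lock);
		if (has_work) {
			has_work = false;
			pthread_mutex_unlock(&lock);
			process_callbacks();	/* cf. thread_fn() body */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void invoke_rcu_core_model(void)	/* cf. invoke_rcu_core() */
{
	pthread_mutex_lock(&lock);
	has_work = true;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, rcu_cpu_kthread_model, NULL);
	invoke_rcu_core_model();
	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}
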
20990diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.h linux-4.14/kernel/rcu/tree.h
20991--- linux-4.14.orig/kernel/rcu/tree.h 2017-11-12 19:46:13.000000000 +0100
20992+++ linux-4.14/kernel/rcu/tree.h 2018-09-05 11:05:07.000000000 +0200
20993@@ -427,7 +427,9 @@
20994 */
20995 extern struct rcu_state rcu_sched_state;
20996
20997+#ifndef CONFIG_PREEMPT_RT_FULL
20998 extern struct rcu_state rcu_bh_state;
20999+#endif
21000
21001 #ifdef CONFIG_PREEMPT_RCU
21002 extern struct rcu_state rcu_preempt_state;
21003@@ -436,12 +438,10 @@
21004 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21005 bool rcu_eqs_special_set(int cpu);
21006
21007-#ifdef CONFIG_RCU_BOOST
21008 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21009 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21010 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21011 DECLARE_PER_CPU(char, rcu_cpu_has_work);
21012-#endif /* #ifdef CONFIG_RCU_BOOST */
21013
21014 #ifndef RCU_TREE_NONCORE
21015
21016@@ -461,10 +461,9 @@
21017 static void __init __rcu_init_preempt(void);
21018 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21019 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21020-static void invoke_rcu_callbacks_kthread(void);
21021 static bool rcu_is_callbacks_kthread(void);
21022+static void rcu_cpu_kthread_setup(unsigned int cpu);
21023 #ifdef CONFIG_RCU_BOOST
21024-static void rcu_preempt_do_callbacks(void);
21025 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21026 struct rcu_node *rnp);
21027 #endif /* #ifdef CONFIG_RCU_BOOST */
21028diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree_plugin.h linux-4.14/kernel/rcu/tree_plugin.h
21029--- linux-4.14.orig/kernel/rcu/tree_plugin.h 2018-09-05 11:03:22.000000000 +0200
21030+++ linux-4.14/kernel/rcu/tree_plugin.h 2018-09-05 11:05:07.000000000 +0200
21031@@ -24,39 +24,16 @@
21032 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21033 */
21034
21035-#include <linux/delay.h>
21036-#include <linux/gfp.h>
21037-#include <linux/oom.h>
21038-#include <linux/sched/debug.h>
21039-#include <linux/smpboot.h>
21040-#include <uapi/linux/sched/types.h>
21041-#include "../time/tick-internal.h"
21042-
21043-#ifdef CONFIG_RCU_BOOST
21044-
21045 #include "../locking/rtmutex_common.h"
21046
21047 /*
21048 * Control variables for per-CPU and per-rcu_node kthreads. These
21049 * handle all flavors of RCU.
21050 */
21051-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21052 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21053 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21054 DEFINE_PER_CPU(char, rcu_cpu_has_work);
21055
21056-#else /* #ifdef CONFIG_RCU_BOOST */
21057-
21058-/*
21059- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21060- * all uses are in dead code. Provide a definition to keep the compiler
21061- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21062- * This probably needs to be excluded from -rt builds.
21063- */
21064-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21065-
21066-#endif /* #else #ifdef CONFIG_RCU_BOOST */
21067-
21068 #ifdef CONFIG_RCU_NOCB_CPU
21069 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21070 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
21071@@ -324,9 +301,13 @@
21072 struct task_struct *t = current;
21073 struct rcu_data *rdp;
21074 struct rcu_node *rnp;
21075+ int sleeping_l = 0;
21076
21077 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21078- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21079+#if defined(CONFIG_PREEMPT_RT_FULL)
21080+ sleeping_l = t->sleeping_lock;
21081+#endif
21082+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21083 if (t->rcu_read_lock_nesting > 0 &&
21084 !t->rcu_read_unlock_special.b.blocked) {
21085
21086@@ -463,7 +444,7 @@
21087 }
21088
21089 /* Hardware IRQ handlers cannot block, complain if they get here. */
21090- if (in_irq() || in_serving_softirq()) {
21091+ if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21092 lockdep_rcu_suspicious(__FILE__, __LINE__,
21093 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21094 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21095@@ -530,7 +511,7 @@
21096
21097 /* Unboost if we were boosted. */
21098 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21099- rt_mutex_unlock(&rnp->boost_mtx);
21100+ rt_mutex_futex_unlock(&rnp->boost_mtx);
21101
21102 /*
21103 * If this was the last task on the expedited lists,
21104@@ -684,15 +665,6 @@
21105 t->rcu_read_unlock_special.b.need_qs = true;
21106 }
21107
21108-#ifdef CONFIG_RCU_BOOST
21109-
21110-static void rcu_preempt_do_callbacks(void)
21111-{
21112- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21113-}
21114-
21115-#endif /* #ifdef CONFIG_RCU_BOOST */
21116-
21117 /**
21118 * call_rcu() - Queue an RCU callback for invocation after a grace period.
21119 * @head: structure to be used for queueing the RCU updates.
21120@@ -915,20 +887,23 @@
21121
21122 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21123 
21124+/*
21125+ * If boosting, set rcuc kthreads to realtime priority.
21126+ */
21127+static void rcu_cpu_kthread_setup(unsigned int cpu)
21128+{
21129 #ifdef CONFIG_RCU_BOOST
21130+ struct sched_param sp;
21131
21132-#include "../locking/rtmutex_common.h"
21133-
21134-static void rcu_wake_cond(struct task_struct *t, int status)
21135-{
21136- /*
21137- * If the thread is yielding, only wake it when this
21138- * is invoked from idle
21139- */
21140- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21141- wake_up_process(t);
21142+ sp.sched_priority = kthread_prio;
21143+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21144+#endif /* #ifdef CONFIG_RCU_BOOST */
21145 }
21146
21147+#ifdef CONFIG_RCU_BOOST
21148+
21149+#include "../locking/rtmutex_common.h"
21150+
21151 /*
21152 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21153 * or ->boost_tasks, advancing the pointer to the next task in the
21154@@ -1071,23 +1046,6 @@
21155 }
21156 
21157 /*
21158- * Wake up the per-CPU kthread to invoke RCU callbacks.
21159- */
21160-static void invoke_rcu_callbacks_kthread(void)
21161-{
21162- unsigned long flags;
21163-
21164- local_irq_save(flags);
21165- __this_cpu_write(rcu_cpu_has_work, 1);
21166- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21167- current != __this_cpu_read(rcu_cpu_kthread_task)) {
21168- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21169- __this_cpu_read(rcu_cpu_kthread_status));
21170- }
21171- local_irq_restore(flags);
21172-}
21173-
21174-/*
21175 * Is the current CPU running the RCU-callbacks kthread?
21176 * Caller must have preemption disabled.
21177 */
21178@@ -1141,67 +1099,6 @@
21179 return 0;
21180 }
21181 
21182-static void rcu_kthread_do_work(void)
21183-{
21184- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21185- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21186- rcu_preempt_do_callbacks();
21187-}
21188-
21189-static void rcu_cpu_kthread_setup(unsigned int cpu)
21190-{
21191- struct sched_param sp;
21192-
21193- sp.sched_priority = kthread_prio;
21194- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21195-}
21196-
21197-static void rcu_cpu_kthread_park(unsigned int cpu)
21198-{
21199- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21200-}
21201-
21202-static int rcu_cpu_kthread_should_run(unsigned int cpu)
21203-{
21204- return __this_cpu_read(rcu_cpu_has_work);
21205-}
21206-
21207-/*
21208- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21209- * RCU softirq used in flavors and configurations of RCU that do not
21210- * support RCU priority boosting.
21211- */
21212-static void rcu_cpu_kthread(unsigned int cpu)
21213-{
21214- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21215- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21216- int spincnt;
21217-
21218- for (spincnt = 0; spincnt < 10; spincnt++) {
21219- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21220- local_bh_disable();
21221- *statusp = RCU_KTHREAD_RUNNING;
21222- this_cpu_inc(rcu_cpu_kthread_loops);
21223- local_irq_disable();
21224- work = *workp;
21225- *workp = 0;
21226- local_irq_enable();
21227- if (work)
21228- rcu_kthread_do_work();
21229- local_bh_enable();
21230- if (*workp == 0) {
21231- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21232- *statusp = RCU_KTHREAD_WAITING;
21233- return;
21234- }
21235- }
21236- *statusp = RCU_KTHREAD_YIELDING;
21237- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21238- schedule_timeout_interruptible(2);
21239- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21240- *statusp = RCU_KTHREAD_WAITING;
21241-}
21242-
21243 /*
21244 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21245 * served by the rcu_node in question. The CPU hotplug lock is still
21246@@ -1232,26 +1129,12 @@
21247 free_cpumask_var(cm);
21248 }
21249 
21250-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21251- .store = &rcu_cpu_kthread_task,
21252- .thread_should_run = rcu_cpu_kthread_should_run,
21253- .thread_fn = rcu_cpu_kthread,
21254- .thread_comm = "rcuc/%u",
21255- .setup = rcu_cpu_kthread_setup,
21256- .park = rcu_cpu_kthread_park,
21257-};
21258-
21259 /*
21260 * Spawn boost kthreads -- called as soon as the scheduler is running.
21261 */
21262 static void __init rcu_spawn_boost_kthreads(void)
21263 {
21264 struct rcu_node *rnp;
21265- int cpu;
21266-
21267- for_each_possible_cpu(cpu)
21268- per_cpu(rcu_cpu_has_work, cpu) = 0;
21269- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21270 rcu_for_each_leaf_node(rcu_state_p, rnp)
21271 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21272 }
21273@@ -1274,11 +1157,6 @@
21274 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21275 }
21276 
21277-static void invoke_rcu_callbacks_kthread(void)
21278-{
21279- WARN_ON_ONCE(1);
21280-}
21281-
21282 static bool rcu_is_callbacks_kthread(void)
21283 {
21284 return false;
21285@@ -1302,7 +1180,7 @@
21286 
21287 #endif /* #else #ifdef CONFIG_RCU_BOOST */
21288 
21289-#if !defined(CONFIG_RCU_FAST_NO_HZ)
21290+#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21291 
21292 /*
21293 * Check to see if any future RCU-related work will need to be done
21294@@ -1318,7 +1196,9 @@
21295 *nextevt = KTIME_MAX;
21296 return rcu_cpu_has_callbacks(NULL);
21297 }
21298+#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21299 
21300+#if !defined(CONFIG_RCU_FAST_NO_HZ)
21301 /*
21302 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21303 * after it.
21304@@ -1414,6 +1294,8 @@
21305 return cbs_ready;
21306 }
21307 
21308+#ifndef CONFIG_PREEMPT_RT_FULL
21309+
21310 /*
21311 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21312 * to invoke. If the CPU has callbacks, try to advance them. Tell the
21313@@ -1456,6 +1338,7 @@
21314 *nextevt = basemono + dj * TICK_NSEC;
21315 return 0;
21316 }
21317+#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21318 
21319 /*
21320 * Prepare a CPU for idle from an RCU perspective. The first major task
21321diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/update.c linux-4.14/kernel/rcu/update.c
21322--- linux-4.14.orig/kernel/rcu/update.c 2018-09-05 11:03:22.000000000 +0200
21323+++ linux-4.14/kernel/rcu/update.c 2018-09-05 11:05:07.000000000 +0200
21324@@ -66,7 +66,7 @@
21325 module_param(rcu_expedited, int, 0);
21326 extern int rcu_normal; /* from sysctl */
21327 module_param(rcu_normal, int, 0);
21328-static int rcu_normal_after_boot;
21329+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21330 module_param(rcu_normal_after_boot, int, 0);
21331 #endif /* #ifndef CONFIG_TINY_RCU */
21332 
21333@@ -333,6 +333,7 @@
21334 }
21335 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21336 
21337+#ifndef CONFIG_PREEMPT_RT_FULL
21338 /**
21339 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21340 *
21341@@ -359,6 +360,7 @@
21342 return in_softirq() || irqs_disabled();
21343 }
21344 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21345+#endif
21346 
21347 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21348 
21349diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/completion.c linux-4.14/kernel/sched/completion.c
21350--- linux-4.14.orig/kernel/sched/completion.c 2017-11-12 19:46:13.000000000 +0100
21351+++ linux-4.14/kernel/sched/completion.c 2018-09-05 11:05:07.000000000 +0200
21352@@ -32,7 +32,7 @@
21353 {
21354 unsigned long flags;
21355 
21356- spin_lock_irqsave(&x->wait.lock, flags);
21357+ raw_spin_lock_irqsave(&x->wait.lock, flags);
21358 
21359 /*
21360 * Perform commit of crossrelease here.
21361@@ -41,8 +41,8 @@
21362 
21363 if (x->done != UINT_MAX)
21364 x->done++;
21365- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21366- spin_unlock_irqrestore(&x->wait.lock, flags);
21367+ swake_up_locked(&x->wait);
21368+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21369 }
21370 EXPORT_SYMBOL(complete);
21371 
21372@@ -66,10 +66,10 @@
21373 {
21374 unsigned long flags;
21375 
21376- spin_lock_irqsave(&x->wait.lock, flags);
21377+ raw_spin_lock_irqsave(&x->wait.lock, flags);
21378 x->done = UINT_MAX;
21379- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21380- spin_unlock_irqrestore(&x->wait.lock, flags);
21381+ swake_up_all_locked(&x->wait);
21382+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21383 }
21384 EXPORT_SYMBOL(complete_all);
21385 
21386@@ -78,20 +78,20 @@
21387 long (*action)(long), long timeout, int state)
21388 {
21389 if (!x->done) {
21390- DECLARE_WAITQUEUE(wait, current);
21391+ DECLARE_SWAITQUEUE(wait);
21392 
21393- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
21394+ __prepare_to_swait(&x->wait, &wait);
21395 do {
21396 if (signal_pending_state(state, current)) {
21397 timeout = -ERESTARTSYS;
21398 break;
21399 }
21400 __set_current_state(state);
21401- spin_unlock_irq(&x->wait.lock);
21402+ raw_spin_unlock_irq(&x->wait.lock);
21403 timeout = action(timeout);
21404- spin_lock_irq(&x->wait.lock);
21405+ raw_spin_lock_irq(&x->wait.lock);
21406 } while (!x->done && timeout);
21407- __remove_wait_queue(&x->wait, &wait);
21408+ __finish_swait(&x->wait, &wait);
21409 if (!x->done)
21410 return timeout;
21411 }
21412@@ -108,9 +108,9 @@
21413 
21414 complete_acquire(x);
21415 
21416- spin_lock_irq(&x->wait.lock);
21417+ raw_spin_lock_irq(&x->wait.lock);
21418 timeout = do_wait_for_common(x, action, timeout, state);
21419- spin_unlock_irq(&x->wait.lock);
21420+ raw_spin_unlock_irq(&x->wait.lock);
21421 
21422 complete_release(x);
21423
21424@@ -299,12 +299,12 @@
21425 if (!READ_ONCE(x->done))
21426 return 0;
21427
21428- spin_lock_irqsave(&x->wait.lock, flags);
21429+ raw_spin_lock_irqsave(&x->wait.lock, flags);
21430 if (!x->done)
21431 ret = 0;
21432 else if (x->done != UINT_MAX)
21433 x->done--;
21434- spin_unlock_irqrestore(&x->wait.lock, flags);
21435+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21436 return ret;
21437 }
21438 EXPORT_SYMBOL(try_wait_for_completion);
21439@@ -330,8 +330,8 @@
21440 * otherwise we can end up freeing the completion before complete()
21441 * is done referencing it.
21442 */
21443- spin_lock_irqsave(&x->wait.lock, flags);
21444- spin_unlock_irqrestore(&x->wait.lock, flags);
21445+ raw_spin_lock_irqsave(&x->wait.lock, flags);
21446+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21447 return true;
21448 }
21449 EXPORT_SYMBOL(completion_done);
21450diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/core.c linux-4.14/kernel/sched/core.c
21451--- linux-4.14.orig/kernel/sched/core.c 2018-09-05 11:03:22.000000000 +0200
21452+++ linux-4.14/kernel/sched/core.c 2018-09-05 11:05:07.000000000 +0200
21453@@ -59,7 +59,11 @@
21454 * Number of tasks to iterate in a single balance run.
21455 * Limited because this is done with IRQs disabled.
21456 */
21457+#ifndef CONFIG_PREEMPT_RT_FULL
21458 const_debug unsigned int sysctl_sched_nr_migrate = 32;
21459+#else
21460+const_debug unsigned int sysctl_sched_nr_migrate = 8;
21461+#endif
21462 
21463 /*
21464 * period over which we average the RT time consumption, measured
21465@@ -341,7 +345,7 @@
21466 rq->hrtick_csd.info = rq;
21467 #endif
21468 
21469- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21470+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
21471 rq->hrtick_timer.function = hrtick;
21472 }
21473 #else /* CONFIG_SCHED_HRTICK */
21474@@ -423,9 +427,15 @@
21475 #endif
21476 #endif
21477 
21478-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21479+void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
21480+ bool sleeper)
21481 {
21482- struct wake_q_node *node = &task->wake_q;
21483+ struct wake_q_node *node;
21484+
21485+ if (sleeper)
21486+ node = &task->wake_q_sleeper;
21487+ else
21488+ node = &task->wake_q;
21489 
21490 /*
21491 * Atomically grab the task, if ->wake_q is !nil already it means
21492@@ -447,24 +457,32 @@
21493 head->lastp = &node->next;
21494 }
21495
21496-void wake_up_q(struct wake_q_head *head)
21497+void __wake_up_q(struct wake_q_head *head, bool sleeper)
21498 {
21499 struct wake_q_node *node = head->first;
21500
21501 while (node != WAKE_Q_TAIL) {
21502 struct task_struct *task;
21503
21504- task = container_of(node, struct task_struct, wake_q);
21505+ if (sleeper)
21506+ task = container_of(node, struct task_struct, wake_q_sleeper);
21507+ else
21508+ task = container_of(node, struct task_struct, wake_q);
21509 BUG_ON(!task);
21510 /* Task can safely be re-inserted now: */
21511 node = node->next;
21512- task->wake_q.next = NULL;
21513-
21514+ if (sleeper)
21515+ task->wake_q_sleeper.next = NULL;
21516+ else
21517+ task->wake_q.next = NULL;
21518 /*
21519 * wake_up_process() implies a wmb() to pair with the queueing
21520 * in wake_q_add() so as not to miss wakeups.
21521 */
21522- wake_up_process(task);
21523+ if (sleeper)
21524+ wake_up_lock_sleeper(task);
21525+ else
21526+ wake_up_process(task);
21527 put_task_struct(task);
21528 }
21529 }
21530@@ -500,6 +518,48 @@
21531 trace_sched_wake_idle_without_ipi(cpu);
21532 }
21533 
21534+#ifdef CONFIG_PREEMPT_LAZY
21535+
21536+static int tsk_is_polling(struct task_struct *p)
21537+{
21538+#ifdef TIF_POLLING_NRFLAG
21539+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
21540+#else
21541+ return 0;
21542+#endif
21543+}
21544+
21545+void resched_curr_lazy(struct rq *rq)
21546+{
21547+ struct task_struct *curr = rq->curr;
21548+ int cpu;
21549+
21550+ if (!sched_feat(PREEMPT_LAZY)) {
21551+ resched_curr(rq);
21552+ return;
21553+ }
21554+
21555+ lockdep_assert_held(&rq->lock);
21556+
21557+ if (test_tsk_need_resched(curr))
21558+ return;
21559+
21560+ if (test_tsk_need_resched_lazy(curr))
21561+ return;
21562+
21563+ set_tsk_need_resched_lazy(curr);
21564+
21565+ cpu = cpu_of(rq);
21566+ if (cpu == smp_processor_id())
21567+ return;
21568+
21569+ /* NEED_RESCHED_LAZY must be visible before we test polling */
21570+ smp_mb();
21571+ if (!tsk_is_polling(curr))
21572+ smp_send_reschedule(cpu);
21573+}
21574+#endif
21575+
21576 void resched_cpu(int cpu)
21577 {
21578 struct rq *rq = cpu_rq(cpu);
21579@@ -523,11 +583,14 @@
21580 */
21581 int get_nohz_timer_target(void)
21582 {
21583- int i, cpu = smp_processor_id();
21584+ int i, cpu;
21585 struct sched_domain *sd;
21586 
21587+ preempt_disable_rt();
21588+ cpu = smp_processor_id();
21589+
21590 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
21591- return cpu;
21592+ goto preempt_en_rt;
21593 
21594 rcu_read_lock();
21595 for_each_domain(cpu, sd) {
21596@@ -546,6 +609,8 @@
21597 cpu = housekeeping_any_cpu();
21598 unlock:
21599 rcu_read_unlock();
21600+preempt_en_rt:
21601+ preempt_enable_rt();
21602 return cpu;
21603 }
21604
21605@@ -912,7 +977,7 @@
21606 */
21607 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
21608 {
21609- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
21610+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
21611 return false;
21612 
21613 if (is_per_cpu_kthread(p))
21614@@ -1007,7 +1072,7 @@
21615 local_irq_disable();
21616 /*
21617 * We need to explicitly wake pending tasks before running
21618- * __migrate_task() such that we will not miss enforcing cpus_allowed
21619+ * __migrate_task() such that we will not miss enforcing cpus_ptr
21620 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
21621 */
21622 sched_ttwu_pending();
21623@@ -1038,11 +1103,19 @@
21624 */
21625 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
21626 {
21627- cpumask_copy(&p->cpus_allowed, new_mask);
21628+ cpumask_copy(&p->cpus_mask, new_mask);
21629 p->nr_cpus_allowed = cpumask_weight(new_mask);
21630 }
21631 
21632-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21633+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21634+int __migrate_disabled(struct task_struct *p)
21635+{
21636+ return p->migrate_disable;
21637+}
21638+#endif
21639+
21640+static void __do_set_cpus_allowed_tail(struct task_struct *p,
21641+ const struct cpumask *new_mask)
21642 {
21643 struct rq *rq = task_rq(p);
21644 bool queued, running;
21645@@ -1071,6 +1144,20 @@
21646 set_curr_task(rq, p);
21647 }
21648
21649+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21650+{
21651+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21652+ if (__migrate_disabled(p)) {
21653+ lockdep_assert_held(&p->pi_lock);
21654+
21655+ cpumask_copy(&p->cpus_mask, new_mask);
21656+ p->migrate_disable_update = 1;
21657+ return;
21658+ }
21659+#endif
21660+ __do_set_cpus_allowed_tail(p, new_mask);
21661+}
21662+
21663 /*
21664 * Change a given task's CPU affinity. Migrate the thread to a
21665 * proper CPU and schedule it away if the CPU it's executing on
21666@@ -1108,7 +1195,7 @@
21667 goto out;
21668 }
21669 
21670- if (cpumask_equal(&p->cpus_allowed, new_mask))
21671+ if (cpumask_equal(p->cpus_ptr, new_mask))
21672 goto out;
21673 
21674 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
21675@@ -1129,9 +1216,16 @@
21676 }
21677 
21678 /* Can the task run on the task's current CPU? If so, we're done */
21679- if (cpumask_test_cpu(task_cpu(p), new_mask))
21680+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
21681 goto out;
21682 
21683+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21684+ if (__migrate_disabled(p)) {
21685+ p->migrate_disable_update = 1;
21686+ goto out;
21687+ }
21688+#endif
21689+
21690 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
21691 if (task_running(rq, p) || p->state == TASK_WAKING) {
21692 struct migration_arg arg = { p, dest_cpu };
21693@@ -1269,10 +1363,10 @@
21694 if (task_cpu(arg->src_task) != arg->src_cpu)
21695 goto unlock;
21696 
21697- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
21698+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
21699 goto unlock;
21700 
21701- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
21702+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
21703 goto unlock;
21704 
21705 __migrate_swap_task(arg->src_task, arg->dst_cpu);
21706@@ -1313,10 +1407,10 @@
21707 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
21708 goto out;
21709 
21710- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
21711+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
21712 goto out;
21713 
21714- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
21715+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
21716 goto out;
21717 
21718 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
21719@@ -1326,6 +1420,18 @@
21720 return ret;
21721 }
21722
21723+static bool check_task_state(struct task_struct *p, long match_state)
21724+{
21725+ bool match = false;
21726+
21727+ raw_spin_lock_irq(&p->pi_lock);
21728+ if (p->state == match_state || p->saved_state == match_state)
21729+ match = true;
21730+ raw_spin_unlock_irq(&p->pi_lock);
21731+
21732+ return match;
21733+}
21734+
21735 /*
21736 * wait_task_inactive - wait for a thread to unschedule.
21737 *
21738@@ -1370,7 +1476,7 @@
21739 * is actually now running somewhere else!
21740 */
21741 while (task_running(rq, p)) {
21742- if (match_state && unlikely(p->state != match_state))
21743+ if (match_state && !check_task_state(p, match_state))
21744 return 0;
21745 cpu_relax();
21746 }
21747@@ -1385,7 +1491,8 @@
21748 running = task_running(rq, p);
21749 queued = task_on_rq_queued(p);
21750 ncsw = 0;
21751- if (!match_state || p->state == match_state)
21752+ if (!match_state || p->state == match_state ||
21753+ p->saved_state == match_state)
21754 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
21755 task_rq_unlock(rq, p, &rf);
21756 
21757@@ -1460,7 +1567,7 @@
21758 EXPORT_SYMBOL_GPL(kick_process);
21759 
21760 /*
21761- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
21762+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
21763 *
21764 * A few notes on cpu_active vs cpu_online:
21765 *
21766@@ -1500,14 +1607,14 @@
21767 for_each_cpu(dest_cpu, nodemask) {
21768 if (!cpu_active(dest_cpu))
21769 continue;
21770- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
21771+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
21772 return dest_cpu;
21773 }
21774 }
21775 
21776 for (;;) {
21777 /* Any allowed, online CPU? */
21778- for_each_cpu(dest_cpu, &p->cpus_allowed) {
21779+ for_each_cpu(dest_cpu, p->cpus_ptr) {
21780 if (!is_cpu_allowed(p, dest_cpu))
21781 continue;
21782
21783@@ -1551,7 +1658,7 @@
21784 }
21785
21786 /*
21787- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
21788+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21789 */
21790 static inline
21791 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21792@@ -1561,11 +1668,11 @@
21793 if (p->nr_cpus_allowed > 1)
21794 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21795 else
21796- cpu = cpumask_any(&p->cpus_allowed);
21797+ cpu = cpumask_any(p->cpus_ptr);
21798
21799 /*
21800 * In order not to call set_task_cpu() on a blocking task we need
21801- * to rely on ttwu() to place the task on a valid ->cpus_allowed
21802+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
21803 * CPU.
21804 *
21805 * Since this is common to all placement strategies, this lives here.
21806@@ -1668,10 +1775,6 @@
21807 {
21808 activate_task(rq, p, en_flags);
21809 p->on_rq = TASK_ON_RQ_QUEUED;
21810-
21811- /* If a worker is waking up, notify the workqueue: */
21812- if (p->flags & PF_WQ_WORKER)
21813- wq_worker_waking_up(p, cpu_of(rq));
21814 }
21815 
21816 /*
21817@@ -1995,8 +2098,27 @@
21818 */
21819 raw_spin_lock_irqsave(&p->pi_lock, flags);
21820 smp_mb__after_spinlock();
21821- if (!(p->state & state))
21822+ if (!(p->state & state)) {
21823+ /*
21824+ * The task might be running due to a spinlock sleeper
21825+ * wakeup. Check the saved state and set it to running
21826+ * if the wakeup condition is true.
21827+ */
21828+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
21829+ if (p->saved_state & state) {
21830+ p->saved_state = TASK_RUNNING;
21831+ success = 1;
21832+ }
21833+ }
21834 goto out;
21835+ }
21836+
21837+ /*
21838+ * If this is a regular wakeup, then we can unconditionally
21839+ * clear the saved state of a "lock sleeper".
21840+ */
21841+ if (!(wake_flags & WF_LOCK_SLEEPER))
21842+ p->saved_state = TASK_RUNNING;
21843 
21844 trace_sched_waking(p);
21845 
21846@@ -2093,56 +2215,6 @@
21847 }
21848 
21849 /**
21850- * try_to_wake_up_local - try to wake up a local task with rq lock held
21851- * @p: the thread to be awakened
21852- * @rf: request-queue flags for pinning
21853- *
21854- * Put @p on the run-queue if it's not already there. The caller must
21855- * ensure that this_rq() is locked, @p is bound to this_rq() and not
21856- * the current task.
21857- */
21858-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
21859-{
21860- struct rq *rq = task_rq(p);
21861-
21862- if (WARN_ON_ONCE(rq != this_rq()) ||
21863- WARN_ON_ONCE(p == current))
21864- return;
21865-
21866- lockdep_assert_held(&rq->lock);
21867-
21868- if (!raw_spin_trylock(&p->pi_lock)) {
21869- /*
21870- * This is OK, because current is on_cpu, which avoids it being
21871- * picked for load-balance and preemption/IRQs are still
21872- * disabled avoiding further scheduler activity on it and we've
21873- * not yet picked a replacement task.
21874- */
21875- rq_unlock(rq, rf);
21876- raw_spin_lock(&p->pi_lock);
21877- rq_relock(rq, rf);
21878- }
21879-
21880- if (!(p->state & TASK_NORMAL))
21881- goto out;
21882-
21883- trace_sched_waking(p);
21884-
21885- if (!task_on_rq_queued(p)) {
21886- if (p->in_iowait) {
21887- delayacct_blkio_end(p);
21888- atomic_dec(&rq->nr_iowait);
21889- }
21890- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
21891- }
21892-
21893- ttwu_do_wakeup(rq, p, 0, rf);
21894- ttwu_stat(p, smp_processor_id(), 0);
21895-out:
21896- raw_spin_unlock(&p->pi_lock);
21897-}
21898-
21899-/**
21900 * wake_up_process - Wake up a specific process
21901 * @p: The process to be woken up.
21902 *
21903@@ -2160,6 +2232,18 @@
21904 }
21905 EXPORT_SYMBOL(wake_up_process);
21906
21907+/**
21908+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
21909+ * @p: The process to be woken up.
21910+ *
21911+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
21912+ * the nature of the wakeup.
21913+ */
21914+int wake_up_lock_sleeper(struct task_struct *p)
21915+{
21916+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
21917+}
21918+
21919 int wake_up_state(struct task_struct *p, unsigned int state)
21920 {
21921 return try_to_wake_up(p, state, 0);
21922@@ -2420,6 +2504,9 @@
21923 p->on_cpu = 0;
21924 #endif
21925 init_task_preempt_count(p);
21926+#ifdef CONFIG_HAVE_PREEMPT_LAZY
21927+ task_thread_info(p)->preempt_lazy_count = 0;
21928+#endif
21929 #ifdef CONFIG_SMP
21930 plist_node_init(&p->pushable_tasks, MAX_PRIO);
21931 RB_CLEAR_NODE(&p->pushable_dl_tasks);
21932@@ -2462,7 +2549,7 @@
21933 #ifdef CONFIG_SMP
21934 /*
21935 * Fork balancing, do it here and not earlier because:
21936- * - cpus_allowed can change in the fork path
21937+ * - cpus_ptr can change in the fork path
21938 * - any previously selected CPU might disappear through hotplug
21939 *
21940 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
21941@@ -2675,21 +2762,16 @@
21942 finish_arch_post_lock_switch();
21943 
21944 fire_sched_in_preempt_notifiers(current);
21945+ /*
21946+ * We use mmdrop_delayed() here so we don't have to do the
21947+ * full __mmdrop() when we are the last user.
21948+ */
21949 if (mm)
21950- mmdrop(mm);
21951+ mmdrop_delayed(mm);
21952 if (unlikely(prev_state == TASK_DEAD)) {
21953 if (prev->sched_class->task_dead)
21954 prev->sched_class->task_dead(prev);
21955 
21956- /*
21957- * Remove function-return probe instances associated with this
21958- * task and put them back on the free list.
21959- */
21960- kprobe_flush_task(prev);
21961-
21962- /* Task is done with its stack. */
21963- put_task_stack(prev);
21964-
21965 put_task_struct(prev);
21966 }
21967 
21968@@ -3336,25 +3418,13 @@
21969 atomic_inc(&rq->nr_iowait);
21970 delayacct_blkio_start();
21971 }
21972-
21973- /*
21974- * If a worker went to sleep, notify and ask workqueue
21975- * whether it wants to wake up a task to maintain
21976- * concurrency.
21977- */
21978- if (prev->flags & PF_WQ_WORKER) {
21979- struct task_struct *to_wakeup;
21980-
21981- to_wakeup = wq_worker_sleeping(prev);
21982- if (to_wakeup)
21983- try_to_wake_up_local(to_wakeup, &rf);
21984- }
21985 }
21986 switch_count = &prev->nvcsw;
21987 }
21988 
21989 next = pick_next_task(rq, prev, &rf);
21990 clear_tsk_need_resched(prev);
21991+ clear_tsk_need_resched_lazy(prev);
21992 clear_preempt_need_resched();
21993 
21994 if (likely(prev != next)) {
21995@@ -3407,8 +3477,19 @@
21996 
21997 static inline void sched_submit_work(struct task_struct *tsk)
21998 {
21999- if (!tsk->state || tsk_is_pi_blocked(tsk))
22000+ if (!tsk->state)
22001+ return;
22002+ /*
22003+ * If a worker went to sleep, notify and ask workqueue whether
22004+ * it wants to wake up a task to maintain concurrency.
22005+ */
22006+ if (tsk->flags & PF_WQ_WORKER)
22007+ wq_worker_sleeping(tsk);
22008+
22009+
22010+ if (tsk_is_pi_blocked(tsk))
22011 return;
22012+
22013 /*
22014 * If we are going to sleep and we have plugged IO queued,
22015 * make sure to submit it to avoid deadlocks.
22016@@ -3417,6 +3498,12 @@
22017 blk_schedule_flush_plug(tsk);
22018 }
22019 
22020+static void sched_update_worker(struct task_struct *tsk)
22021+{
22022+ if (tsk->flags & PF_WQ_WORKER)
22023+ wq_worker_running(tsk);
22024+}
22025+
22026 asmlinkage __visible void __sched schedule(void)
22027 {
22028 struct task_struct *tsk = current;
22029@@ -3427,6 +3514,7 @@
22030 __schedule(false);
22031 sched_preempt_enable_no_resched();
22032 } while (need_resched());
22033+ sched_update_worker(tsk);
22034 }
22035 EXPORT_SYMBOL(schedule);
22036 
22037@@ -3515,6 +3603,30 @@
22038 } while (need_resched());
22039 }
22040 
22041+#ifdef CONFIG_PREEMPT_LAZY
22042+/*
22043+ * If TIF_NEED_RESCHED is set, then we allow being scheduled away, since it is
22044+ * set by an RT task. Otherwise we try to avoid being scheduled out as long as
22045+ * the preempt_lazy_count counter is > 0.
22046+ */
22047+static __always_inline int preemptible_lazy(void)
22048+{
22049+ if (test_thread_flag(TIF_NEED_RESCHED))
22050+ return 1;
22051+ if (current_thread_info()->preempt_lazy_count)
22052+ return 0;
22053+ return 1;
22054+}
22055+
22056+#else
22057+
22058+static inline int preemptible_lazy(void)
22059+{
22060+ return 1;
22061+}
22062+
22063+#endif
22064+
22065 #ifdef CONFIG_PREEMPT
22066 /*
22067 * this is the entry point to schedule() from in-kernel preemption
22068@@ -3529,7 +3641,8 @@
22069 */
22070 if (likely(!preemptible()))
22071 return;
22072-
22073+ if (!preemptible_lazy())
22074+ return;
22075 preempt_schedule_common();
22076 }
22077 NOKPROBE_SYMBOL(preempt_schedule);
22078@@ -3556,6 +3669,9 @@
22079 if (likely(!preemptible()))
22080 return;
22081 
22082+ if (!preemptible_lazy())
22083+ return;
22084+
22085 do {
22086 /*
22087 * Because the function tracer can trace preempt_count_sub()
22088@@ -3578,7 +3694,16 @@
22089 * an infinite recursion.
22090 */
22091 prev_ctx = exception_enter();
22092+ /*
22093+ * The add/subtract must not be traced by the function
22094+ * tracer. But we still want to account for the
22095+ * preempt off latency tracer. Since the _notrace versions
22096+ * of add/subtract skip the accounting for latency tracer
22097+ * we must force it manually.
22098+ */
22099+ start_critical_timings();
22100 __schedule(true);
22101+ stop_critical_timings();
22102 exception_exit(prev_ctx);
22103 
22104 preempt_latency_stop(1);
22105@@ -4164,7 +4289,7 @@
22106 * the entire root_domain to become SCHED_DEADLINE. We
22107 * will also fail if there's no bandwidth available.
22108 */
22109- if (!cpumask_subset(span, &p->cpus_allowed) ||
22110+ if (!cpumask_subset(span, p->cpus_ptr) ||
22111 rq->rd->dl_bw.bw == 0) {
22112 task_rq_unlock(rq, p, &rf);
22113 return -EPERM;
22114@@ -4758,7 +4883,7 @@
22115 goto out_unlock;
22116 
22117 raw_spin_lock_irqsave(&p->pi_lock, flags);
22118- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22119+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22120 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22121
22122 out_unlock:
22123@@ -4877,6 +5002,7 @@
22124 }
22125 EXPORT_SYMBOL(__cond_resched_lock);
22126
22127+#ifndef CONFIG_PREEMPT_RT_FULL
22128 int __sched __cond_resched_softirq(void)
22129 {
22130 BUG_ON(!in_softirq());
22131@@ -4890,6 +5016,7 @@
22132 return 0;
22133 }
22134 EXPORT_SYMBOL(__cond_resched_softirq);
22135+#endif
22136 
22137 /**
22138 * yield - yield the current processor to other threads.
22139@@ -5284,7 +5411,9 @@
22140 
22141 /* Set the preempt count _outside_ the spinlocks! */
22142 init_idle_preempt_count(idle, cpu);
22143-
22144+#ifdef CONFIG_HAVE_PREEMPT_LAZY
22145+ task_thread_info(idle)->preempt_lazy_count = 0;
22146+#endif
22147 /*
22148 * The idle tasks have their own, simple scheduling class:
22149 */
22150@@ -5323,7 +5452,7 @@
22151 * allowed nodes is unnecessary. Thus, cpusets are not
22152 * applicable for such threads. This prevents checking for
22153 * success of set_cpus_allowed_ptr() on all attached tasks
22154- * before cpus_allowed may be changed.
22155+ * before cpus_mask may be changed.
22156 */
22157 if (p->flags & PF_NO_SETAFFINITY) {
22158 ret = -EINVAL;
22159@@ -5350,7 +5479,7 @@
22160 if (curr_cpu == target_cpu)
22161 return 0;
22162
22163- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22164+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22165 return -EINVAL;
22166
22167 /* TODO: This is not properly updating schedstats */
22168@@ -5389,6 +5518,8 @@
22169 #endif /* CONFIG_NUMA_BALANCING */
22170 
22171 #ifdef CONFIG_HOTPLUG_CPU
22172+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22173+
22174 /*
22175 * Ensure that the idle task is using init_mm right before its CPU goes
22176 * offline.
22177@@ -5403,7 +5534,12 @@
22178 switch_mm(mm, &init_mm, current);
22179 finish_arch_post_lock_switch();
22180 }
22181- mmdrop(mm);
22182+ /*
22183+ * Defer the cleanup to an alive cpu. On RT we can neither
22184+ * call mmdrop() nor mmdrop_delayed() from here.
22185+ */
22186+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
22187+
22188 }
22189 
22190 /*
22191@@ -5487,7 +5623,7 @@
22192 put_prev_task(rq, next);
22193
22194 /*
22195- * Rules for changing task_struct::cpus_allowed are holding
22196+ * Rules for changing task_struct::cpus_mask are holding
22197 * both pi_lock and rq->lock, such that holding either
22198 * stabilizes the mask.
22199 *
22200@@ -5718,6 +5854,10 @@
22201 update_max_interval();
22202 nohz_balance_exit_idle(cpu);
22203 hrtick_clear(rq);
22204+ if (per_cpu(idle_last_mm, cpu)) {
22205+ mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22206+ per_cpu(idle_last_mm, cpu) = NULL;
22207+ }
22208 return 0;
22209 }
22210 #endif
22211@@ -5964,7 +6104,7 @@
22212 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22213 static inline int preempt_count_equals(int preempt_offset)
22214 {
22215- int nested = preempt_count() + rcu_preempt_depth();
22216+ int nested = preempt_count() + sched_rcu_preempt_depth();
22217 
22218 return (nested == preempt_offset);
22219 }
22220@@ -6756,3 +6896,197 @@
22221 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
22222 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22223 };
22224+
22225+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22226+
22227+static inline void
22228+update_nr_migratory(struct task_struct *p, long delta)
22229+{
22230+ if (unlikely((p->sched_class == &rt_sched_class ||
22231+ p->sched_class == &dl_sched_class) &&
22232+ p->nr_cpus_allowed > 1)) {
22233+ if (p->sched_class == &rt_sched_class)
22234+ task_rq(p)->rt.rt_nr_migratory += delta;
22235+ else
22236+ task_rq(p)->dl.dl_nr_migratory += delta;
22237+ }
22238+}
22239+
22240+static inline void
22241+migrate_disable_update_cpus_allowed(struct task_struct *p)
22242+{
22243+ struct rq *rq;
22244+ struct rq_flags rf;
22245+
22246+ p->cpus_ptr = cpumask_of(smp_processor_id());
22247+
22248+ rq = task_rq_lock(p, &rf);
22249+ update_nr_migratory(p, -1);
22250+ p->nr_cpus_allowed = 1;
22251+ task_rq_unlock(rq, p, &rf);
22252+}
22253+
22254+static inline void
22255+migrate_enable_update_cpus_allowed(struct task_struct *p)
22256+{
22257+ struct rq *rq;
22258+ struct rq_flags rf;
22259+
22260+ p->cpus_ptr = &p->cpus_mask;
22261+
22262+ rq = task_rq_lock(p, &rf);
22263+ p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22264+ update_nr_migratory(p, 1);
22265+ task_rq_unlock(rq, p, &rf);
22266+}
22267+
22268+void migrate_disable(void)
22269+{
22270+ struct task_struct *p = current;
22271+
22272+ if (in_atomic() || irqs_disabled()) {
22273+#ifdef CONFIG_SCHED_DEBUG
22274+ p->migrate_disable_atomic++;
22275+#endif
22276+ return;
22277+ }
22278+#ifdef CONFIG_SCHED_DEBUG
22279+ if (unlikely(p->migrate_disable_atomic)) {
22280+ tracing_off();
22281+ WARN_ON_ONCE(1);
22282+ }
22283+#endif
22284+
22285+ if (p->migrate_disable) {
22286+ p->migrate_disable++;
22287+ return;
22288+ }
22289+
22290+ preempt_disable();
22291+ preempt_lazy_disable();
22292+ pin_current_cpu();
22293+
22294+ migrate_disable_update_cpus_allowed(p);
22295+ p->migrate_disable = 1;
22296+
22297+ preempt_enable();
22298+}
22299+EXPORT_SYMBOL(migrate_disable);
22300+
22301+void migrate_enable(void)
22302+{
22303+ struct task_struct *p = current;
22304+
22305+ if (in_atomic() || irqs_disabled()) {
22306+#ifdef CONFIG_SCHED_DEBUG
22307+ p->migrate_disable_atomic--;
22308+#endif
22309+ return;
22310+ }
22311+
22312+#ifdef CONFIG_SCHED_DEBUG
22313+ if (unlikely(p->migrate_disable_atomic)) {
22314+ tracing_off();
22315+ WARN_ON_ONCE(1);
22316+ }
22317+#endif
22318+
22319+ WARN_ON_ONCE(p->migrate_disable <= 0);
22320+ if (p->migrate_disable > 1) {
22321+ p->migrate_disable--;
22322+ return;
22323+ }
22324+
22325+ preempt_disable();
22326+
22327+ p->migrate_disable = 0;
22328+ migrate_enable_update_cpus_allowed(p);
22329+
22330+ if (p->migrate_disable_update) {
22331+ struct rq *rq;
22332+ struct rq_flags rf;
22333+
22334+ rq = task_rq_lock(p, &rf);
22335+ update_rq_clock(rq);
22336+
22337+ __do_set_cpus_allowed_tail(p, &p->cpus_mask);
22338+ task_rq_unlock(rq, p, &rf);
22339+
22340+ p->migrate_disable_update = 0;
22341+
22342+ WARN_ON(smp_processor_id() != task_cpu(p));
22343+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
22344+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
22345+ struct migration_arg arg;
22346+ unsigned int dest_cpu;
22347+
22348+ if (p->flags & PF_KTHREAD) {
22349+ /*
22350+ * Kernel threads are allowed on online && !active CPUs
22351+ */
22352+ cpu_valid_mask = cpu_online_mask;
22353+ }
22354+ dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
22355+ arg.task = p;
22356+ arg.dest_cpu = dest_cpu;
22357+
22358+ unpin_current_cpu();
22359+ preempt_lazy_enable();
22360+ preempt_enable();
22361+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
22362+ tlb_migrate_finish(p->mm);
22363+
22364+ return;
22365+ }
22366+ }
22367+ unpin_current_cpu();
22368+ preempt_lazy_enable();
22369+ preempt_enable();
22370+}
22371+EXPORT_SYMBOL(migrate_enable);
22372+
22373+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22374+void migrate_disable(void)
22375+{
22376+ struct task_struct *p = current;
22377+
22378+ if (in_atomic() || irqs_disabled()) {
22379+#ifdef CONFIG_SCHED_DEBUG
22380+ p->migrate_disable_atomic++;
22381+#endif
22382+ return;
22383+ }
22384+#ifdef CONFIG_SCHED_DEBUG
22385+ if (unlikely(p->migrate_disable_atomic)) {
22386+ tracing_off();
22387+ WARN_ON_ONCE(1);
22388+ }
22389+#endif
22390+
22391+ p->migrate_disable++;
22392+}
22393+EXPORT_SYMBOL(migrate_disable);
22394+
22395+void migrate_enable(void)
22396+{
22397+ struct task_struct *p = current;
22398+
22399+ if (in_atomic() || irqs_disabled()) {
22400+#ifdef CONFIG_SCHED_DEBUG
22401+ p->migrate_disable_atomic--;
22402+#endif
22403+ return;
22404+ }
22405+
22406+#ifdef CONFIG_SCHED_DEBUG
22407+ if (unlikely(p->migrate_disable_atomic)) {
22408+ tracing_off();
22409+ WARN_ON_ONCE(1);
22410+ }
22411+#endif
22412+
22413+ WARN_ON_ONCE(p->migrate_disable <= 0);
22414+ p->migrate_disable--;
22415+}
22416+EXPORT_SYMBOL(migrate_enable);
22417+#endif
22418diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpudeadline.c linux-4.14/kernel/sched/cpudeadline.c
22419--- linux-4.14.orig/kernel/sched/cpudeadline.c 2017-11-12 19:46:13.000000000 +0100
22420+++ linux-4.14/kernel/sched/cpudeadline.c 2018-09-05 11:05:07.000000000 +0200
22421@@ -127,13 +127,13 @@
22422 const struct sched_dl_entity *dl_se = &p->dl;
22423 
22424 if (later_mask &&
22425- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22426+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
22427 return 1;
22428 } else {
22429 int best_cpu = cpudl_maximum(cp);
22430 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
22431
22432- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
22433+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
22434 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22435 if (later_mask)
22436 cpumask_set_cpu(best_cpu, later_mask);
22437diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpupri.c linux-4.14/kernel/sched/cpupri.c
22438--- linux-4.14.orig/kernel/sched/cpupri.c 2017-11-12 19:46:13.000000000 +0100
22439+++ linux-4.14/kernel/sched/cpupri.c 2018-09-05 11:05:07.000000000 +0200
22440@@ -103,11 +103,11 @@
22441 if (skip)
22442 continue;
22443 
22444- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22445+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
22446 continue;
22447 
22448 if (lowest_mask) {
22449- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22450+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
22451
22452 /*
22453 * We have to ensure that we have at least one bit
22454diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/deadline.c linux-4.14/kernel/sched/deadline.c
22455--- linux-4.14.orig/kernel/sched/deadline.c 2018-09-05 11:03:22.000000000 +0200
22456+++ linux-4.14/kernel/sched/deadline.c 2018-09-05 11:05:07.000000000 +0200
22457@@ -504,7 +504,7 @@
22458 * If we cannot preempt any rq, fall back to pick any
22459 * online cpu.
22460 */
22461- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
22462+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
22463 if (cpu >= nr_cpu_ids) {
22464 /*
22465 * Fail to find any suitable cpu.
22466@@ -1020,7 +1020,7 @@
22467 {
22468 struct hrtimer *timer = &dl_se->dl_timer;
22469 
22470- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22471+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22472 timer->function = dl_task_timer;
22473 }
22474
22475@@ -1749,7 +1749,7 @@
22476 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
22477 {
22478 if (!task_running(rq, p) &&
22479- cpumask_test_cpu(cpu, &p->cpus_allowed))
22480+ cpumask_test_cpu(cpu, p->cpus_ptr))
22481 return 1;
22482 return 0;
22483 }
22484@@ -1899,7 +1899,7 @@
22485 /* Retry if something changed. */
22486 if (double_lock_balance(rq, later_rq)) {
22487 if (unlikely(task_rq(task) != rq ||
22488- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
22489+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
22490 task_running(rq, task) ||
22491 !dl_task(task) ||
22492 !task_on_rq_queued(task))) {
22493diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/debug.c linux-4.14/kernel/sched/debug.c
22494--- linux-4.14.orig/kernel/sched/debug.c 2017-11-12 19:46:13.000000000 +0100
22495+++ linux-4.14/kernel/sched/debug.c 2018-09-05 11:05:07.000000000 +0200
22496@@ -1017,6 +1017,10 @@
22497 P(dl.runtime);
22498 P(dl.deadline);
22499 }
22500+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22501+ P(migrate_disable);
22502+#endif
22503+ P(nr_cpus_allowed);
22504 #undef PN_SCHEDSTAT
22505 #undef PN
22506 #undef __PN
22507diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/fair.c linux-4.14/kernel/sched/fair.c
22508--- linux-4.14.orig/kernel/sched/fair.c 2018-09-05 11:03:22.000000000 +0200
22509+++ linux-4.14/kernel/sched/fair.c 2018-09-05 11:05:07.000000000 +0200
22510@@ -1596,7 +1596,7 @@
22511 */
22512 if (cur) {
22513 /* Skip this swap candidate if cannot move to the source cpu */
22514- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
22515+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
22516 goto unlock;
22517 
22518 /*
22519@@ -1706,7 +1706,7 @@
22520 
22521 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
22522 /* Skip this CPU if the source task cannot migrate */
22523- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
22524+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
22525 continue;
22526 
22527 env->dst_cpu = cpu;
22528@@ -3840,7 +3840,7 @@
22529 ideal_runtime = sched_slice(cfs_rq, curr);
22530 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22531 if (delta_exec > ideal_runtime) {
22532- resched_curr(rq_of(cfs_rq));
22533+ resched_curr_lazy(rq_of(cfs_rq));
22534 /*
22535 * The current task ran long enough, ensure it doesn't get
22536 * re-elected due to buddy favours.
22537@@ -3864,7 +3864,7 @@
22538 return;
22539 
22540 if (delta > ideal_runtime)
22541- resched_curr(rq_of(cfs_rq));
22542+ resched_curr_lazy(rq_of(cfs_rq));
22543 }
22544 
22545 static void
22546@@ -4006,7 +4006,7 @@
22547 * validating it and just reschedule.
22548 */
22549 if (queued) {
22550- resched_curr(rq_of(cfs_rq));
22551+ resched_curr_lazy(rq_of(cfs_rq));
22552 return;
22553 }
22554 /*
22555@@ -4188,7 +4188,7 @@
22556 * hierarchy can be throttled
22557 */
22558 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
22559- resched_curr(rq_of(cfs_rq));
22560+ resched_curr_lazy(rq_of(cfs_rq));
22561 }
22562 
22563 static __always_inline
22564@@ -4837,7 +4837,7 @@
22565 
22566 if (delta < 0) {
22567 if (rq->curr == p)
22568- resched_curr(rq);
22569+ resched_curr_lazy(rq);
22570 return;
22571 }
22572 hrtick_start(rq, delta);
22573@@ -5475,7 +5475,7 @@
22574 
22575 /* Skip over this group if it has no CPUs allowed */
22576 if (!cpumask_intersects(sched_group_span(group),
22577- &p->cpus_allowed))
22578+ p->cpus_ptr))
22579 continue;
22580 
22581 local_group = cpumask_test_cpu(this_cpu,
22582@@ -5595,7 +5595,7 @@
22583 return cpumask_first(sched_group_span(group));
22584
22585 /* Traverse only the allowed CPUs */
22586- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
22587+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
22588 if (idle_cpu(i)) {
22589 struct rq *rq = cpu_rq(i);
22590 struct cpuidle_state *idle = idle_get_state(rq);
22591@@ -5698,7 +5698,7 @@
22592 if (!test_idle_cores(target, false))
22593 return -1;
22594
22595- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
22596+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
22597
22598 for_each_cpu_wrap(core, cpus, target) {
22599 bool idle = true;
22600@@ -5732,7 +5732,7 @@
22601 return -1;
22602
22603 for_each_cpu(cpu, cpu_smt_mask(target)) {
22604- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22605+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22606 continue;
22607 if (idle_cpu(cpu))
22608 return cpu;
22609@@ -5795,7 +5795,7 @@
22610 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
22611 if (!--nr)
22612 return -1;
22613- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22614+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22615 continue;
22616 if (idle_cpu(cpu))
22617 break;
22618@@ -5950,7 +5950,7 @@
22619 if (sd_flag & SD_BALANCE_WAKE) {
22620 record_wakee(p);
22621 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
22622- && cpumask_test_cpu(cpu, &p->cpus_allowed);
22623+ && cpumask_test_cpu(cpu, p->cpus_ptr);
22624 }
22625 
22626 rcu_read_lock();
22627@@ -6231,7 +6231,7 @@
22628 return;
22629 
22630 preempt:
22631- resched_curr(rq);
22632+ resched_curr_lazy(rq);
22633 /*
22634 * Only set the backward buddy when the current task is still
22635 * on the rq. This can happen when a wakeup gets interleaved
22636@@ -6699,14 +6699,14 @@
22637 /*
22638 * We do not migrate tasks that are:
22639 * 1) throttled_lb_pair, or
22640- * 2) cannot be migrated to this CPU due to cpus_allowed, or
22641+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
22642 * 3) running (obviously), or
22643 * 4) are cache-hot on their current CPU.
22644 */
22645 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
22646 return 0;
22647 
22648- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
22649+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
22650 int cpu;
22651 
22652 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
22653@@ -6726,7 +6726,7 @@
22654 
22655 /* Prevent to re-select dst_cpu via env's cpus */
22656 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
22657- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
22658+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
22659 env->flags |= LBF_DST_PINNED;
22660 env->new_dst_cpu = cpu;
22661 break;
22662@@ -7295,7 +7295,7 @@
22663 
22664 /*
22665 * Group imbalance indicates (and tries to solve) the problem where balancing
22666- * groups is inadequate due to ->cpus_allowed constraints.
22667+ * groups is inadequate due to ->cpus_ptr constraints.
22668 *
22669 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
22670 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
22671@@ -7871,7 +7871,7 @@
22672 /*
22673 * If the busiest group is imbalanced the below checks don't
22674 * work because they assume all things are equal, which typically
22675- * isn't true due to cpus_allowed constraints and the like.
22676+ * isn't true due to cpus_ptr constraints and the like.
22677 */
22678 if (busiest->group_type == group_imbalanced)
22679 goto force_balance;
22680@@ -8263,7 +8263,7 @@
22681 * if the curr task on busiest cpu can't be
22682 * moved to this_cpu
22683 */
22684- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
22685+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
22686 raw_spin_unlock_irqrestore(&busiest->lock,
22687 flags);
22688 env.flags |= LBF_ALL_PINNED;
22689@@ -9085,7 +9085,7 @@
22690 * 'current' within the tree based on its new key value.
22691 */
22692 swap(curr->vruntime, se->vruntime);
22693- resched_curr(rq);
22694+ resched_curr_lazy(rq);
22695 }
22696
22697 se->vruntime -= cfs_rq->min_vruntime;
22698@@ -9109,7 +9109,7 @@
22699 */
22700 if (rq->curr == p) {
22701 if (p->prio > oldprio)
22702- resched_curr(rq);
22703+ resched_curr_lazy(rq);
22704 } else
22705 check_preempt_curr(rq, p, 0);
22706 }
22707diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/features.h linux-4.14/kernel/sched/features.h
22708--- linux-4.14.orig/kernel/sched/features.h 2017-11-12 19:46:13.000000000 +0100
22709+++ linux-4.14/kernel/sched/features.h 2018-09-05 11:05:07.000000000 +0200
22710@@ -46,11 +46,19 @@
22711 */
22712 SCHED_FEAT(NONTASK_CAPACITY, true)
22713
22714+#ifdef CONFIG_PREEMPT_RT_FULL
22715+SCHED_FEAT(TTWU_QUEUE, false)
22716+# ifdef CONFIG_PREEMPT_LAZY
22717+SCHED_FEAT(PREEMPT_LAZY, true)
22718+# endif
22719+#else
22720+
22721 /*
22722 * Queue remote wakeups on the target CPU and process them
22723 * using the scheduler IPI. Reduces rq->lock contention/bounces.
22724 */
22725 SCHED_FEAT(TTWU_QUEUE, true)
22726+#endif
22727
22728 /*
22729 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
22730diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/Makefile linux-4.14/kernel/sched/Makefile
22731--- linux-4.14.orig/kernel/sched/Makefile 2017-11-12 19:46:13.000000000 +0100
22732+++ linux-4.14/kernel/sched/Makefile 2018-09-05 11:05:07.000000000 +0200
22733@@ -18,7 +18,7 @@
22734
22735 obj-y += core.o loadavg.o clock.o cputime.o
22736 obj-y += idle_task.o fair.o rt.o deadline.o
22737-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22738+obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22739 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22740 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22741 obj-$(CONFIG_SCHEDSTATS) += stats.o
22742diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/rt.c linux-4.14/kernel/sched/rt.c
22743--- linux-4.14.orig/kernel/sched/rt.c 2018-09-05 11:03:22.000000000 +0200
22744+++ linux-4.14/kernel/sched/rt.c 2018-09-05 11:05:07.000000000 +0200
22745@@ -47,8 +47,8 @@
c7c16703 22746
e4b2b4a8 22747 raw_spin_lock_init(&rt_b->rt_runtime_lock);
c7c16703 22748
e4b2b4a8
JK
22749- hrtimer_init(&rt_b->rt_period_timer,
22750- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22751+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
22752+ HRTIMER_MODE_REL_HARD);
22753 rt_b->rt_period_timer.function = sched_rt_period_timer;
22754 }
c7c16703 22755
e4b2b4a8
JK
22756@@ -1594,7 +1594,7 @@
22757 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
22758 {
22759 if (!task_running(rq, p) &&
22760- cpumask_test_cpu(cpu, &p->cpus_allowed))
22761+ cpumask_test_cpu(cpu, p->cpus_ptr))
22762 return 1;
22763 return 0;
c7c16703 22764 }
e4b2b4a8
JK
22765@@ -1729,7 +1729,7 @@
22766 * Also make sure that it wasn't scheduled on its rq.
22767 */
22768 if (unlikely(task_rq(task) != rq ||
22769- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
22770+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
22771 task_running(rq, task) ||
22772 !rt_task(task) ||
22773 !task_on_rq_queued(task))) {
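The rt.c hunk above moves the bandwidth period timer to HRTIMER_MODE_REL_HARD, a mode this series adds so selected timers keep expiring in hard interrupt context instead of being deferred to softirq/thread context on RT. A hedged usage sketch (timer and callback are hypothetical; the _HARD constants come from the hrtimer changes elsewhere in this patch):

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
	/* runs in hard interrupt context even with PREEMPT_RT_FULL */
	return HRTIMER_NORESTART;
}

static void example_timer_setup(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	example_timer.function = example_timer_fn;
	hrtimer_start(&example_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
}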
22774diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/sched.h linux-4.14/kernel/sched/sched.h
22775--- linux-4.14.orig/kernel/sched/sched.h 2018-09-05 11:03:22.000000000 +0200
22776+++ linux-4.14/kernel/sched/sched.h 2018-09-05 11:05:07.000000000 +0200
22777@@ -1354,6 +1354,7 @@
22778 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
22779 #define WF_FORK 0x02 /* child wakeup after fork */
22780 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
22781+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
c7c16703 22782
e4b2b4a8
JK
22783 /*
22784 * To aid in avoiding the subversion of "niceness" due to uneven distribution
22785@@ -1545,6 +1546,15 @@
22786 extern void resched_curr(struct rq *rq);
22787 extern void resched_cpu(int cpu);
22788
22789+#ifdef CONFIG_PREEMPT_LAZY
22790+extern void resched_curr_lazy(struct rq *rq);
22791+#else
22792+static inline void resched_curr_lazy(struct rq *rq)
1a6e0f06 22793+{
e4b2b4a8 22794+ resched_curr(rq);
1a6e0f06 22795+}
1a6e0f06
JK
22796+#endif
22797+
e4b2b4a8
JK
22798 extern struct rt_bandwidth def_rt_bandwidth;
22799 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
22800
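resched_curr_lazy() is what the fair.c hunks earlier in this file call: with CONFIG_PREEMPT_LAZY it requests a deferred reschedule via the lazy need-resched flag added elsewhere in this patch, so a SCHED_OTHER preemption waits for the next preemption point; without it, the inline fallback above simply forwards to resched_curr(). A hedged call-site sketch (function is hypothetical):

static void example_check_preempt(struct rq *rq, struct task_struct *waking)
{
	if (rt_task(waking))
		resched_curr(rq);	/* real-time wakeup: preempt immediately */
	else
		resched_curr_lazy(rq);	/* fair wakeup: honoured at the next preemption point */
}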
22801diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swait.c linux-4.14/kernel/sched/swait.c
22802--- linux-4.14.orig/kernel/sched/swait.c 2017-11-12 19:46:13.000000000 +0100
22803+++ linux-4.14/kernel/sched/swait.c 2018-09-05 11:05:07.000000000 +0200
22804@@ -1,6 +1,7 @@
22805 // SPDX-License-Identifier: GPL-2.0
22806 #include <linux/sched/signal.h>
22807 #include <linux/swait.h>
22808+#include <linux/suspend.h>
22809
22810 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
22811 struct lock_class_key *key)
22812@@ -30,6 +31,25 @@
22813 }
22814 EXPORT_SYMBOL(swake_up_locked);
22815
22816+void swake_up_all_locked(struct swait_queue_head *q)
1a6e0f06 22817+{
e4b2b4a8
JK
22818+ struct swait_queue *curr;
22819+ int wakes = 0;
1a6e0f06 22820+
e4b2b4a8 22821+ while (!list_empty(&q->task_list)) {
1a6e0f06 22822+
e4b2b4a8
JK
22823+ curr = list_first_entry(&q->task_list, typeof(*curr),
22824+ task_list);
22825+ wake_up_process(curr->task);
22826+ list_del_init(&curr->task_list);
22827+ wakes++;
22828+ }
22829+ if (pm_in_action)
22830+ return;
22831+ WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
1a6e0f06 22832+}
e4b2b4a8 22833+EXPORT_SYMBOL(swake_up_all_locked);
1a6e0f06 22834+
e4b2b4a8
JK
22835 void swake_up(struct swait_queue_head *q)
22836 {
22837 unsigned long flags;
22838@@ -49,6 +69,7 @@
22839 struct swait_queue *curr;
22840 LIST_HEAD(tmp);
22841
22842+ WARN_ON(irqs_disabled());
22843 raw_spin_lock_irq(&q->lock);
22844 list_splice_init(&q->task_list, &tmp);
22845 while (!list_empty(&tmp)) {
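swake_up_all_locked() above serves the RT completion rework; the WARN documents that complete_all() callers are expected to have only a couple of waiters, since the wakeups happen with the raw queue lock held. For reference, a minimal simple-waitqueue producer/consumer sketch against the 4.14 API (names illustrative only):

static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
static bool example_done;

static void example_consumer(void)
{
	swait_event_interruptible(example_wq, READ_ONCE(example_done));
}

static void example_producer(void)
{
	WRITE_ONCE(example_done, true);
	swake_up(&example_wq);	/* wakes one waiter; swake_up_all() for all of them */
}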
22846diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swork.c linux-4.14/kernel/sched/swork.c
22847--- linux-4.14.orig/kernel/sched/swork.c 1970-01-01 01:00:00.000000000 +0100
22848+++ linux-4.14/kernel/sched/swork.c 2018-09-05 11:05:07.000000000 +0200
22849@@ -0,0 +1,173 @@
1a6e0f06 22850+/*
e4b2b4a8
JK
22851+ * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
22852+ *
22853+ * Provides a framework for enqueuing callbacks from irq context in a
22854+ * PREEMPT_RT_FULL safe way. The callbacks are executed in kthread context.
1a6e0f06 22855+ */
1a6e0f06 22856+
e4b2b4a8
JK
22857+#include <linux/swait.h>
22858+#include <linux/swork.h>
22859+#include <linux/kthread.h>
22860+#include <linux/slab.h>
22861+#include <linux/spinlock.h>
22862+#include <linux/export.h>
22863+
22864+#define SWORK_EVENT_PENDING (1 << 0)
22865+
22866+static DEFINE_MUTEX(worker_mutex);
22867+static struct sworker *glob_worker;
22868+
22869+struct sworker {
22870+ struct list_head events;
22871+ struct swait_queue_head wq;
1a6e0f06 22872+
e4b2b4a8
JK
22873+ raw_spinlock_t lock;
22874+
22875+ struct task_struct *task;
22876+ int refs;
22877+};
1a6e0f06 22878+
e4b2b4a8 22879+static bool swork_readable(struct sworker *worker)
1a6e0f06 22880+{
e4b2b4a8 22881+ bool r;
1a6e0f06 22882+
e4b2b4a8
JK
22883+ if (kthread_should_stop())
22884+ return true;
22885+
22886+ raw_spin_lock_irq(&worker->lock);
22887+ r = !list_empty(&worker->events);
22888+ raw_spin_unlock_irq(&worker->lock);
22889+
22890+ return r;
1a6e0f06 22891+}
1a6e0f06 22892+
e4b2b4a8 22893+static int swork_kthread(void *arg)
1a6e0f06 22894+{
e4b2b4a8 22895+ struct sworker *worker = arg;
1a6e0f06 22896+
e4b2b4a8
JK
22897+ for (;;) {
22898+ swait_event_interruptible(worker->wq,
22899+ swork_readable(worker));
22900+ if (kthread_should_stop())
22901+ break;
1a6e0f06 22902+
e4b2b4a8
JK
22903+ raw_spin_lock_irq(&worker->lock);
22904+ while (!list_empty(&worker->events)) {
22905+ struct swork_event *sev;
1a6e0f06 22906+
e4b2b4a8
JK
22907+ sev = list_first_entry(&worker->events,
22908+ struct swork_event, item);
22909+ list_del(&sev->item);
22910+ raw_spin_unlock_irq(&worker->lock);
1a6e0f06 22911+
e4b2b4a8
JK
22912+ WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
22913+ &sev->flags));
22914+ sev->func(sev);
22915+ raw_spin_lock_irq(&worker->lock);
22916+ }
22917+ raw_spin_unlock_irq(&worker->lock);
22918+ }
22919+ return 0;
1a6e0f06 22920+}
1a6e0f06 22921+
e4b2b4a8 22922+static struct sworker *swork_create(void)
1a6e0f06 22923+{
e4b2b4a8 22924+ struct sworker *worker;
1a6e0f06 22925+
e4b2b4a8
JK
22926+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
22927+ if (!worker)
22928+ return ERR_PTR(-ENOMEM);
1a6e0f06 22929+
e4b2b4a8
JK
22930+ INIT_LIST_HEAD(&worker->events);
22931+ raw_spin_lock_init(&worker->lock);
22932+ init_swait_queue_head(&worker->wq);
1a6e0f06 22933+
e4b2b4a8
JK
22934+ worker->task = kthread_run(swork_kthread, worker, "kswork");
22935+ if (IS_ERR(worker->task)) {
22936+ kfree(worker);
22937+ return ERR_PTR(-ENOMEM);
1a6e0f06 22938+ }
1a6e0f06 22939+
e4b2b4a8 22940+ return worker;
1a6e0f06 22941+}
1a6e0f06 22942+
e4b2b4a8 22943+static void swork_destroy(struct sworker *worker)
1a6e0f06 22944+{
e4b2b4a8
JK
22945+ kthread_stop(worker->task);
22946+
22947+ WARN_ON(!list_empty(&worker->events));
22948+ kfree(worker);
1a6e0f06 22949+}
1a6e0f06 22950+
e4b2b4a8
JK
22951+/**
22952+ * swork_queue - queue swork
22953+ *
22954+ * Returns %false if @sev was already on a queue, %true otherwise.
22955+ *
22956+ * The work is queued and processed on a random CPU
22957+ */
22958+bool swork_queue(struct swork_event *sev)
1a6e0f06 22959+{
e4b2b4a8 22960+ unsigned long flags;
1a6e0f06 22961+
e4b2b4a8
JK
22962+ if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
22963+ return false;
1a6e0f06 22964+
e4b2b4a8
JK
22965+ raw_spin_lock_irqsave(&glob_worker->lock, flags);
22966+ list_add_tail(&sev->item, &glob_worker->events);
22967+ raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
1a6e0f06 22968+
e4b2b4a8
JK
22969+ swake_up(&glob_worker->wq);
22970+ return true;
1a6e0f06 22971+}
e4b2b4a8 22972+EXPORT_SYMBOL_GPL(swork_queue);
1a6e0f06 22973+
e4b2b4a8
JK
22974+/**
22975+ * swork_get - get an instance of the sworker
22976+ *
22977+ * Returns a negative error code if the initialization of the worker did not
22978+ * work, %0 otherwise.
22979+ *
22980+ */
22981+int swork_get(void)
1a6e0f06 22982+{
e4b2b4a8 22983+ struct sworker *worker;
1a6e0f06 22984+
e4b2b4a8
JK
22985+ mutex_lock(&worker_mutex);
22986+ if (!glob_worker) {
22987+ worker = swork_create();
22988+ if (IS_ERR(worker)) {
22989+ mutex_unlock(&worker_mutex);
22990+ return -ENOMEM;
22991+ }
1a6e0f06 22992+
e4b2b4a8
JK
22993+ glob_worker = worker;
22994+ }
1a6e0f06 22995+
e4b2b4a8
JK
22996+ glob_worker->refs++;
22997+ mutex_unlock(&worker_mutex);
1a6e0f06 22998+
e4b2b4a8 22999+ return 0;
1a6e0f06 23000+}
e4b2b4a8 23001+EXPORT_SYMBOL_GPL(swork_get);
1a6e0f06 23002+
e4b2b4a8
JK
23003+/**
23004+ * swork_put - puts an instance of the sworker
23005+ *
23006+ * Will destroy the sworker thread. This function must not be called until all
23007+ * queued events have been completed.
1a6e0f06 23008+ */
e4b2b4a8 23009+void swork_put(void)
1a6e0f06 23010+{
e4b2b4a8 23011+ mutex_lock(&worker_mutex);
1a6e0f06 23012+
e4b2b4a8
JK
23013+ glob_worker->refs--;
23014+ if (glob_worker->refs > 0)
23015+ goto out;
1a6e0f06 23016+
e4b2b4a8
JK
23017+ swork_destroy(glob_worker);
23018+ glob_worker = NULL;
23019+out:
23020+ mutex_unlock(&worker_mutex);
1a6e0f06 23021+}
e4b2b4a8
JK
23022+EXPORT_SYMBOL_GPL(swork_put);
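swork is the small "simple work" framework this patch adds for deferring callbacks from hard irq context into the global kswork kthread, where they may sleep. A minimal usage sketch, assuming the struct swork_event and INIT_SWORK() declarations from include/linux/swork.h added elsewhere in this patch (driver names hypothetical):

static struct swork_event example_event;

static void example_callback(struct swork_event *sev)
{
	/* runs in kswork kthread context and may sleep */
}

static int example_setup(void)
{
	int err = swork_get();		/* creates or references the kswork thread */

	if (err)
		return err;
	INIT_SWORK(&example_event, example_callback);
	return 0;
}

static irqreturn_t example_irq_handler(int irq, void *dev_id)
{
	swork_queue(&example_event);	/* safe from irq context; no-op if already queued */
	return IRQ_HANDLED;
}

example_setup() pairs with a swork_put() on teardown once all queued events have completed, as the kernel-doc above notes.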
23023diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/topology.c linux-4.14/kernel/sched/topology.c
23024--- linux-4.14.orig/kernel/sched/topology.c 2018-09-05 11:03:22.000000000 +0200
23025+++ linux-4.14/kernel/sched/topology.c 2018-09-05 11:05:07.000000000 +0200
23026@@ -286,6 +286,7 @@
23027 rd->rto_cpu = -1;
23028 raw_spin_lock_init(&rd->rto_lock);
23029 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23030+ rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23031 #endif
23032
23033 init_dl_bw(&rd->dl_bw);
23034diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/signal.c linux-4.14/kernel/signal.c
23035--- linux-4.14.orig/kernel/signal.c 2018-09-05 11:03:22.000000000 +0200
23036+++ linux-4.14/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
23037@@ -19,6 +19,7 @@
23038 #include <linux/sched/task.h>
23039 #include <linux/sched/task_stack.h>
23040 #include <linux/sched/cputime.h>
23041+#include <linux/sched/rt.h>
23042 #include <linux/fs.h>
23043 #include <linux/tty.h>
23044 #include <linux/binfmts.h>
23045@@ -360,13 +361,30 @@
23046 return false;
23047 }
23048
23049+static inline struct sigqueue *get_task_cache(struct task_struct *t)
1a6e0f06 23050+{
e4b2b4a8 23051+ struct sigqueue *q = t->sigqueue_cache;
1a6e0f06 23052+
e4b2b4a8
JK
23053+ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23054+ return NULL;
23055+ return q;
1a6e0f06 23056+}
1a6e0f06 23057+
e4b2b4a8 23058+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
1a6e0f06 23059+{
e4b2b4a8
JK
23060+ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23061+ return 0;
23062+ return 1;
1a6e0f06 23063+}
1a6e0f06 23064+
e4b2b4a8
JK
23065 /*
23066 * allocate a new signal queue record
23067 * - this may be called without locks if and only if t == current, otherwise an
23068 * appropriate lock must be held to stop the target task from exiting
23069 */
23070 static struct sigqueue *
23071-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23072+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23073+ int override_rlimit, int fromslab)
23074 {
23075 struct sigqueue *q = NULL;
23076 struct user_struct *user;
23077@@ -383,7 +401,10 @@
23078 if (override_rlimit ||
23079 atomic_read(&user->sigpending) <=
23080 task_rlimit(t, RLIMIT_SIGPENDING)) {
23081- q = kmem_cache_alloc(sigqueue_cachep, flags);
23082+ if (!fromslab)
23083+ q = get_task_cache(t);
23084+ if (!q)
23085+ q = kmem_cache_alloc(sigqueue_cachep, flags);
23086 } else {
23087 print_dropped_signal(sig);
23088 }
23089@@ -400,6 +421,13 @@
23090 return q;
23091 }
23092
23093+static struct sigqueue *
23094+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23095+ int override_rlimit)
1a6e0f06 23096+{
e4b2b4a8 23097+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
1a6e0f06 23098+}
1a6e0f06 23099+
e4b2b4a8
JK
23100 static void __sigqueue_free(struct sigqueue *q)
23101 {
23102 if (q->flags & SIGQUEUE_PREALLOC)
23103@@ -409,6 +437,21 @@
23104 kmem_cache_free(sigqueue_cachep, q);
23105 }
23106
23107+static void sigqueue_free_current(struct sigqueue *q)
1a6e0f06 23108+{
e4b2b4a8
JK
23109+ struct user_struct *up;
23110+
23111+ if (q->flags & SIGQUEUE_PREALLOC)
23112+ return;
23113+
23114+ up = q->user;
23115+ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23116+ atomic_dec(&up->sigpending);
23117+ free_uid(up);
23118+ } else
23119+ __sigqueue_free(q);
1a6e0f06 23120+}
1a6e0f06 23121+
e4b2b4a8
JK
23122 void flush_sigqueue(struct sigpending *queue)
23123 {
23124 struct sigqueue *q;
23125@@ -422,6 +465,21 @@
23126 }
23127
23128 /*
23129+ * Called from __exit_signal. Flush tsk->pending and
23130+ * tsk->sigqueue_cache
23131+ */
23132+void flush_task_sigqueue(struct task_struct *tsk)
1a6e0f06 23133+{
e4b2b4a8 23134+ struct sigqueue *q;
1a6e0f06 23135+
e4b2b4a8 23136+ flush_sigqueue(&tsk->pending);
1a6e0f06 23137+
e4b2b4a8
JK
23138+ q = get_task_cache(tsk);
23139+ if (q)
23140+ kmem_cache_free(sigqueue_cachep, q);
1a6e0f06
JK
23141+}
23142+
e4b2b4a8
JK
23143+/*
23144 * Flush all pending signals for this kthread.
23145 */
23146 void flush_signals(struct task_struct *t)
23147@@ -542,7 +600,7 @@
23148 (info->si_code == SI_TIMER) &&
23149 (info->si_sys_private);
23150
23151- __sigqueue_free(first);
23152+ sigqueue_free_current(first);
23153 } else {
23154 /*
23155 * Ok, it wasn't in the queue. This must be
23156@@ -578,6 +636,8 @@
23157 bool resched_timer = false;
23158 int signr;
23159
23160+ WARN_ON_ONCE(tsk != current);
23161+
23162 /* We only dequeue private signals from ourselves, we don't let
23163 * signalfd steal them
23164 */
23165@@ -1177,8 +1237,8 @@
23166 * We don't want to have recursive SIGSEGV's etc, for example,
23167 * that is why we also clear SIGNAL_UNKILLABLE.
23168 */
23169-int
23170-force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23171+static int
23172+do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23173 {
23174 unsigned long int flags;
23175 int ret, blocked, ignored;
23176@@ -1207,6 +1267,39 @@
23177 return ret;
23178 }
23179
23180+int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1a6e0f06 23181+{
e4b2b4a8
JK
23182+/*
23183+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23184+ * since it cannot enable preemption, and the signal code's spin_locks
23185+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23186+ * send the signal on exit of the trap.
23187+ */
23188+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23189+ if (in_atomic()) {
23190+ if (WARN_ON_ONCE(t != current))
23191+ return 0;
23192+ if (WARN_ON_ONCE(t->forced_info.si_signo))
23193+ return 0;
1a6e0f06 23194+
e4b2b4a8
JK
23195+ if (is_si_special(info)) {
23196+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
23197+ t->forced_info.si_signo = sig;
23198+ t->forced_info.si_errno = 0;
23199+ t->forced_info.si_code = SI_KERNEL;
23200+ t->forced_info.si_pid = 0;
23201+ t->forced_info.si_uid = 0;
23202+ } else {
23203+ t->forced_info = *info;
23204+ }
1a6e0f06 23205+
e4b2b4a8
JK
23206+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23207+ return 0;
23208+ }
23209+#endif
23210+ return do_force_sig_info(sig, info, t);
1a6e0f06 23211+}
1a6e0f06 23212+
e4b2b4a8
JK
23213 /*
23214 * Nuke all other threads in the group.
23215 */
23216@@ -1241,12 +1334,12 @@
23217 * Disable interrupts early to avoid deadlocks.
23218 * See rcu_read_unlock() comment header for details.
23219 */
23220- local_irq_save(*flags);
23221+ local_irq_save_nort(*flags);
23222 rcu_read_lock();
23223 sighand = rcu_dereference(tsk->sighand);
23224 if (unlikely(sighand == NULL)) {
23225 rcu_read_unlock();
23226- local_irq_restore(*flags);
23227+ local_irq_restore_nort(*flags);
23228 break;
23229 }
23230 /*
23231@@ -1267,7 +1360,7 @@
23232 }
23233 spin_unlock(&sighand->siglock);
23234 rcu_read_unlock();
23235- local_irq_restore(*flags);
23236+ local_irq_restore_nort(*flags);
23237 }
23238
23239 return sighand;
23240@@ -1514,7 +1607,8 @@
23241 */
23242 struct sigqueue *sigqueue_alloc(void)
23243 {
23244- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23245+ /* Preallocated sigqueue objects always come from the slabcache! */
23246+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23247
23248 if (q)
23249 q->flags |= SIGQUEUE_PREALLOC;
23250@@ -1888,15 +1982,7 @@
23251 if (gstop_done && ptrace_reparented(current))
23252 do_notify_parent_cldstop(current, false, why);
23253
23254- /*
23255- * Don't want to allow preemption here, because
23256- * sys_ptrace() needs this task to be inactive.
23257- *
23258- * XXX: implement read_unlock_no_resched().
23259- */
23260- preempt_disable();
23261 read_unlock(&tasklist_lock);
23262- preempt_enable_no_resched();
23263 freezable_schedule();
23264 } else {
23265 /*
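The signal.c changes give each task a one-entry sigqueue cache: an RT-priority task that dequeues a signal parks the freed sigqueue in t->sigqueue_cache via put_task_cache(), and the next __sigqueue_do_alloc() reclaims it through get_task_cache() before falling back to the slab, keeping the hot RT signal path away from the allocator. The lock-free single-slot pattern used by those helpers, reduced to its essentials (illustrative sketch only, not kernel API):

static inline void *slot_take(void **slot)
{
	void *obj = *slot;

	/* claim the cached object only if nobody raced us for it */
	return cmpxchg(slot, obj, NULL) == obj ? obj : NULL;
}

static inline bool slot_put(void **slot, void *obj)
{
	/* refill only an empty slot; otherwise the caller frees normally */
	return cmpxchg(slot, NULL, obj) == NULL;
}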
23266diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/softirq.c linux-4.14/kernel/softirq.c
23267--- linux-4.14.orig/kernel/softirq.c 2018-09-05 11:03:22.000000000 +0200
23268+++ linux-4.14/kernel/softirq.c 2018-09-05 11:05:07.000000000 +0200
23269@@ -21,11 +21,14 @@
23270 #include <linux/freezer.h>
23271 #include <linux/kthread.h>
23272 #include <linux/rcupdate.h>
23273+#include <linux/delay.h>
23274 #include <linux/ftrace.h>
23275 #include <linux/smp.h>
23276 #include <linux/smpboot.h>
23277 #include <linux/tick.h>
23278+#include <linux/locallock.h>
23279 #include <linux/irq.h>
23280+#include <linux/sched/types.h>
23281
23282 #define CREATE_TRACE_POINTS
23283 #include <trace/events/irq.h>
23284@@ -56,12 +59,108 @@
23285 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23286
23287 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23288+#ifdef CONFIG_PREEMPT_RT_FULL
23289+#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23290+DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23291+#endif
23292
23293 const char * const softirq_to_name[NR_SOFTIRQS] = {
23294 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23295 "TASKLET", "SCHED", "HRTIMER", "RCU"
23296 };
23297
23298+#ifdef CONFIG_NO_HZ_COMMON
23299+# ifdef CONFIG_PREEMPT_RT_FULL
1a6e0f06 23300+
e4b2b4a8
JK
23301+struct softirq_runner {
23302+ struct task_struct *runner[NR_SOFTIRQS];
23303+};
1a6e0f06 23304+
e4b2b4a8 23305+static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
1a6e0f06 23306+
e4b2b4a8 23307+static inline void softirq_set_runner(unsigned int sirq)
1a6e0f06 23308+{
e4b2b4a8 23309+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
1a6e0f06 23310+
e4b2b4a8 23311+ sr->runner[sirq] = current;
1a6e0f06 23312+}
1a6e0f06 23313+
e4b2b4a8 23314+static inline void softirq_clr_runner(unsigned int sirq)
1a6e0f06 23315+{
e4b2b4a8
JK
23316+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23317+
23318+ sr->runner[sirq] = NULL;
1a6e0f06 23319+}
1a6e0f06 23320+
e4b2b4a8
JK
23321+/*
23322+ * On preempt-rt a softirq running context might be blocked on a
23323+ * lock. There might be no other runnable task on this CPU because the
23324+ * lock owner runs on some other CPU. So we have to go into idle with
23325+ * the pending bit set. Therefore we need to check this, otherwise we
23326+ * warn about false positives, which confuses users and defeats the
23327+ * whole purpose of this test.
1a6e0f06 23328+ *
e4b2b4a8 23329+ * This code is called with interrupts disabled.
1a6e0f06 23330+ */
e4b2b4a8 23331+void softirq_check_pending_idle(void)
1a6e0f06 23332+{
e4b2b4a8
JK
23333+ static int rate_limit;
23334+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23335+ u32 warnpending;
23336+ int i;
23337+
23338+ if (rate_limit >= 10)
23339+ return;
23340+
23341+ warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23342+ for (i = 0; i < NR_SOFTIRQS; i++) {
23343+ struct task_struct *tsk = sr->runner[i];
23344+
23345+ /*
23346+ * The wakeup code in rtmutex.c wakes up the task
23347+ * _before_ it sets pi_blocked_on to NULL under
23348+ * tsk->pi_lock. So we need to check for both: state
23349+ * and pi_blocked_on.
23350+ */
23351+ if (tsk) {
23352+ raw_spin_lock(&tsk->pi_lock);
23353+ if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23354+ /* Clear all bits pending in that task */
23355+ warnpending &= ~(tsk->softirqs_raised);
23356+ warnpending &= ~(1 << i);
23357+ }
23358+ raw_spin_unlock(&tsk->pi_lock);
23359+ }
1a6e0f06 23360+ }
e4b2b4a8
JK
23361+
23362+ if (warnpending) {
23363+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23364+ warnpending);
23365+ rate_limit++;
23366+ }
23367+}
23368+# else
23369+/*
23370+ * On !PREEMPT_RT we just printk rate limited:
23371+ */
23372+void softirq_check_pending_idle(void)
1a6e0f06 23373+{
e4b2b4a8
JK
23374+ static int rate_limit;
23375+
23376+ if (rate_limit < 10 &&
23377+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23378+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23379+ local_softirq_pending());
23380+ rate_limit++;
23381+ }
1a6e0f06 23382+}
e4b2b4a8
JK
23383+# endif
23384+
23385+#else /* !CONFIG_NO_HZ_COMMON */
23386+static inline void softirq_set_runner(unsigned int sirq) { }
23387+static inline void softirq_clr_runner(unsigned int sirq) { }
23388+#endif
1a6e0f06
JK
23389+
23390 /*
e4b2b4a8
JK
23391 * we cannot loop indefinitely here to avoid userspace starvation,
23392 * but we also don't want to introduce a worst case 1/HZ latency
23393@@ -77,6 +176,38 @@
23394 wake_up_process(tsk);
1a6e0f06
JK
23395 }
23396
e4b2b4a8
JK
23397+#ifdef CONFIG_PREEMPT_RT_FULL
23398+static void wakeup_timer_softirqd(void)
1a6e0f06 23399+{
e4b2b4a8
JK
23400+ /* Interrupts are disabled: no need to stop preemption */
23401+ struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23402+
23403+ if (tsk && tsk->state != TASK_RUNNING)
23404+ wake_up_process(tsk);
1a6e0f06 23405+}
e4b2b4a8 23406+#endif
1a6e0f06 23407+
e4b2b4a8
JK
23408+static void handle_softirq(unsigned int vec_nr)
23409+{
23410+ struct softirq_action *h = softirq_vec + vec_nr;
23411+ int prev_count;
1a6e0f06 23412+
e4b2b4a8 23413+ prev_count = preempt_count();
1a6e0f06 23414+
e4b2b4a8 23415+ kstat_incr_softirqs_this_cpu(vec_nr);
1a6e0f06 23416+
e4b2b4a8
JK
23417+ trace_softirq_entry(vec_nr);
23418+ h->action(h);
23419+ trace_softirq_exit(vec_nr);
23420+ if (unlikely(prev_count != preempt_count())) {
23421+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23422+ vec_nr, softirq_to_name[vec_nr], h->action,
23423+ prev_count, preempt_count());
23424+ preempt_count_set(prev_count);
23425+ }
1a6e0f06
JK
23426+}
23427+
e4b2b4a8 23428+#ifndef CONFIG_PREEMPT_RT_FULL
1a6e0f06 23429 /*
e4b2b4a8
JK
23430 * If ksoftirqd is scheduled, we do not want to process pending softirqs
23431 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
23432@@ -92,6 +223,47 @@
23433 return tsk && (tsk->state == TASK_RUNNING);
1a6e0f06
JK
23434 }
23435
e4b2b4a8 23436+static inline int ksoftirqd_softirq_pending(void)
1a6e0f06 23437+{
e4b2b4a8 23438+ return local_softirq_pending();
1a6e0f06
JK
23439+}
23440+
e4b2b4a8 23441+static void handle_pending_softirqs(u32 pending)
1a6e0f06 23442+{
e4b2b4a8
JK
23443+ struct softirq_action *h = softirq_vec;
23444+ int softirq_bit;
1a6e0f06 23445+
e4b2b4a8
JK
23446+ local_irq_enable();
23447+
23448+ h = softirq_vec;
23449+
23450+ while ((softirq_bit = ffs(pending))) {
23451+ unsigned int vec_nr;
23452+
23453+ h += softirq_bit - 1;
23454+ vec_nr = h - softirq_vec;
23455+ handle_softirq(vec_nr);
23456+
23457+ h++;
23458+ pending >>= softirq_bit;
1a6e0f06 23459+ }
e4b2b4a8
JK
23460+
23461+ rcu_bh_qs();
23462+ local_irq_disable();
1a6e0f06 23463+}
e4b2b4a8
JK
23464+
23465+static void run_ksoftirqd(unsigned int cpu)
1a6e0f06 23466+{
e4b2b4a8
JK
23467+ local_irq_disable();
23468+ if (ksoftirqd_softirq_pending()) {
23469+ __do_softirq();
23470+ local_irq_enable();
23471+ cond_resched_rcu_qs();
23472+ return;
23473+ }
23474+ local_irq_enable();
1a6e0f06 23475+}
1a6e0f06 23476+
e4b2b4a8
JK
23477 /*
23478 * preempt_count and SOFTIRQ_OFFSET usage:
23479 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
23480@@ -247,10 +419,8 @@
23481 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
23482 unsigned long old_flags = current->flags;
23483 int max_restart = MAX_SOFTIRQ_RESTART;
23484- struct softirq_action *h;
23485 bool in_hardirq;
23486 __u32 pending;
23487- int softirq_bit;
23488
23489 /*
23490 * Mask out PF_MEMALLOC s current task context is borrowed for the
23491@@ -269,36 +439,7 @@
23492 /* Reset the pending bitmask before enabling irqs */
23493 set_softirq_pending(0);
23494
23495- local_irq_enable();
23496-
23497- h = softirq_vec;
23498-
23499- while ((softirq_bit = ffs(pending))) {
23500- unsigned int vec_nr;
23501- int prev_count;
23502-
23503- h += softirq_bit - 1;
23504-
23505- vec_nr = h - softirq_vec;
23506- prev_count = preempt_count();
23507-
23508- kstat_incr_softirqs_this_cpu(vec_nr);
23509-
23510- trace_softirq_entry(vec_nr);
23511- h->action(h);
23512- trace_softirq_exit(vec_nr);
23513- if (unlikely(prev_count != preempt_count())) {
23514- pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23515- vec_nr, softirq_to_name[vec_nr], h->action,
23516- prev_count, preempt_count());
23517- preempt_count_set(prev_count);
23518- }
23519- h++;
23520- pending >>= softirq_bit;
23521- }
23522-
23523- rcu_bh_qs();
23524- local_irq_disable();
23525+ handle_pending_softirqs(pending);
23526
23527 pending = local_softirq_pending();
23528 if (pending) {
23529@@ -335,6 +476,309 @@
23530 }
23531
23532 /*
23533+ * This function must run with irqs disabled!
1a6e0f06 23534+ */
e4b2b4a8 23535+void raise_softirq_irqoff(unsigned int nr)
1a6e0f06 23536+{
e4b2b4a8 23537+ __raise_softirq_irqoff(nr);
1a6e0f06
JK
23538+
23539+ /*
e4b2b4a8
JK
23540+ * If we're in an interrupt or softirq, we're done
23541+ * (this also catches softirq-disabled code). We will
23542+ * actually run the softirq once we return from
23543+ * the irq or softirq.
23544+ *
23545+ * Otherwise we wake up ksoftirqd to make sure we
23546+ * schedule the softirq soon.
1a6e0f06 23547+ */
e4b2b4a8
JK
23548+ if (!in_interrupt())
23549+ wakeup_softirqd();
23550+}
1a6e0f06 23551+
e4b2b4a8
JK
23552+void __raise_softirq_irqoff(unsigned int nr)
23553+{
23554+ trace_softirq_raise(nr);
23555+ or_softirq_pending(1UL << nr);
23556+}
1a6e0f06 23557+
e4b2b4a8
JK
23558+static inline void local_bh_disable_nort(void) { local_bh_disable(); }
23559+static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
23560+static void ksoftirqd_set_sched_params(unsigned int cpu) { }
1a6e0f06 23561+
e4b2b4a8 23562+#else /* !PREEMPT_RT_FULL */
1a6e0f06 23563+
e4b2b4a8
JK
23564+/*
23565+ * On RT we serialize softirq execution with a cpu local lock per softirq
23566+ */
23567+static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
1a6e0f06 23568+
e4b2b4a8
JK
23569+void __init softirq_early_init(void)
23570+{
23571+ int i;
1a6e0f06 23572+
e4b2b4a8
JK
23573+ for (i = 0; i < NR_SOFTIRQS; i++)
23574+ local_irq_lock_init(local_softirq_locks[i]);
23575+}
1a6e0f06 23576+
e4b2b4a8
JK
23577+static void lock_softirq(int which)
23578+{
23579+ local_lock(local_softirq_locks[which]);
23580+}
1a6e0f06 23581+
e4b2b4a8
JK
23582+static void unlock_softirq(int which)
23583+{
23584+ local_unlock(local_softirq_locks[which]);
23585+}
1a6e0f06 23586+
e4b2b4a8
JK
23587+static void do_single_softirq(int which)
23588+{
23589+ unsigned long old_flags = current->flags;
1a6e0f06 23590+
e4b2b4a8
JK
23591+ current->flags &= ~PF_MEMALLOC;
23592+ vtime_account_irq_enter(current);
23593+ current->flags |= PF_IN_SOFTIRQ;
23594+ lockdep_softirq_enter();
23595+ local_irq_enable();
23596+ handle_softirq(which);
23597+ local_irq_disable();
23598+ lockdep_softirq_exit();
23599+ current->flags &= ~PF_IN_SOFTIRQ;
23600+ vtime_account_irq_enter(current);
23601+ current_restore_flags(old_flags, PF_MEMALLOC);
1a6e0f06
JK
23602+}
23603+
1a6e0f06 23604+/*
e4b2b4a8
JK
23605+ * Called with interrupts disabled. Process softirqs which were raised
23606+ * in current context (or on behalf of ksoftirqd).
1a6e0f06 23607+ */
e4b2b4a8 23608+static void do_current_softirqs(void)
1a6e0f06 23609+{
e4b2b4a8
JK
23610+ while (current->softirqs_raised) {
23611+ int i = __ffs(current->softirqs_raised);
23612+ unsigned int pending, mask = (1U << i);
1a6e0f06 23613+
e4b2b4a8
JK
23614+ current->softirqs_raised &= ~mask;
23615+ local_irq_enable();
1a6e0f06 23616+
e4b2b4a8
JK
23617+ /*
23618+ * If the lock is contended, we boost the owner to
23619+ * process the softirq or leave the critical section
23620+ * now.
23621+ */
23622+ lock_softirq(i);
23623+ local_irq_disable();
23624+ softirq_set_runner(i);
23625+ /*
23626+ * Check with the local_softirq_pending() bits,
23627+ * whether we need to process this still or if someone
23628+ * else took care of it.
23629+ */
23630+ pending = local_softirq_pending();
23631+ if (pending & mask) {
23632+ set_softirq_pending(pending & ~mask);
23633+ do_single_softirq(i);
23634+ }
23635+ softirq_clr_runner(i);
23636+ WARN_ON(current->softirq_nestcnt != 1);
23637+ local_irq_enable();
23638+ unlock_softirq(i);
23639+ local_irq_disable();
1a6e0f06 23640+ }
1a6e0f06
JK
23641+}
23642+
e4b2b4a8 23643+void __local_bh_disable(void)
1a6e0f06 23644+{
e4b2b4a8
JK
23645+ if (++current->softirq_nestcnt == 1)
23646+ migrate_disable();
23647+}
23648+EXPORT_SYMBOL(__local_bh_disable);
1a6e0f06 23649+
e4b2b4a8
JK
23650+void __local_bh_enable(void)
23651+{
23652+ if (WARN_ON(current->softirq_nestcnt == 0))
23653+ return;
1a6e0f06 23654+
e4b2b4a8
JK
23655+ local_irq_disable();
23656+ if (current->softirq_nestcnt == 1 && current->softirqs_raised)
23657+ do_current_softirqs();
23658+ local_irq_enable();
1a6e0f06 23659+
e4b2b4a8
JK
23660+ if (--current->softirq_nestcnt == 0)
23661+ migrate_enable();
1a6e0f06 23662+}
e4b2b4a8 23663+EXPORT_SYMBOL(__local_bh_enable);
1a6e0f06 23664+
e4b2b4a8 23665+void _local_bh_enable(void)
1a6e0f06 23666+{
e4b2b4a8
JK
23667+ if (WARN_ON(current->softirq_nestcnt == 0))
23668+ return;
23669+ if (--current->softirq_nestcnt == 0)
23670+ migrate_enable();
1a6e0f06 23671+}
e4b2b4a8 23672+EXPORT_SYMBOL(_local_bh_enable);
1a6e0f06 23673+
e4b2b4a8 23674+int in_serving_softirq(void)
1a6e0f06 23675+{
e4b2b4a8 23676+ return current->flags & PF_IN_SOFTIRQ;
1a6e0f06 23677+}
e4b2b4a8 23678+EXPORT_SYMBOL(in_serving_softirq);
1a6e0f06 23679+
e4b2b4a8
JK
23680+/* Called with preemption disabled */
23681+static void run_ksoftirqd(unsigned int cpu)
1a6e0f06 23682+{
e4b2b4a8
JK
23683+ local_irq_disable();
23684+ current->softirq_nestcnt++;
23685+
23686+ do_current_softirqs();
23687+ current->softirq_nestcnt--;
23688+ local_irq_enable();
23689+ cond_resched_rcu_qs();
1a6e0f06 23690+}
1a6e0f06 23691+
e4b2b4a8
JK
23692+/*
23693+ * Called from netif_rx_ni(). Preemption enabled, but migration
23694+ * disabled. So the cpu can't go away under us.
23695+ */
23696+void thread_do_softirq(void)
1a6e0f06 23697+{
e4b2b4a8
JK
23698+ if (!in_serving_softirq() && current->softirqs_raised) {
23699+ current->softirq_nestcnt++;
23700+ do_current_softirqs();
23701+ current->softirq_nestcnt--;
23702+ }
1a6e0f06 23703+}
1a6e0f06 23704+
e4b2b4a8 23705+static void do_raise_softirq_irqoff(unsigned int nr)
1a6e0f06 23706+{
e4b2b4a8
JK
23707+ unsigned int mask;
23708+
23709+ mask = 1UL << nr;
23710+
23711+ trace_softirq_raise(nr);
23712+ or_softirq_pending(mask);
23713+
23714+ /*
23715+ * If we are not in a hard interrupt and inside a bh disabled
23716+ * region, we simply raise the flag on current. local_bh_enable()
23717+ * will make sure that the softirq is executed. Otherwise we
23718+ * delegate it to ksoftirqd.
23719+ */
23720+ if (!in_irq() && current->softirq_nestcnt)
23721+ current->softirqs_raised |= mask;
23722+ else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
23723+ return;
23724+
23725+ if (mask & TIMER_SOFTIRQS)
23726+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23727+ else
23728+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
1a6e0f06 23729+}
1a6e0f06 23730+
e4b2b4a8 23731+static void wakeup_proper_softirq(unsigned int nr)
1a6e0f06 23732+{
e4b2b4a8
JK
23733+ if ((1UL << nr) & TIMER_SOFTIRQS)
23734+ wakeup_timer_softirqd();
23735+ else
23736+ wakeup_softirqd();
1a6e0f06 23737+}
1a6e0f06 23738+
e4b2b4a8 23739+void __raise_softirq_irqoff(unsigned int nr)
1a6e0f06 23740+{
e4b2b4a8
JK
23741+ do_raise_softirq_irqoff(nr);
23742+ if (!in_irq() && !current->softirq_nestcnt)
23743+ wakeup_proper_softirq(nr);
1a6e0f06 23744+}
1a6e0f06 23745+
e4b2b4a8
JK
23746+/*
23747+ * Same as __raise_softirq_irqoff() but will process it in ksoftirqd
23748+ */
23749+void __raise_softirq_irqoff_ksoft(unsigned int nr)
1a6e0f06 23750+{
e4b2b4a8 23751+ unsigned int mask;
1a6e0f06 23752+
e4b2b4a8
JK
23753+ if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
23754+ !__this_cpu_read(ktimer_softirqd)))
23755+ return;
23756+ mask = 1UL << nr;
1a6e0f06 23757+
e4b2b4a8
JK
23758+ trace_softirq_raise(nr);
23759+ or_softirq_pending(mask);
23760+ if (mask & TIMER_SOFTIRQS)
23761+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23762+ else
23763+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
23764+ wakeup_proper_softirq(nr);
1a6e0f06 23765+}
1a6e0f06
JK
23766+
23767+/*
e4b2b4a8 23768+ * This function must run with irqs disabled!
1a6e0f06 23769+ */
e4b2b4a8 23770+void raise_softirq_irqoff(unsigned int nr)
1a6e0f06 23771+{
e4b2b4a8
JK
23772+ do_raise_softirq_irqoff(nr);
23773+
23774+ /*
23775+ * If we're in a hard interrupt we let the irq return code deal
23776+ * with the wakeup of ksoftirqd.
23777+ */
23778+ if (in_irq())
23779+ return;
23780+ /*
23781+ * If we are in thread context but outside of a bh disabled
23782+ * region, we need to wake ksoftirqd as well.
23783+ *
23784+ * CHECKME: Some of the places which do that could be wrapped
23785+ * into local_bh_disable/enable pairs. Though it's unclear
23786+ * whether this is worth the effort. To find those places just
23787+ * raise a WARN() if the condition is met.
23788+ */
23789+ if (!current->softirq_nestcnt)
23790+ wakeup_proper_softirq(nr);
1a6e0f06 23791+}
1a6e0f06 23792+
e4b2b4a8 23793+static inline int ksoftirqd_softirq_pending(void)
1a6e0f06 23794+{
e4b2b4a8
JK
23795+ return current->softirqs_raised;
23796+}
1a6e0f06 23797+
e4b2b4a8
JK
23798+static inline void local_bh_disable_nort(void) { }
23799+static inline void _local_bh_enable_nort(void) { }
23800+
23801+static inline void ksoftirqd_set_sched_params(unsigned int cpu)
23802+{
23803+ /* Take over all but timer pending softirqs when starting */
23804+ local_irq_disable();
23805+ current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
23806+ local_irq_enable();
1a6e0f06 23807+}
1a6e0f06 23808+
e4b2b4a8 23809+static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
1a6e0f06 23810+{
e4b2b4a8 23811+ struct sched_param param = { .sched_priority = 1 };
1a6e0f06 23812+
e4b2b4a8
JK
23813+ sched_setscheduler(current, SCHED_FIFO, &param);
23814+
23815+ /* Take over timer pending softirqs when starting */
23816+ local_irq_disable();
23817+ current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
23818+ local_irq_enable();
1a6e0f06 23819+}
1a6e0f06 23820+
e4b2b4a8
JK
23821+static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
23822+ bool online)
1a6e0f06 23823+{
e4b2b4a8 23824+ struct sched_param param = { .sched_priority = 0 };
1a6e0f06 23825+
e4b2b4a8 23826+ sched_setscheduler(current, SCHED_NORMAL, &param);
1a6e0f06 23827+}
1a6e0f06 23828+
e4b2b4a8 23829+static int ktimer_softirqd_should_run(unsigned int cpu)
1a6e0f06 23830+{
e4b2b4a8 23831+ return current->softirqs_raised;
1a6e0f06 23832+}
1a6e0f06 23833+
e4b2b4a8
JK
23834+#endif /* PREEMPT_RT_FULL */
23835+/*
23836 * Enter an interrupt context.
23837 */
23838 void irq_enter(void)
23839@@ -345,9 +789,9 @@
23840 * Prevent raise_softirq from needlessly waking up ksoftirqd
23841 * here, as softirq will be serviced on return from interrupt.
23842 */
23843- local_bh_disable();
23844+ local_bh_disable_nort();
23845 tick_irq_enter();
23846- _local_bh_enable();
23847+ _local_bh_enable_nort();
23848 }
23849
23850 __irq_enter();
23851@@ -355,6 +799,7 @@
23852
23853 static inline void invoke_softirq(void)
23854 {
23855+#ifndef CONFIG_PREEMPT_RT_FULL
23856 if (ksoftirqd_running(local_softirq_pending()))
23857 return;
23858
23859@@ -377,6 +822,18 @@
23860 } else {
23861 wakeup_softirqd();
23862 }
23863+#else /* PREEMPT_RT_FULL */
23864+ unsigned long flags;
23865+
23866+ local_irq_save(flags);
23867+ if (__this_cpu_read(ksoftirqd) &&
23868+ __this_cpu_read(ksoftirqd)->softirqs_raised)
23869+ wakeup_softirqd();
23870+ if (__this_cpu_read(ktimer_softirqd) &&
23871+ __this_cpu_read(ktimer_softirqd)->softirqs_raised)
23872+ wakeup_timer_softirqd();
23873+ local_irq_restore(flags);
23874+#endif
23875 }
23876
23877 static inline void tick_irq_exit(void)
23878@@ -385,7 +842,13 @@
23879 int cpu = smp_processor_id();
23880
23881 /* Make sure that timer wheel updates are propagated */
23882- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
23883+#ifdef CONFIG_PREEMPT_RT_BASE
23884+ if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
23885+ !need_resched() && !local_softirq_pending())
23886+#else
23887+ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))
23888+#endif
23889+ {
23890 if (!in_irq())
23891 tick_nohz_irq_exit();
23892 }
23893@@ -413,26 +876,6 @@
23894 trace_hardirq_exit(); /* must be last! */
23895 }
23896
23897-/*
23898- * This function must run with irqs disabled!
23899- */
23900-inline void raise_softirq_irqoff(unsigned int nr)
23901-{
23902- __raise_softirq_irqoff(nr);
23903-
23904- /*
23905- * If we're in an interrupt or softirq, we're done
23906- * (this also catches softirq-disabled code). We will
23907- * actually run the softirq once we return from
23908- * the irq or softirq.
23909- *
23910- * Otherwise we wake up ksoftirqd to make sure we
23911- * schedule the softirq soon.
23912- */
23913- if (!in_interrupt())
23914- wakeup_softirqd();
23915-}
23916-
23917 void raise_softirq(unsigned int nr)
23918 {
23919 unsigned long flags;
23920@@ -442,12 +885,6 @@
23921 local_irq_restore(flags);
23922 }
23923
23924-void __raise_softirq_irqoff(unsigned int nr)
23925-{
23926- trace_softirq_raise(nr);
23927- or_softirq_pending(1UL << nr);
23928-}
23929-
23930 void open_softirq(int nr, void (*action)(struct softirq_action *))
23931 {
23932 softirq_vec[nr].action = action;
23933@@ -464,15 +901,45 @@
23934 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
23935 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
23936
23937+static void inline
23938+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
1a6e0f06 23939+{
e4b2b4a8
JK
23940+ if (tasklet_trylock(t)) {
23941+again:
23942+ /* We may have been preempted before tasklet_trylock
23943+ * and __tasklet_action may have already run.
23944+ * So double check the sched bit while the tasklet
23945+ * is locked before adding it to the list.
23946+ */
23947+ if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
23948+ t->next = NULL;
23949+ *head->tail = t;
23950+ head->tail = &(t->next);
23951+ raise_softirq_irqoff(nr);
23952+ tasklet_unlock(t);
23953+ } else {
23954+ /* This is subtle. If we hit the corner case above
23955+ * It is possible that we get preempted right here,
23956+ * and another task has successfully called
23957+ * tasklet_schedule(), then this function, and
23958+ * failed on the trylock. Thus we must be sure
23959+ * before releasing the tasklet lock, that the
23960+ * SCHED_BIT is clear. Otherwise the tasklet
23961+ * may get its SCHED_BIT set, but not added to the
23962+ * list
23963+ */
23964+ if (!tasklet_tryunlock(t))
23965+ goto again;
23966+ }
23967+ }
1a6e0f06 23968+}
1a6e0f06 23969+
e4b2b4a8
JK
23970 void __tasklet_schedule(struct tasklet_struct *t)
23971 {
23972 unsigned long flags;
23973
23974 local_irq_save(flags);
23975- t->next = NULL;
23976- *__this_cpu_read(tasklet_vec.tail) = t;
23977- __this_cpu_write(tasklet_vec.tail, &(t->next));
23978- raise_softirq_irqoff(TASKLET_SOFTIRQ);
23979+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
23980 local_irq_restore(flags);
23981 }
23982 EXPORT_SYMBOL(__tasklet_schedule);
23983@@ -482,50 +949,108 @@
23984 unsigned long flags;
23985
23986 local_irq_save(flags);
23987- t->next = NULL;
23988- *__this_cpu_read(tasklet_hi_vec.tail) = t;
23989- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
23990- raise_softirq_irqoff(HI_SOFTIRQ);
23991+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
23992 local_irq_restore(flags);
23993 }
23994 EXPORT_SYMBOL(__tasklet_hi_schedule);
23995
23996-static __latent_entropy void tasklet_action(struct softirq_action *a)
23997+void tasklet_enable(struct tasklet_struct *t)
23998 {
23999- struct tasklet_struct *list;
24000+ if (!atomic_dec_and_test(&t->count))
24001+ return;
24002+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24003+ tasklet_schedule(t);
1a6e0f06 24004+}
e4b2b4a8
JK
24005+EXPORT_SYMBOL(tasklet_enable);
24006
24007- local_irq_disable();
24008- list = __this_cpu_read(tasklet_vec.head);
24009- __this_cpu_write(tasklet_vec.head, NULL);
24010- __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24011- local_irq_enable();
24012+static void __tasklet_action(struct softirq_action *a,
24013+ struct tasklet_struct *list)
24014+{
24015+ int loops = 1000000;
24016
24017 while (list) {
24018 struct tasklet_struct *t = list;
24019
24020 list = list->next;
24021
24022- if (tasklet_trylock(t)) {
24023- if (!atomic_read(&t->count)) {
24024- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24025- &t->state))
24026- BUG();
24027- t->func(t->data);
24028- tasklet_unlock(t);
24029- continue;
24030- }
24031- tasklet_unlock(t);
24032+ /*
24033+ * Should always succeed - after a tasklet got on the
24034+ * list (after getting the SCHED bit set from 0 to 1),
24035+ * nothing but the tasklet softirq it got queued to can
24036+ * lock it:
24037+ */
24038+ if (!tasklet_trylock(t)) {
24039+ WARN_ON(1);
24040+ continue;
24041 }
24042
24043- local_irq_disable();
24044 t->next = NULL;
24045- *__this_cpu_read(tasklet_vec.tail) = t;
24046- __this_cpu_write(tasklet_vec.tail, &(t->next));
24047- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24048- local_irq_enable();
1a6e0f06 24049+
e4b2b4a8
JK
24050+ /*
24051+ * If we cannot handle the tasklet because it's disabled,
24052+ * mark it as pending. tasklet_enable() will later
24053+ * re-schedule the tasklet.
24054+ */
24055+ if (unlikely(atomic_read(&t->count))) {
24056+out_disabled:
24057+ /* implicit unlock: */
24058+ wmb();
24059+ t->state = TASKLET_STATEF_PENDING;
24060+ continue;
24061+ }
1a6e0f06 24062+
e4b2b4a8
JK
24063+ /*
24064+ * After this point the tasklet might be rescheduled
24065+ * on another CPU, but it can only be added to another
24066+ * CPU's tasklet list if we unlock the tasklet (which we
24067+ * don't do yet).
24068+ */
24069+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24070+ WARN_ON(1);
1a6e0f06 24071+
e4b2b4a8
JK
24072+again:
24073+ t->func(t->data);
1a6e0f06 24074+
e4b2b4a8
JK
24075+ /*
24076+ * Try to unlock the tasklet. We must use cmpxchg, because
24077+ * another CPU might have scheduled or disabled the tasklet.
24078+ * We only allow the STATE_RUN -> 0 transition here.
24079+ */
24080+ while (!tasklet_tryunlock(t)) {
24081+ /*
24082+ * If it got disabled meanwhile, bail out:
24083+ */
24084+ if (atomic_read(&t->count))
24085+ goto out_disabled;
24086+ /*
24087+ * If it got scheduled meanwhile, re-execute
24088+ * the tasklet function:
24089+ */
24090+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24091+ goto again;
24092+ if (!--loops) {
24093+ printk("hm, tasklet state: %08lx\n", t->state);
24094+ WARN_ON(1);
24095+ tasklet_unlock(t);
24096+ break;
24097+ }
24098+ }
24099 }
24100 }
24101
24102+static __latent_entropy void tasklet_action(struct softirq_action *a)
24103+{
24104+ struct tasklet_struct *list;
1a6e0f06 24105+
e4b2b4a8
JK
24106+ local_irq_disable();
24107+ list = __this_cpu_read(tasklet_vec.head);
24108+ __this_cpu_write(tasklet_vec.head, NULL);
24109+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24110+ local_irq_enable();
1a6e0f06 24111+
e4b2b4a8 24112+ __tasklet_action(a, list);
1a6e0f06 24113+}
e4b2b4a8
JK
24114+
24115 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24116 {
24117 struct tasklet_struct *list;
24118@@ -536,30 +1061,7 @@
24119 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24120 local_irq_enable();
24121
24122- while (list) {
24123- struct tasklet_struct *t = list;
24124-
24125- list = list->next;
24126-
24127- if (tasklet_trylock(t)) {
24128- if (!atomic_read(&t->count)) {
24129- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24130- &t->state))
24131- BUG();
24132- t->func(t->data);
24133- tasklet_unlock(t);
24134- continue;
24135- }
24136- tasklet_unlock(t);
24137- }
24138-
24139- local_irq_disable();
24140- t->next = NULL;
24141- *__this_cpu_read(tasklet_hi_vec.tail) = t;
24142- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24143- __raise_softirq_irqoff(HI_SOFTIRQ);
24144- local_irq_enable();
24145- }
24146+ __tasklet_action(a, list);
24147 }
24148
24149 void tasklet_init(struct tasklet_struct *t,
24150@@ -580,7 +1082,7 @@
24151
24152 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24153 do {
24154- yield();
24155+ msleep(1);
24156 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24157 }
24158 tasklet_unlock_wait(t);
24159@@ -588,57 +1090,6 @@
24160 }
24161 EXPORT_SYMBOL(tasklet_kill);
24162
24163-/*
24164- * tasklet_hrtimer
24165- */
24166-
24167-/*
24168- * The trampoline is called when the hrtimer expires. It schedules a tasklet
24169- * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24170- * hrtimer callback, but from softirq context.
24171- */
24172-static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24173-{
24174- struct tasklet_hrtimer *ttimer =
24175- container_of(timer, struct tasklet_hrtimer, timer);
24176-
24177- tasklet_hi_schedule(&ttimer->tasklet);
24178- return HRTIMER_NORESTART;
24179-}
24180-
24181-/*
24182- * Helper function which calls the hrtimer callback from
24183- * tasklet/softirq context
24184- */
24185-static void __tasklet_hrtimer_trampoline(unsigned long data)
24186-{
24187- struct tasklet_hrtimer *ttimer = (void *)data;
24188- enum hrtimer_restart restart;
24189-
24190- restart = ttimer->function(&ttimer->timer);
24191- if (restart != HRTIMER_NORESTART)
24192- hrtimer_restart(&ttimer->timer);
24193-}
24194-
24195-/**
24196- * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24197- * @ttimer: tasklet_hrtimer which is initialized
24198- * @function: hrtimer callback function which gets called from softirq context
24199- * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24200- * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24201- */
24202-void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24203- enum hrtimer_restart (*function)(struct hrtimer *),
24204- clockid_t which_clock, enum hrtimer_mode mode)
24205-{
24206- hrtimer_init(&ttimer->timer, which_clock, mode);
24207- ttimer->timer.function = __hrtimer_tasklet_trampoline;
24208- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24209- (unsigned long)ttimer);
24210- ttimer->function = function;
24211-}
24212-EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24213-
24214 void __init softirq_init(void)
24215 {
24216 int cpu;
24217@@ -654,25 +1105,26 @@
24218 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24219 }
24220
24221-static int ksoftirqd_should_run(unsigned int cpu)
24222-{
24223- return local_softirq_pending();
24224-}
24225-
24226-static void run_ksoftirqd(unsigned int cpu)
24227+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24228+void tasklet_unlock_wait(struct tasklet_struct *t)
24229 {
24230- local_irq_disable();
24231- if (local_softirq_pending()) {
24232+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24233 /*
24234- * We can safely run softirq on inline stack, as we are not deep
24235- * in the task stack here.
24236+ * Hack for now to avoid this busy-loop:
24237 */
24238- __do_softirq();
24239- local_irq_enable();
24240- cond_resched_rcu_qs();
24241- return;
24242+#ifdef CONFIG_PREEMPT_RT_FULL
24243+ msleep(1);
1a6e0f06 24244+#else
e4b2b4a8
JK
24245+ barrier();
24246+#endif
24247 }
24248- local_irq_enable();
1a6e0f06 24249+}
e4b2b4a8 24250+EXPORT_SYMBOL(tasklet_unlock_wait);
1a6e0f06
JK
24251+#endif
24252+
e4b2b4a8 24253+static int ksoftirqd_should_run(unsigned int cpu)
1a6e0f06 24254+{
e4b2b4a8
JK
24255+ return ksoftirqd_softirq_pending();
24256 }
1a6e0f06 24257
e4b2b4a8
JK
24258 #ifdef CONFIG_HOTPLUG_CPU
24259@@ -739,17 +1191,31 @@
24260
24261 static struct smp_hotplug_thread softirq_threads = {
24262 .store = &ksoftirqd,
24263+ .setup = ksoftirqd_set_sched_params,
24264 .thread_should_run = ksoftirqd_should_run,
24265 .thread_fn = run_ksoftirqd,
24266 .thread_comm = "ksoftirqd/%u",
24267 };
24268
24269+#ifdef CONFIG_PREEMPT_RT_FULL
24270+static struct smp_hotplug_thread softirq_timer_threads = {
24271+ .store = &ktimer_softirqd,
24272+ .setup = ktimer_softirqd_set_sched_params,
24273+ .cleanup = ktimer_softirqd_clr_sched_params,
24274+ .thread_should_run = ktimer_softirqd_should_run,
24275+ .thread_fn = run_ksoftirqd,
24276+ .thread_comm = "ktimersoftd/%u",
24277+};
24278+#endif
1a6e0f06 24279+
e4b2b4a8
JK
24280 static __init int spawn_ksoftirqd(void)
24281 {
24282 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24283 takeover_tasklets);
24284 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24285-
24286+#ifdef CONFIG_PREEMPT_RT_FULL
24287+ BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24288+#endif
24289 return 0;
24290 }
24291 early_initcall(spawn_ksoftirqd);
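Taken together, the softirq.c rework above means that on PREEMPT_RT_FULL a softirq raised from task context inside a local_bh_disable() section is only flagged on current->softirqs_raised and then run in place when local_bh_enable() drops the last nesting level; anything raised outside such a section is handed to ksoftirqd/%u or, for the timer softirqs, to ktimersoftd/%u. A hedged driver-side sketch (the tasklet is hypothetical):

static void example_tasklet_fn(unsigned long data)
{
	/* tasklet body; on RT this runs in preemptible task context */
}
static DECLARE_TASKLET(example_tasklet, example_tasklet_fn, 0);

static void example_poll(void)
{
	local_bh_disable();			/* bumps current->softirq_nestcnt on RT */
	tasklet_schedule(&example_tasklet);	/* raises TASKLET_SOFTIRQ on current */
	local_bh_enable();			/* processes it right here on RT */
}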
24292diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/stop_machine.c linux-4.14/kernel/stop_machine.c
24293--- linux-4.14.orig/kernel/stop_machine.c 2018-09-05 11:03:22.000000000 +0200
24294+++ linux-4.14/kernel/stop_machine.c 2018-09-05 11:05:07.000000000 +0200
24295@@ -496,6 +496,8 @@
24296 struct cpu_stop_done *done = work->done;
24297 int ret;
24298
24299+ /* XXX */
1a6e0f06 24300+
e4b2b4a8
JK
24301 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24302 preempt_count_inc();
24303 ret = fn(arg);
24304diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/alarmtimer.c linux-4.14/kernel/time/alarmtimer.c
24305--- linux-4.14.orig/kernel/time/alarmtimer.c 2018-09-05 11:03:22.000000000 +0200
24306+++ linux-4.14/kernel/time/alarmtimer.c 2018-09-05 11:05:07.000000000 +0200
24307@@ -436,7 +436,7 @@
24308 int ret = alarm_try_to_cancel(alarm);
24309 if (ret >= 0)
24310 return ret;
24311- cpu_relax();
24312+ hrtimer_wait_for_timer(&alarm->timer);
24313 }
24314 }
24315 EXPORT_SYMBOL_GPL(alarm_cancel);
24316diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/hrtimer.c linux-4.14/kernel/time/hrtimer.c
24317--- linux-4.14.orig/kernel/time/hrtimer.c 2018-09-05 11:03:22.000000000 +0200
24318+++ linux-4.14/kernel/time/hrtimer.c 2018-09-05 11:05:07.000000000 +0200
24319@@ -60,6 +60,15 @@
24320 #include "tick-internal.h"
1a6e0f06 24321
e4b2b4a8
JK
24322 /*
24323+ * Masks for selecting the soft and hard context timers from
24324+ * cpu_base->active
24325+ */
24326+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
24327+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
24328+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
24329+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
24330+
24331+/*
24332 * The timer bases:
24333 *
24334 * There are more clockids than hrtimer bases. Thus, we index
24335@@ -70,7 +79,6 @@
24336 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
1a6e0f06 24337 {
e4b2b4a8
JK
24338 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
24339- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
24340 .clock_base =
24341 {
24342 {
24343@@ -93,6 +101,26 @@
24344 .clockid = CLOCK_TAI,
24345 .get_time = &ktime_get_clocktai,
24346 },
24347+ {
24348+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
24349+ .clockid = CLOCK_MONOTONIC,
24350+ .get_time = &ktime_get,
24351+ },
24352+ {
24353+ .index = HRTIMER_BASE_REALTIME_SOFT,
24354+ .clockid = CLOCK_REALTIME,
24355+ .get_time = &ktime_get_real,
24356+ },
24357+ {
24358+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
24359+ .clockid = CLOCK_BOOTTIME,
24360+ .get_time = &ktime_get_boottime,
24361+ },
24362+ {
24363+ .index = HRTIMER_BASE_TAI_SOFT,
24364+ .clockid = CLOCK_TAI,
24365+ .get_time = &ktime_get_clocktai,
24366+ },
24367 }
24368 };
1a6e0f06 24369
e4b2b4a8
JK
24370@@ -118,7 +146,6 @@
24371 * timer->base->cpu_base
24372 */
24373 static struct hrtimer_cpu_base migration_cpu_base = {
24374- .seq = SEQCNT_ZERO(migration_cpu_base),
24375 .clock_base = { { .cpu_base = &migration_cpu_base, }, },
24376 };
1a6e0f06 24377
e4b2b4a8 24378@@ -156,45 +183,33 @@
1a6e0f06
JK
24379 }
24380
24381 /*
e4b2b4a8
JK
24382- * With HIGHRES=y we do not migrate the timer when it is expiring
24383- * before the next event on the target cpu because we cannot reprogram
24384- * the target cpu hardware and we would cause it to fire late.
24385+ * We do not migrate the timer when it is expiring before the next
24386+ * event on the target cpu. When high resolution is enabled, we cannot
24387+ * reprogram the target cpu hardware and we would cause it to fire
24388+ * late. To keep it simple, we handle the high resolution enabled and
24389+ * disabled case similar.
24390 *
24391 * Called with cpu_base->lock of target cpu held.
24392 */
24393 static int
24394 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
1a6e0f06 24395 {
e4b2b4a8
JK
24396-#ifdef CONFIG_HIGH_RES_TIMERS
24397 ktime_t expires;
1a6e0f06 24398
e4b2b4a8
JK
24399- if (!new_base->cpu_base->hres_active)
24400- return 0;
24401-
24402 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
24403- return expires <= new_base->cpu_base->expires_next;
24404-#else
24405- return 0;
24406-#endif
24407+ return expires < new_base->cpu_base->expires_next;
24408 }
1a6e0f06 24409
e4b2b4a8
JK
24410-#ifdef CONFIG_NO_HZ_COMMON
24411-static inline
24412-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24413- int pinned)
24414-{
24415- if (pinned || !base->migration_enabled)
24416- return base;
24417- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24418-}
24419-#else
24420 static inline
24421 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24422 int pinned)
24423 {
24424+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
24425+ if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
24426+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24427+#endif
24428 return base;
24429 }
24430-#endif
1a6e0f06 24431
e4b2b4a8
JK
24432 /*
24433 * We switch the timer base to a power-optimized selected CPU target,
24434@@ -396,7 +411,8 @@
24435 debug_object_init(timer, &hrtimer_debug_descr);
24436 }
1a6e0f06 24437
e4b2b4a8
JK
24438-static inline void debug_hrtimer_activate(struct hrtimer *timer)
24439+static inline void debug_hrtimer_activate(struct hrtimer *timer,
24440+ enum hrtimer_mode mode)
24441 {
24442 debug_object_activate(timer, &hrtimer_debug_descr);
24443 }
24444@@ -429,8 +445,10 @@
24445 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
1a6e0f06 24446
e4b2b4a8
JK
24447 #else
24448+
24449 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
24450-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
24451+static inline void debug_hrtimer_activate(struct hrtimer *timer,
24452+ enum hrtimer_mode mode) { }
24453 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
24454 #endif
1a6e0f06 24455
e4b2b4a8
JK
24456@@ -442,10 +460,11 @@
24457 trace_hrtimer_init(timer, clockid, mode);
1a6e0f06 24458 }
1a6e0f06 24459
e4b2b4a8
JK
24460-static inline void debug_activate(struct hrtimer *timer)
24461+static inline void debug_activate(struct hrtimer *timer,
24462+ enum hrtimer_mode mode)
24463 {
24464- debug_hrtimer_activate(timer);
24465- trace_hrtimer_start(timer);
24466+ debug_hrtimer_activate(timer, mode);
24467+ trace_hrtimer_start(timer, mode);
24468 }
1a6e0f06 24469
e4b2b4a8
JK
24470 static inline void debug_deactivate(struct hrtimer *timer)
24471@@ -454,35 +473,43 @@
24472 trace_hrtimer_cancel(timer);
1a6e0f06
JK
24473 }
24474
e4b2b4a8
JK
24475-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
24476-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
24477- struct hrtimer *timer)
24478+static struct hrtimer_clock_base *
24479+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
24480 {
24481-#ifdef CONFIG_HIGH_RES_TIMERS
24482- cpu_base->next_timer = timer;
24483-#endif
24484+ unsigned int idx;
1a6e0f06 24485+
e4b2b4a8
JK
24486+ if (!*active)
24487+ return NULL;
1a6e0f06 24488+
e4b2b4a8
JK
24489+ idx = __ffs(*active);
24490+ *active &= ~(1U << idx);
1a6e0f06 24491+
e4b2b4a8
JK
24492+ return &cpu_base->clock_base[idx];
24493 }
24494
24495-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
24496+#define for_each_active_base(base, cpu_base, active) \
24497+ while ((base = __next_base((cpu_base), &(active))))
1a6e0f06 24498+
e4b2b4a8
JK
24499+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
24500+ unsigned int active,
24501+ ktime_t expires_next)
24502 {
24503- struct hrtimer_clock_base *base = cpu_base->clock_base;
24504- unsigned int active = cpu_base->active_bases;
24505- ktime_t expires, expires_next = KTIME_MAX;
24506+ struct hrtimer_clock_base *base;
24507+ ktime_t expires;
24508
24509- hrtimer_update_next_timer(cpu_base, NULL);
24510- for (; active; base++, active >>= 1) {
24511+ for_each_active_base(base, cpu_base, active) {
24512 struct timerqueue_node *next;
24513 struct hrtimer *timer;
24514
24515- if (!(active & 0x01))
24516- continue;
24517-
24518 next = timerqueue_getnext(&base->active);
24519 timer = container_of(next, struct hrtimer, node);
24520 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24521 if (expires < expires_next) {
24522 expires_next = expires;
24523- hrtimer_update_next_timer(cpu_base, timer);
24524+ if (timer->is_soft)
24525+ cpu_base->softirq_next_timer = timer;
24526+ else
24527+ cpu_base->next_timer = timer;
24528 }
24529 }
24530 /*
24531@@ -494,7 +521,47 @@
24532 expires_next = 0;
24533 return expires_next;
24534 }
24535-#endif
1a6e0f06 24536+
e4b2b4a8
JK
24537+/*
24538+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
24539+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
24540+ *
24541+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases;
24542+ * those timers will get run whenever the softirq gets handled. At the end of
24543+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
24544+ *
24545+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
24546+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
24547+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
24548+ *
24549+ * @active_mask must be one of:
24550+ * - HRTIMER_ACTIVE_ALL,
24551+ * - HRTIMER_ACTIVE_SOFT, or
24552+ * - HRTIMER_ACTIVE_HARD.
24553+ */
24554+static ktime_t
24555+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
1a6e0f06 24556+{
e4b2b4a8
JK
24557+ unsigned int active;
24558+ struct hrtimer *next_timer = NULL;
24559+ ktime_t expires_next = KTIME_MAX;
1a6e0f06 24560+
e4b2b4a8
JK
24561+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
24562+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
24563+ cpu_base->softirq_next_timer = NULL;
24564+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
24565+
24566+ next_timer = cpu_base->softirq_next_timer;
1a6e0f06 24567+ }
1a6e0f06 24568+
e4b2b4a8
JK
24569+ if (active_mask & HRTIMER_ACTIVE_HARD) {
24570+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
24571+ cpu_base->next_timer = next_timer;
24572+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
24573+ }
1a6e0f06 24574+
e4b2b4a8 24575+ return expires_next;
1a6e0f06 24576+}
e4b2b4a8
JK
24577
24578 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
24579 {
24580@@ -502,36 +569,14 @@
24581 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
24582 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
24583
24584- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
24585+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
24586 offs_real, offs_boot, offs_tai);
24587-}
24588-
24589-/* High resolution timer related functions */
24590-#ifdef CONFIG_HIGH_RES_TIMERS
24591-
24592-/*
24593- * High resolution timer enabled ?
24594- */
24595-static bool hrtimer_hres_enabled __read_mostly = true;
24596-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24597-EXPORT_SYMBOL_GPL(hrtimer_resolution);
24598-
24599-/*
24600- * Enable / Disable high resolution mode
24601- */
24602-static int __init setup_hrtimer_hres(char *str)
24603-{
24604- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
24605-}
24606
24607-__setup("highres=", setup_hrtimer_hres);
24608+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
24609+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
24610+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
24611
24612-/*
24613- * hrtimer_high_res_enabled - query, if the highres mode is enabled
24614- */
24615-static inline int hrtimer_is_hres_enabled(void)
24616-{
24617- return hrtimer_hres_enabled;
24618+ return now;
24619 }
24620
1a6e0f06 24621 /*
e4b2b4a8 24622@@ -539,7 +584,8 @@
1a6e0f06 24623 */
e4b2b4a8 24624 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
1a6e0f06 24625 {
e4b2b4a8
JK
24626- return cpu_base->hres_active;
24627+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
24628+ cpu_base->hres_active : 0;
24629 }
1a6e0f06 24630
e4b2b4a8
JK
24631 static inline int hrtimer_hres_active(void)
24632@@ -557,10 +603,23 @@
24633 {
24634 ktime_t expires_next;
1a6e0f06 24635
e4b2b4a8
JK
24636- if (!cpu_base->hres_active)
24637- return;
24638+ /*
24639+ * Find the current next expiration time.
24640+ */
24641+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
1a6e0f06 24642
e4b2b4a8
JK
24643- expires_next = __hrtimer_get_next_event(cpu_base);
24644+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
24645+ /*
24646+ * When the softirq is activated, hrtimer has to be
24647+ * programmed with the first hard hrtimer because the soft
24648+ * timer interrupt could occur too late.
24649+ */
24650+ if (cpu_base->softirq_activated)
24651+ expires_next = __hrtimer_get_next_event(cpu_base,
24652+ HRTIMER_ACTIVE_HARD);
24653+ else
24654+ cpu_base->softirq_expires_next = expires_next;
1a6e0f06
JK
24655+ }
24656
e4b2b4a8
JK
24657 if (skip_equal && expires_next == cpu_base->expires_next)
24658 return;
24659@@ -568,6 +627,9 @@
24660 cpu_base->expires_next = expires_next;
1a6e0f06
JK
24661
24662 /*
e4b2b4a8
JK
24663+ * If hres is not active, hardware does not have to be
24664+ * reprogrammed yet.
24665+ *
24666 * If a hang was detected in the last timer interrupt then we
24667 * leave the hang delay active in the hardware. We want the
24668 * system to make progress. That also prevents the following
24669@@ -581,83 +643,38 @@
24670 * set. So we'd effectivly block all timers until the T2 event
24671 * fires.
1a6e0f06 24672 */
e4b2b4a8
JK
24673- if (cpu_base->hang_detected)
24674+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24675 return;
1a6e0f06 24676
e4b2b4a8
JK
24677 tick_program_event(cpu_base->expires_next, 1);
24678 }
1a6e0f06 24679
e4b2b4a8
JK
24680+/* High resolution timer related functions */
24681+#ifdef CONFIG_HIGH_RES_TIMERS
24682+
24683 /*
24684- * When a timer is enqueued and expires earlier than the already enqueued
24685- * timers, we have to check, whether it expires earlier than the timer for
24686- * which the clock event device was armed.
24687- *
24688- * Called with interrupts disabled and base->cpu_base.lock held
24689+ * High resolution timer enabled ?
1a6e0f06 24690 */
e4b2b4a8
JK
24691-static void hrtimer_reprogram(struct hrtimer *timer,
24692- struct hrtimer_clock_base *base)
24693-{
24694- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24695- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24696-
24697- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24698-
24699- /*
24700- * If the timer is not on the current cpu, we cannot reprogram
24701- * the other cpus clock event device.
24702- */
24703- if (base->cpu_base != cpu_base)
24704- return;
24705-
24706- /*
24707- * If the hrtimer interrupt is running, then it will
24708- * reevaluate the clock bases and reprogram the clock event
24709- * device. The callbacks are always executed in hard interrupt
24710- * context so we don't need an extra check for a running
24711- * callback.
24712- */
24713- if (cpu_base->in_hrtirq)
24714- return;
24715-
24716- /*
24717- * CLOCK_REALTIME timer might be requested with an absolute
24718- * expiry time which is less than base->offset. Set it to 0.
24719- */
24720- if (expires < 0)
24721- expires = 0;
24722-
24723- if (expires >= cpu_base->expires_next)
24724- return;
24725-
24726- /* Update the pointer to the next expiring timer */
24727- cpu_base->next_timer = timer;
24728-
24729- /*
24730- * If a hang was detected in the last timer interrupt then we
24731- * do not schedule a timer which is earlier than the expiry
24732- * which we enforced in the hang detection. We want the system
24733- * to make progress.
24734- */
24735- if (cpu_base->hang_detected)
24736- return;
24737+static bool hrtimer_hres_enabled __read_mostly = true;
24738+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24739+EXPORT_SYMBOL_GPL(hrtimer_resolution);
24740
24741- /*
24742- * Program the timer hardware. We enforce the expiry for
24743- * events which are already in the past.
24744- */
24745- cpu_base->expires_next = expires;
24746- tick_program_event(expires, 1);
24747+/*
24748+ * Enable / Disable high resolution mode
24749+ */
24750+static int __init setup_hrtimer_hres(char *str)
24751+{
24752+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
1a6e0f06
JK
24753 }
24754
e4b2b4a8
JK
24755+__setup("highres=", setup_hrtimer_hres);
24756+
24757 /*
24758- * Initialize the high resolution related parts of cpu_base
24759+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
24760 */
24761-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
24762+static inline int hrtimer_is_hres_enabled(void)
1a6e0f06 24763 {
e4b2b4a8
JK
24764- base->expires_next = KTIME_MAX;
24765- base->hang_detected = 0;
24766- base->hres_active = 0;
24767- base->next_timer = NULL;
24768+ return hrtimer_hres_enabled;
1a6e0f06
JK
24769 }
24770
e4b2b4a8
JK
24771 /*
24772@@ -669,7 +686,7 @@
1a6e0f06 24773 {
e4b2b4a8 24774 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
1a6e0f06 24775
e4b2b4a8
JK
24776- if (!base->hres_active)
24777+ if (!__hrtimer_hres_active(base))
24778 return;
1a6e0f06 24779
e4b2b4a8
JK
24780 raw_spin_lock(&base->lock);
24781@@ -698,6 +715,29 @@
24782 retrigger_next_event(NULL);
24783 }
1a6e0f06 24784
e4b2b4a8
JK
24785+#ifdef CONFIG_PREEMPT_RT_FULL
24786+
24787+static struct swork_event clock_set_delay_work;
24788+
24789+static void run_clock_set_delay(struct swork_event *event)
24790+{
24791+ clock_was_set();
24792+}
24793+
24794+void clock_was_set_delayed(void)
24795+{
24796+ swork_queue(&clock_set_delay_work);
24797+}
24798+
24799+static __init int create_clock_set_delay_thread(void)
24800+{
24801+ WARN_ON(swork_get());
24802+ INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
24803+ return 0;
24804+}
24805+early_initcall(create_clock_set_delay_thread);
24806+#else /* PREEMPT_RT_FULL */
24807+
24808 static void clock_was_set_work(struct work_struct *work)
1a6e0f06 24809 {
e4b2b4a8
JK
24810 clock_was_set();
24811@@ -713,26 +753,106 @@
24812 {
24813 schedule_work(&hrtimer_work);
1a6e0f06 24814 }
e4b2b4a8 24815+#endif
1a6e0f06 24816
e4b2b4a8 24817 #else
1a6e0f06 24818
e4b2b4a8
JK
24819-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
24820-static inline int hrtimer_hres_active(void) { return 0; }
24821 static inline int hrtimer_is_hres_enabled(void) { return 0; }
24822 static inline void hrtimer_switch_to_hres(void) { }
24823-static inline void
24824-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
24825-static inline int hrtimer_reprogram(struct hrtimer *timer,
24826- struct hrtimer_clock_base *base)
24827-{
24828- return 0;
24829-}
24830-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
24831 static inline void retrigger_next_event(void *arg) { }
1a6e0f06 24832
e4b2b4a8 24833 #endif /* CONFIG_HIGH_RES_TIMERS */
1a6e0f06 24834
e4b2b4a8
JK
24835 /*
24836+ * When a timer is enqueued and expires earlier than the already enqueued
24837+ * timers, we have to check, whether it expires earlier than the timer for
24838+ * which the clock event device was armed.
24839+ *
24840+ * Called with interrupts disabled and base->cpu_base.lock held
24841+ */
24842+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
24843+{
24844+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24845+ struct hrtimer_clock_base *base = timer->base;
24846+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24847+
24848+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24849+
24850+ /*
24851+ * CLOCK_REALTIME timer might be requested with an absolute
24852+ * expiry time which is less than base->offset. Set it to 0.
24853+ */
24854+ if (expires < 0)
24855+ expires = 0;
24856+
24857+ if (timer->is_soft) {
24858+ /*
24859+ * soft hrtimer could be started on a remote CPU. In this
24860+ * case softirq_expires_next needs to be updated on the
24861+ * remote CPU. The soft hrtimer will not expire before the
24862+ * first hard hrtimer on the remote CPU -
24863+ * hrtimer_check_target() prevents this case.
24864+ */
24865+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
24866+
24867+ if (timer_cpu_base->softirq_activated)
24868+ return;
24869+
24870+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
24871+ return;
24872+
24873+ timer_cpu_base->softirq_next_timer = timer;
24874+ timer_cpu_base->softirq_expires_next = expires;
24875+
24876+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
24877+ !reprogram)
24878+ return;
24879+ }
24880+
24881+ /*
24882+ * If the timer is not on the current cpu, we cannot reprogram
24883+ * the other cpus clock event device.
24884+ */
24885+ if (base->cpu_base != cpu_base)
24886+ return;
24887+
24888+ /*
24889+ * If the hrtimer interrupt is running, then it will
24890+ * reevaluate the clock bases and reprogram the clock event
24891+ * device. The callbacks are always executed in hard interrupt
24892+ * context so we don't need an extra check for a running
24893+ * callback.
24894+ */
24895+ if (cpu_base->in_hrtirq)
24896+ return;
24897+
24898+ if (expires >= cpu_base->expires_next)
24899+ return;
24900+
24901+ /* Update the pointer to the next expiring timer */
24902+ cpu_base->next_timer = timer;
24903+ cpu_base->expires_next = expires;
24904+
24905+ /*
24906+ * If hres is not active, hardware does not have to be
24907+ * programmed yet.
24908+ *
24909+ * If a hang was detected in the last timer interrupt then we
24910+ * do not schedule a timer which is earlier than the expiry
24911+ * which we enforced in the hang detection. We want the system
24912+ * to make progress.
24913+ */
24914+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24915+ return;
24916+
24917+ /*
24918+ * Program the timer hardware. We enforce the expiry for
24919+ * events which are already in the past.
24920+ */
24921+ tick_program_event(expires, 1);
24922+}
24923+
24924+/*
24925 * Clock realtime was set
24926 *
24927 * Change the offset of the realtime clock vs. the monotonic
24928@@ -830,6 +950,33 @@
1a6e0f06 24929 }
e4b2b4a8 24930 EXPORT_SYMBOL_GPL(hrtimer_forward);
1a6e0f06 24931
e4b2b4a8
JK
24932+#ifdef CONFIG_PREEMPT_RT_BASE
24933+# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
24934+
24935+/**
24936+ * hrtimer_wait_for_timer - Wait for a running timer
1a6e0f06 24937+ *
e4b2b4a8 24938+ * @timer: timer to wait for
1a6e0f06 24939+ *
e4b2b4a8
JK
24940+ * The function waits in case the timers callback function is
24941+ * currently executed on the waitqueue of the timer base. The
24942+ * waitqueue is woken up after the timer callback function has
24943+ * finished execution.
1a6e0f06 24944+ */
e4b2b4a8 24945+void hrtimer_wait_for_timer(const struct hrtimer *timer)
1a6e0f06 24946+{
e4b2b4a8 24947+ struct hrtimer_clock_base *base = timer->base;
1a6e0f06 24948+
e4b2b4a8
JK
24949+ if (base && base->cpu_base &&
24950+ base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
24951+ wait_event(base->cpu_base->wait,
24952+ !(hrtimer_callback_running(timer)));
1a6e0f06 24953+}
1a6e0f06 24954+
1a6e0f06 24955+#else
e4b2b4a8 24956+# define wake_up_timer_waiters(b) do { } while (0)
1a6e0f06 24957+#endif
e4b2b4a8
JK
24958+
24959 /*
24960 * enqueue_hrtimer - internal function to (re)start a timer
24961 *
24962@@ -839,9 +986,10 @@
24963 * Returns 1 when the new timer is the leftmost timer in the tree.
1a6e0f06 24964 */
e4b2b4a8
JK
24965 static int enqueue_hrtimer(struct hrtimer *timer,
24966- struct hrtimer_clock_base *base)
24967+ struct hrtimer_clock_base *base,
24968+ enum hrtimer_mode mode)
1a6e0f06 24969 {
e4b2b4a8
JK
24970- debug_activate(timer);
24971+ debug_activate(timer, mode);
1a6e0f06 24972
e4b2b4a8
JK
24973 base->cpu_base->active_bases |= 1 << base->index;
24974
24975@@ -874,7 +1022,6 @@
24976 if (!timerqueue_del(&base->active, &timer->node))
24977 cpu_base->active_bases &= ~(1 << base->index);
24978
24979-#ifdef CONFIG_HIGH_RES_TIMERS
24980 /*
24981 * Note: If reprogram is false we do not update
24982 * cpu_base->next_timer. This happens when we remove the first
24983@@ -885,7 +1032,6 @@
24984 */
24985 if (reprogram && timer == cpu_base->next_timer)
24986 hrtimer_force_reprogram(cpu_base, 1);
24987-#endif
24988 }
1a6e0f06 24989
e4b2b4a8
JK
24990 /*
24991@@ -934,22 +1080,36 @@
24992 return tim;
1a6e0f06 24993 }
1a6e0f06 24994
e4b2b4a8
JK
24995-/**
24996- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
24997- * @timer: the timer to be added
24998- * @tim: expiry time
24999- * @delta_ns: "slack" range for the timer
25000- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
25001- * relative (HRTIMER_MODE_REL)
25002- */
25003-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25004- u64 delta_ns, const enum hrtimer_mode mode)
25005+static void
25006+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
1a6e0f06 25007 {
e4b2b4a8
JK
25008- struct hrtimer_clock_base *base, *new_base;
25009- unsigned long flags;
25010- int leftmost;
25011+ ktime_t expires;
1a6e0f06 25012
e4b2b4a8 25013- base = lock_hrtimer_base(timer, &flags);
1a6e0f06 25014+ /*
e4b2b4a8 25015+ * Find the next SOFT expiration.
1a6e0f06 25016+ */
e4b2b4a8 25017+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
1a6e0f06 25018+
e4b2b4a8
JK
25019+ /*
25020+ * reprogramming needs to be triggered, even if the next soft
25021+ * hrtimer expires at the same time as the next hard
25022+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25023+ */
25024+ if (expires == KTIME_MAX)
25025+ return;
1a6e0f06 25026+
e4b2b4a8
JK
25027+ /*
25028+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25029+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
25030+ */
25031+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
1a6e0f06 25032+}
1a6e0f06 25033+
e4b2b4a8
JK
25034+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25035+ u64 delta_ns, const enum hrtimer_mode mode,
25036+ struct hrtimer_clock_base *base)
1a6e0f06 25037+{
e4b2b4a8
JK
25038+ struct hrtimer_clock_base *new_base;
25039
25040 /* Remove an active timer from the queue: */
25041 remove_hrtimer(timer, base, true);
25042@@ -964,21 +1124,37 @@
25043 /* Switch the timer base, if necessary: */
25044 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25045
25046- leftmost = enqueue_hrtimer(timer, new_base);
25047- if (!leftmost)
25048- goto unlock;
25049+ return enqueue_hrtimer(timer, new_base, mode);
1a6e0f06 25050+}
1a6e0f06 25051+
e4b2b4a8
JK
25052+/**
25053+ * hrtimer_start_range_ns - (re)start an hrtimer
25054+ * @timer: the timer to be added
25055+ * @tim: expiry time
25056+ * @delta_ns: "slack" range for the timer
25057+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
25058+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25059+ * softirq based mode is considered for debug purpose only!
25060+ */
25061+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25062+ u64 delta_ns, const enum hrtimer_mode mode)
1a6e0f06 25063+{
e4b2b4a8
JK
25064+ struct hrtimer_clock_base *base;
25065+ unsigned long flags;
1a6e0f06
JK
25066+
25067+ /*
e4b2b4a8
JK
25068+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25069+ * match.
1a6e0f06 25070+ */
e4b2b4a8
JK
25071+#ifndef CONFIG_PREEMPT_RT_BASE
25072+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
1a6e0f06 25073+#endif
1a6e0f06 25074+
e4b2b4a8
JK
25075+ base = lock_hrtimer_base(timer, &flags);
25076+
25077+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25078+ hrtimer_reprogram(timer, true);
25079
25080- if (!hrtimer_is_hres_active(timer)) {
25081- /*
25082- * Kick to reschedule the next tick to handle the new timer
25083- * on dynticks target.
25084- */
25085- if (new_base->cpu_base->nohz_active)
25086- wake_up_nohz_cpu(new_base->cpu_base->cpu);
25087- } else {
25088- hrtimer_reprogram(timer, new_base);
25089- }
25090-unlock:
25091 unlock_hrtimer_base(timer, &flags);
25092 }
25093 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
25094@@ -1035,7 +1211,7 @@
25095
25096 if (ret >= 0)
25097 return ret;
25098- cpu_relax();
25099+ hrtimer_wait_for_timer(timer);
25100 }
25101 }
25102 EXPORT_SYMBOL_GPL(hrtimer_cancel);
25103@@ -1076,7 +1252,7 @@
25104 raw_spin_lock_irqsave(&cpu_base->lock, flags);
25105
25106 if (!__hrtimer_hres_active(cpu_base))
25107- expires = __hrtimer_get_next_event(cpu_base);
25108+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25109
25110 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25111
25112@@ -1099,8 +1275,16 @@
25113 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25114 enum hrtimer_mode mode)
25115 {
25116- struct hrtimer_cpu_base *cpu_base;
25117+ bool softtimer;
25118 int base;
25119+ struct hrtimer_cpu_base *cpu_base;
25120+
25121+ softtimer = !!(mode & HRTIMER_MODE_SOFT);
25122+#ifdef CONFIG_PREEMPT_RT_FULL
25123+ if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25124+ softtimer = true;
1a6e0f06 25125+#endif
e4b2b4a8
JK
25126+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25127
25128 memset(timer, 0, sizeof(struct hrtimer));
25129
25130@@ -1114,7 +1298,8 @@
25131 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25132 clock_id = CLOCK_MONOTONIC;
25133
25134- base = hrtimer_clockid_to_base(clock_id);
25135+ base += hrtimer_clockid_to_base(clock_id);
25136+ timer->is_soft = softtimer;
25137 timer->base = &cpu_base->clock_base[base];
25138 timerqueue_init(&timer->node);
25139 }
25140@@ -1123,7 +1308,13 @@
25141 * hrtimer_init - initialize a timer to the given clock
25142 * @timer: the timer to be initialized
25143 * @clock_id: the clock to be used
25144- * @mode: timer mode abs/rel
25145+ * @mode: The modes which are relevant for initialization:
25146+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25147+ * HRTIMER_MODE_REL_SOFT
25148+ *
25149+ * The PINNED variants of the above can be handed in,
25150+ * but the PINNED bit is ignored as pinning happens
25151+ * when the hrtimer is started
1a6e0f06 25152 */
e4b2b4a8
JK
25153 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25154 enum hrtimer_mode mode)
25155@@ -1142,19 +1333,19 @@
25156 */
25157 bool hrtimer_active(const struct hrtimer *timer)
25158 {
25159- struct hrtimer_cpu_base *cpu_base;
25160+ struct hrtimer_clock_base *base;
25161 unsigned int seq;
25162
25163 do {
25164- cpu_base = READ_ONCE(timer->base->cpu_base);
25165- seq = raw_read_seqcount_begin(&cpu_base->seq);
25166+ base = READ_ONCE(timer->base);
25167+ seq = raw_read_seqcount_begin(&base->seq);
25168
25169 if (timer->state != HRTIMER_STATE_INACTIVE ||
25170- cpu_base->running == timer)
25171+ base->running == timer)
25172 return true;
25173
25174- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25175- cpu_base != READ_ONCE(timer->base->cpu_base));
25176+ } while (read_seqcount_retry(&base->seq, seq) ||
25177+ base != READ_ONCE(timer->base));
25178
25179 return false;
25180 }
25181@@ -1180,7 +1371,8 @@
25182
25183 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25184 struct hrtimer_clock_base *base,
25185- struct hrtimer *timer, ktime_t *now)
25186+ struct hrtimer *timer, ktime_t *now,
25187+ unsigned long flags)
25188 {
25189 enum hrtimer_restart (*fn)(struct hrtimer *);
25190 int restart;
25191@@ -1188,16 +1380,16 @@
25192 lockdep_assert_held(&cpu_base->lock);
25193
25194 debug_deactivate(timer);
25195- cpu_base->running = timer;
25196+ base->running = timer;
25197
25198 /*
25199 * Separate the ->running assignment from the ->state assignment.
25200 *
25201 * As with a regular write barrier, this ensures the read side in
25202- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25203+ * hrtimer_active() cannot observe base->running == NULL &&
25204 * timer->state == INACTIVE.
25205 */
25206- raw_write_seqcount_barrier(&cpu_base->seq);
25207+ raw_write_seqcount_barrier(&base->seq);
25208
25209 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25210 fn = timer->function;
25211@@ -1211,15 +1403,15 @@
25212 timer->is_rel = false;
25213
25214 /*
25215- * Because we run timers from hardirq context, there is no chance
25216- * they get migrated to another cpu, therefore its safe to unlock
25217- * the timer base.
25218+ * The timer is marked as running in the cpu base, so it is
25219+ * protected against migration to a different CPU even if the lock
25220+ * is dropped.
25221 */
25222- raw_spin_unlock(&cpu_base->lock);
25223+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25224 trace_hrtimer_expire_entry(timer, now);
25225 restart = fn(timer);
25226 trace_hrtimer_expire_exit(timer);
25227- raw_spin_lock(&cpu_base->lock);
25228+ raw_spin_lock_irq(&cpu_base->lock);
25229
25230 /*
25231 * Note: We clear the running state after enqueue_hrtimer and
25232@@ -1232,33 +1424,31 @@
25233 */
25234 if (restart != HRTIMER_NORESTART &&
25235 !(timer->state & HRTIMER_STATE_ENQUEUED))
25236- enqueue_hrtimer(timer, base);
25237+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25238
25239 /*
25240 * Separate the ->running assignment from the ->state assignment.
25241 *
25242 * As with a regular write barrier, this ensures the read side in
25243- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25244+ * hrtimer_active() cannot observe base->running.timer == NULL &&
25245 * timer->state == INACTIVE.
25246 */
25247- raw_write_seqcount_barrier(&cpu_base->seq);
25248+ raw_write_seqcount_barrier(&base->seq);
1a6e0f06 25249
e4b2b4a8
JK
25250- WARN_ON_ONCE(cpu_base->running != timer);
25251- cpu_base->running = NULL;
25252+ WARN_ON_ONCE(base->running != timer);
25253+ base->running = NULL;
25254 }
1a6e0f06 25255
e4b2b4a8
JK
25256-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25257+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25258+ unsigned long flags, unsigned int active_mask)
25259 {
25260- struct hrtimer_clock_base *base = cpu_base->clock_base;
25261- unsigned int active = cpu_base->active_bases;
25262+ struct hrtimer_clock_base *base;
25263+ unsigned int active = cpu_base->active_bases & active_mask;
25264
25265- for (; active; base++, active >>= 1) {
25266+ for_each_active_base(base, cpu_base, active) {
25267 struct timerqueue_node *node;
25268 ktime_t basenow;
25269
25270- if (!(active & 0x01))
25271- continue;
25272-
25273 basenow = ktime_add(now, base->offset);
25274
25275 while ((node = timerqueue_getnext(&base->active))) {
25276@@ -1281,11 +1471,29 @@
25277 if (basenow < hrtimer_get_softexpires_tv64(timer))
25278 break;
25279
25280- __run_hrtimer(cpu_base, base, timer, &basenow);
25281+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25282 }
25283 }
25284 }
25285
25286+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
1a6e0f06 25287+{
e4b2b4a8
JK
25288+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25289+ unsigned long flags;
25290+ ktime_t now;
1a6e0f06 25291+
e4b2b4a8 25292+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
1a6e0f06 25293+
e4b2b4a8
JK
25294+ now = hrtimer_update_base(cpu_base);
25295+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25296+
25297+ cpu_base->softirq_activated = 0;
25298+ hrtimer_update_softirq_timer(cpu_base, true);
25299+
25300+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25301+ wake_up_timer_waiters(cpu_base);
25302+}
25303+
25304 #ifdef CONFIG_HIGH_RES_TIMERS
1a6e0f06 25305
e4b2b4a8
JK
25306 /*
25307@@ -1296,13 +1504,14 @@
25308 {
25309 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25310 ktime_t expires_next, now, entry_time, delta;
25311+ unsigned long flags;
25312 int retries = 0;
1a6e0f06 25313
e4b2b4a8
JK
25314 BUG_ON(!cpu_base->hres_active);
25315 cpu_base->nr_events++;
25316 dev->next_event = KTIME_MAX;
1a6e0f06 25317
e4b2b4a8
JK
25318- raw_spin_lock(&cpu_base->lock);
25319+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
25320 entry_time = now = hrtimer_update_base(cpu_base);
25321 retry:
25322 cpu_base->in_hrtirq = 1;
25323@@ -1315,17 +1524,23 @@
25324 */
25325 cpu_base->expires_next = KTIME_MAX;
1a6e0f06 25326
e4b2b4a8
JK
25327- __hrtimer_run_queues(cpu_base, now);
25328+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25329+ cpu_base->softirq_expires_next = KTIME_MAX;
25330+ cpu_base->softirq_activated = 1;
25331+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25332+ }
1a6e0f06 25333+
e4b2b4a8 25334+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
1a6e0f06 25335
e4b2b4a8
JK
25336 /* Reevaluate the clock bases for the next expiry */
25337- expires_next = __hrtimer_get_next_event(cpu_base);
25338+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25339 /*
25340 * Store the new expiry value so the migration code can verify
25341 * against it.
25342 */
25343 cpu_base->expires_next = expires_next;
25344 cpu_base->in_hrtirq = 0;
25345- raw_spin_unlock(&cpu_base->lock);
25346+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25347
25348 /* Reprogramming necessary ? */
25349 if (!tick_program_event(expires_next, 0)) {
25350@@ -1346,7 +1561,7 @@
25351 * Acquire base lock for updating the offsets and retrieving
25352 * the current time.
25353 */
25354- raw_spin_lock(&cpu_base->lock);
25355+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
25356 now = hrtimer_update_base(cpu_base);
25357 cpu_base->nr_retries++;
25358 if (++retries < 3)
25359@@ -1359,7 +1574,8 @@
25360 */
25361 cpu_base->nr_hangs++;
25362 cpu_base->hang_detected = 1;
25363- raw_spin_unlock(&cpu_base->lock);
25364+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25365+
25366 delta = ktime_sub(now, entry_time);
25367 if ((unsigned int)delta > cpu_base->max_hang_time)
25368 cpu_base->max_hang_time = (unsigned int) delta;
25369@@ -1401,6 +1617,7 @@
25370 void hrtimer_run_queues(void)
25371 {
25372 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25373+ unsigned long flags;
25374 ktime_t now;
1a6e0f06 25375
e4b2b4a8
JK
25376 if (__hrtimer_hres_active(cpu_base))
25377@@ -1418,10 +1635,17 @@
25378 return;
25379 }
1a6e0f06 25380
e4b2b4a8
JK
25381- raw_spin_lock(&cpu_base->lock);
25382+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
25383 now = hrtimer_update_base(cpu_base);
25384- __hrtimer_run_queues(cpu_base, now);
25385- raw_spin_unlock(&cpu_base->lock);
25386+
25387+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25388+ cpu_base->softirq_expires_next = KTIME_MAX;
25389+ cpu_base->softirq_activated = 1;
25390+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25391+ }
25392+
25393+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
25394+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1a6e0f06
JK
25395 }
25396
e4b2b4a8
JK
25397 /*
25398@@ -1440,13 +1664,65 @@
25399 return HRTIMER_NORESTART;
1a6e0f06
JK
25400 }
25401
e4b2b4a8
JK
25402-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25403+#ifdef CONFIG_PREEMPT_RT_FULL
25404+static bool task_is_realtime(struct task_struct *tsk)
1a6e0f06 25405 {
e4b2b4a8 25406+ int policy = tsk->policy;
1a6e0f06 25407+
e4b2b4a8
JK
25408+ if (policy == SCHED_FIFO || policy == SCHED_RR)
25409+ return true;
25410+ if (policy == SCHED_DEADLINE)
25411+ return true;
25412+ return false;
25413+}
1a6e0f06 25414+#endif
e4b2b4a8
JK
25415+
25416+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
25417+ clockid_t clock_id,
25418+ enum hrtimer_mode mode,
25419+ struct task_struct *task)
25420+{
25421+#ifdef CONFIG_PREEMPT_RT_FULL
25422+ if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
25423+ if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
25424+ mode |= HRTIMER_MODE_HARD;
25425+ else
25426+ mode |= HRTIMER_MODE_SOFT;
25427+ }
1a6e0f06 25428+#endif
e4b2b4a8
JK
25429+ __hrtimer_init(&sl->timer, clock_id, mode);
25430 sl->timer.function = hrtimer_wakeup;
25431 sl->task = task;
25432 }
25433+
25434+/**
25435+ * hrtimer_init_sleeper - initialize sleeper to the given clock
25436+ * @sl: sleeper to be initialized
25437+ * @clock_id: the clock to be used
25438+ * @mode: timer mode abs/rel
25439+ * @task: the task to wake up
25440+ */
25441+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
25442+ enum hrtimer_mode mode, struct task_struct *task)
25443+{
25444+ debug_init(&sl->timer, clock_id, mode);
25445+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
25446+
25447+}
25448 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1a6e0f06 25449
e4b2b4a8
JK
25450+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
25451+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
25452+ clockid_t clock_id,
25453+ enum hrtimer_mode mode,
25454+ struct task_struct *task)
25455+{
25456+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
25457+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
25458+}
25459+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
25460+#endif
1a6e0f06 25461+
e4b2b4a8
JK
25462 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
25463 {
25464 switch(restart->nanosleep.type) {
25465@@ -1470,8 +1746,6 @@
25466 {
25467 struct restart_block *restart;
1a6e0f06 25468
e4b2b4a8
JK
25469- hrtimer_init_sleeper(t, current);
25470-
25471 do {
25472 set_current_state(TASK_INTERRUPTIBLE);
25473 hrtimer_start_expires(&t->timer, mode);
25474@@ -1508,10 +1782,9 @@
25475 struct hrtimer_sleeper t;
25476 int ret;
1a6e0f06 25477
e4b2b4a8
JK
25478- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
25479- HRTIMER_MODE_ABS);
25480+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
25481+ HRTIMER_MODE_ABS, current);
25482 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25483-
25484 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
25485 destroy_hrtimer_on_stack(&t.timer);
25486 return ret;
25487@@ -1529,7 +1802,7 @@
25488 if (dl_task(current) || rt_task(current))
25489 slack = 0;
1a6e0f06 25490
e4b2b4a8
JK
25491- hrtimer_init_on_stack(&t.timer, clockid, mode);
25492+ hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
25493 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
25494 ret = do_nanosleep(&t, mode);
25495 if (ret != -ERESTART_RESTARTBLOCK)
25496@@ -1585,6 +1858,27 @@
25497 }
25498 #endif
1a6e0f06 25499
e4b2b4a8
JK
25500+#ifdef CONFIG_PREEMPT_RT_FULL
25501+/*
25502+ * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25503+ */
25504+void cpu_chill(void)
25505+{
25506+ ktime_t chill_time;
25507+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25508+
25509+ chill_time = ktime_set(0, NSEC_PER_MSEC);
25510+ set_current_state(TASK_UNINTERRUPTIBLE);
25511+ current->flags |= PF_NOFREEZE;
25512+ sleeping_lock_inc();
25513+ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
25514+ sleeping_lock_dec();
25515+ if (!freeze_flag)
25516+ current->flags &= ~PF_NOFREEZE;
25517+}
25518+EXPORT_SYMBOL(cpu_chill);
25519+#endif
25520+
25521 /*
25522 * Functions related to boot-time initialization:
25523 */
25524@@ -1598,9 +1892,17 @@
25525 timerqueue_init_head(&cpu_base->clock_base[i].active);
25526 }
1a6e0f06 25527
e4b2b4a8
JK
25528- cpu_base->active_bases = 0;
25529 cpu_base->cpu = cpu;
25530- hrtimer_init_hres(cpu_base);
25531+ cpu_base->active_bases = 0;
25532+ cpu_base->hres_active = 0;
25533+ cpu_base->hang_detected = 0;
25534+ cpu_base->next_timer = NULL;
25535+ cpu_base->softirq_next_timer = NULL;
25536+ cpu_base->expires_next = KTIME_MAX;
25537+ cpu_base->softirq_expires_next = KTIME_MAX;
25538+#ifdef CONFIG_PREEMPT_RT_BASE
25539+ init_waitqueue_head(&cpu_base->wait);
25540+#endif
25541 return 0;
25542 }
1a6e0f06 25543
e4b2b4a8
JK
25544@@ -1632,7 +1934,7 @@
25545 * sort out already expired timers and reprogram the
25546 * event device.
25547 */
25548- enqueue_hrtimer(timer, new_base);
25549+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
25550 }
25551 }
1a6e0f06 25552
e4b2b4a8
JK
25553@@ -1644,6 +1946,12 @@
25554 BUG_ON(cpu_online(scpu));
25555 tick_cancel_sched_timer(scpu);
25556
25557+ /*
25558+ * this BH disable ensures that raise_softirq_irqoff() does
25559+ * not wakeup ksoftirqd (and acquire the pi-lock) while
25560+ * holding the cpu_base lock
25561+ */
25562+ local_bh_disable();
1a6e0f06 25563 local_irq_disable();
e4b2b4a8
JK
25564 old_base = &per_cpu(hrtimer_bases, scpu);
25565 new_base = this_cpu_ptr(&hrtimer_bases);
25566@@ -1659,12 +1967,19 @@
25567 &new_base->clock_base[i]);
25568 }
1a6e0f06 25569
e4b2b4a8
JK
25570+ /*
25571+ * The migration might have changed the first expiring softirq
25572+ * timer on this CPU. Update it.
25573+ */
25574+ hrtimer_update_softirq_timer(new_base, false);
25575+
25576 raw_spin_unlock(&old_base->lock);
25577 raw_spin_unlock(&new_base->lock);
25578
25579 /* Check, if we got expired work to do */
25580 __hrtimer_peek_ahead_timers();
1a6e0f06 25581 local_irq_enable();
e4b2b4a8
JK
25582+ local_bh_enable();
25583 return 0;
25584 }
1a6e0f06 25585
e4b2b4a8
JK
25586@@ -1673,18 +1988,19 @@
25587 void __init hrtimers_init(void)
25588 {
25589 hrtimers_prepare_cpu(smp_processor_id());
25590+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
1a6e0f06
JK
25591 }
25592
1a6e0f06 25593 /**
e4b2b4a8
JK
25594 * schedule_hrtimeout_range_clock - sleep until timeout
25595 * @expires: timeout value (ktime_t)
25596 * @delta: slack in expires timeout (ktime_t)
25597- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25598- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
25599+ * @mode: timer mode
25600+ * @clock_id: timer clock to be used
1a6e0f06 25601 */
e4b2b4a8
JK
25602 int __sched
25603 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
25604- const enum hrtimer_mode mode, int clock)
25605+ const enum hrtimer_mode mode, clockid_t clock_id)
25606 {
25607 struct hrtimer_sleeper t;
25608
25609@@ -1705,11 +2021,9 @@
25610 return -EINTR;
1a6e0f06
JK
25611 }
25612
e4b2b4a8
JK
25613- hrtimer_init_on_stack(&t.timer, clock, mode);
25614+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
25615 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
25616
25617- hrtimer_init_sleeper(&t, current);
25618-
25619 hrtimer_start_expires(&t.timer, mode);
25620
25621 if (likely(t.task))
25622@@ -1727,7 +2041,7 @@
25623 * schedule_hrtimeout_range - sleep until timeout
25624 * @expires: timeout value (ktime_t)
25625 * @delta: slack in expires timeout (ktime_t)
25626- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25627+ * @mode: timer mode
25628 *
25629 * Make the current task sleep until the given expiry time has
25630 * elapsed. The routine will return immediately unless
25631@@ -1766,7 +2080,7 @@
25632 /**
25633 * schedule_hrtimeout - sleep until timeout
25634 * @expires: timeout value (ktime_t)
25635- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25636+ * @mode: timer mode
25637 *
25638 * Make the current task sleep until the given expiry time has
25639 * elapsed. The routine will return immediately unless
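The kernel-doc hunks above only reword the @mode description of the schedule_hrtimeout*() helpers; the calling convention itself does not change. For orientation, a minimal sketch of a typical caller (short_nap() is a hypothetical helper, assuming sleepable task context):

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/* Sleep for roughly 2 ms; returns 0 on timeout, -EINTR if woken early. */
static int short_nap(void)
{
	ktime_t timeout = ms_to_ktime(2);

	set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_hrtimeout(&timeout, HRTIMER_MODE_REL);
}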
25640diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/itimer.c linux-4.14/kernel/time/itimer.c
25641--- linux-4.14.orig/kernel/time/itimer.c 2017-11-12 19:46:13.000000000 +0100
25642+++ linux-4.14/kernel/time/itimer.c 2018-09-05 11:05:07.000000000 +0200
25643@@ -214,6 +214,7 @@
25644 /* We are sharing ->siglock with it_real_fn() */
25645 if (hrtimer_try_to_cancel(timer) < 0) {
25646 spin_unlock_irq(&tsk->sighand->siglock);
25647+ hrtimer_wait_for_timer(&tsk->signal->real_timer);
25648 goto again;
25649 }
25650 expires = timeval_to_ktime(value->it_value);
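The itimer.c change closes a kernel-internal race only: when hrtimer_try_to_cancel() fails because the callback is running, the RT kernel now drops the lock and waits via hrtimer_wait_for_timer() before retrying, rather than looping with the callback possibly starved. The userspace-visible behaviour of setitimer() is unchanged; a plain POSIX program exercising that disarm path (illustrative only, not derived from the patch) is:

#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	(void)sig;
	write(STDOUT_FILENO, "tick\n", 5);	/* async-signal-safe */
}

int main(void)
{
	struct itimerval it = {
		.it_interval = { .tv_sec = 0, .tv_usec = 100000 },	/* 100 ms period */
		.it_value    = { .tv_sec = 0, .tv_usec = 100000 },	/* first expiry  */
	};

	signal(SIGALRM, on_alarm);
	setitimer(ITIMER_REAL, &it, NULL);	/* arms signal->real_timer in the kernel */
	sleep(1);
	it.it_value.tv_usec = 0;		/* it_value == 0 disarms the timer */
	it.it_interval.tv_usec = 0;
	setitimer(ITIMER_REAL, &it, NULL);	/* the path that can hit the retry loop */
	return 0;
}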
25651diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/jiffies.c linux-4.14/kernel/time/jiffies.c
25652--- linux-4.14.orig/kernel/time/jiffies.c 2017-11-12 19:46:13.000000000 +0100
25653+++ linux-4.14/kernel/time/jiffies.c 2018-09-05 11:05:07.000000000 +0200
25654@@ -74,7 +74,8 @@
25655 .max_cycles = 10,
25656 };
25657
25658-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25659+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25660+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25661
25662 #if (BITS_PER_LONG < 64)
25663 u64 get_jiffies_64(void)
25664@@ -83,9 +84,9 @@
25665 u64 ret;
25666
25667 do {
25668- seq = read_seqbegin(&jiffies_lock);
25669+ seq = read_seqcount_begin(&jiffies_seq);
25670 ret = jiffies_64;
25671- } while (read_seqretry(&jiffies_lock, seq));
25672+ } while (read_seqcount_retry(&jiffies_seq, seq));
25673 return ret;
1a6e0f06 25674 }
e4b2b4a8
JK
25675 EXPORT_SYMBOL(get_jiffies_64);
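On RT the old seqlock_t cannot be taken from hard interrupt context any more because its embedded spinlock_t becomes a sleeping lock, so the hunk above splits jiffies_lock into a raw spinlock (writer serialization) plus a bare seqcount_t (reader retry). The resulting pattern is already visible in get_jiffies_64() above and in the tick-common.c hunks further down; sketched here for clarity (jiffies_advance() and jiffies_snapshot() are illustrative names, not functions from the patch):

#include <linux/jiffies.h>
#include <linux/seqlock.h>

/* jiffies_lock and jiffies_seq are the symbols defined in the hunk above. */

/* Writer (tick side): writers serialize on the raw lock; readers only look
 * at the seqcount, so they never block behind a preempted writer. */
static void jiffies_advance(u64 ticks)
{
	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);
	jiffies_64 += ticks;
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
}

/* Reader: lockless retry loop, same shape as get_jiffies_64() above. */
static u64 jiffies_snapshot(void)
{
	unsigned int seq;
	u64 ret;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		ret = jiffies_64;
	} while (read_seqcount_retry(&jiffies_seq, seq));

	return ret;
}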
25676diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-cpu-timers.c linux-4.14/kernel/time/posix-cpu-timers.c
25677--- linux-4.14.orig/kernel/time/posix-cpu-timers.c 2017-11-12 19:46:13.000000000 +0100
25678+++ linux-4.14/kernel/time/posix-cpu-timers.c 2018-09-05 11:05:07.000000000 +0200
25679@@ -3,8 +3,10 @@
25680 * Implement CPU time clocks for the POSIX clock interface.
25681 */
1a6e0f06 25682
e4b2b4a8
JK
25683+#include <uapi/linux/sched/types.h>
25684 #include <linux/sched/signal.h>
25685 #include <linux/sched/cputime.h>
25686+#include <linux/sched/rt.h>
25687 #include <linux/posix-timers.h>
25688 #include <linux/errno.h>
25689 #include <linux/math64.h>
25690@@ -14,6 +16,7 @@
25691 #include <linux/tick.h>
25692 #include <linux/workqueue.h>
25693 #include <linux/compat.h>
25694+#include <linux/smpboot.h>
1a6e0f06 25695
e4b2b4a8 25696 #include "posix-timers.h"
1a6e0f06 25697
e4b2b4a8
JK
25698@@ -603,7 +606,7 @@
25699 /*
25700 * Disarm any old timer after extracting its expiry time.
25701 */
25702- WARN_ON_ONCE(!irqs_disabled());
25703+ WARN_ON_ONCE_NONRT(!irqs_disabled());
1a6e0f06 25704
e4b2b4a8
JK
25705 ret = 0;
25706 old_incr = timer->it.cpu.incr;
25707@@ -1034,7 +1037,7 @@
25708 /*
25709 * Now re-arm for the new expiry time.
25710 */
25711- WARN_ON_ONCE(!irqs_disabled());
25712+ WARN_ON_ONCE_NONRT(!irqs_disabled());
25713 arm_timer(timer);
25714 unlock:
25715 unlock_task_sighand(p, &flags);
25716@@ -1119,13 +1122,13 @@
25717 * already updated our counts. We need to check if any timers fire now.
25718 * Interrupts are disabled.
25719 */
25720-void run_posix_cpu_timers(struct task_struct *tsk)
25721+static void __run_posix_cpu_timers(struct task_struct *tsk)
25722 {
25723 LIST_HEAD(firing);
25724 struct k_itimer *timer, *next;
25725 unsigned long flags;
1a6e0f06 25726
e4b2b4a8
JK
25727- WARN_ON_ONCE(!irqs_disabled());
25728+ WARN_ON_ONCE_NONRT(!irqs_disabled());
1a6e0f06 25729
e4b2b4a8
JK
25730 /*
25731 * The fast path checks that there are no expired thread or thread
25732@@ -1179,6 +1182,152 @@
1a6e0f06 25733 }
1a6e0f06 25734 }
1a6e0f06 25735
e4b2b4a8
JK
25736+#ifdef CONFIG_PREEMPT_RT_BASE
25737+#include <linux/kthread.h>
25738+#include <linux/cpu.h>
25739+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25740+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25741+DEFINE_PER_CPU(bool, posix_timer_th_active);
1a6e0f06 25742+
e4b2b4a8 25743+static void posix_cpu_kthread_fn(unsigned int cpu)
1a6e0f06 25744+{
e4b2b4a8
JK
25745+ struct task_struct *tsk = NULL;
25746+ struct task_struct *next = NULL;
1a6e0f06 25747+
e4b2b4a8
JK
25748+ BUG_ON(per_cpu(posix_timer_task, cpu) != current);
25749+
25750+ /* grab task list */
25751+ raw_local_irq_disable();
25752+ tsk = per_cpu(posix_timer_tasklist, cpu);
25753+ per_cpu(posix_timer_tasklist, cpu) = NULL;
25754+ raw_local_irq_enable();
25755+
25756+ /* it's possible the list is empty, just return */
25757+ if (!tsk)
25758+ return;
25759+
25760+ /* Process task list */
25761+ while (1) {
25762+ /* save next */
25763+ next = tsk->posix_timer_list;
25764+
25765+ /* run the task timers, clear its ptr and
25766+ * unreference it
25767+ */
25768+ __run_posix_cpu_timers(tsk);
25769+ tsk->posix_timer_list = NULL;
25770+ put_task_struct(tsk);
25771+
25772+ /* check if this is the last on the list */
25773+ if (next == tsk)
25774+ break;
25775+ tsk = next;
1a6e0f06
JK
25776+ }
25777+}
25778+
e4b2b4a8 25779+static inline int __fastpath_timer_check(struct task_struct *tsk)
1a6e0f06 25780+{
e4b2b4a8
JK
25781+ /* tsk == current, ensure it is safe to use ->signal/sighand */
25782+ if (unlikely(tsk->exit_state))
25783+ return 0;
1a6e0f06 25784+
e4b2b4a8
JK
25785+ if (!task_cputime_zero(&tsk->cputime_expires))
25786+ return 1;
25787+
25788+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
25789+ return 1;
25790+
25791+ return 0;
1a6e0f06
JK
25792+}
25793+
e4b2b4a8
JK
25794+void run_posix_cpu_timers(struct task_struct *tsk)
25795+{
25796+ unsigned int cpu = smp_processor_id();
25797+ struct task_struct *tasklist;
1a6e0f06 25798+
e4b2b4a8
JK
25799+ BUG_ON(!irqs_disabled());
25800+
25801+ if (per_cpu(posix_timer_th_active, cpu) != true)
25802+ return;
25803+
25804+ /* get per-cpu references */
25805+ tasklist = per_cpu(posix_timer_tasklist, cpu);
25806+
25807+ /* check to see if we're already queued */
25808+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25809+ get_task_struct(tsk);
25810+ if (tasklist) {
25811+ tsk->posix_timer_list = tasklist;
25812+ } else {
25813+ /*
25814+ * The list is terminated by a self-pointing
25815+ * task_struct
25816+ */
25817+ tsk->posix_timer_list = tsk;
25818+ }
25819+ per_cpu(posix_timer_tasklist, cpu) = tsk;
25820+
25821+ wake_up_process(per_cpu(posix_timer_task, cpu));
25822+ }
25823+}
25824+
25825+static int posix_cpu_kthread_should_run(unsigned int cpu)
1a6e0f06 25826+{
e4b2b4a8 25827+ return __this_cpu_read(posix_timer_tasklist) != NULL;
1a6e0f06 25828+}
1a6e0f06 25829+
e4b2b4a8 25830+static void posix_cpu_kthread_park(unsigned int cpu)
1a6e0f06 25831+{
e4b2b4a8 25832+ this_cpu_write(posix_timer_th_active, false);
1a6e0f06
JK
25833+}
25834+
e4b2b4a8 25835+static void posix_cpu_kthread_unpark(unsigned int cpu)
1a6e0f06 25836+{
e4b2b4a8 25837+ this_cpu_write(posix_timer_th_active, true);
1a6e0f06 25838+}
1a6e0f06 25839+
e4b2b4a8
JK
25840+static void posix_cpu_kthread_setup(unsigned int cpu)
25841+{
25842+ struct sched_param sp;
25843+
25844+ sp.sched_priority = MAX_RT_PRIO - 1;
25845+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
25846+ posix_cpu_kthread_unpark(cpu);
25847+}
25848+
25849+static struct smp_hotplug_thread posix_cpu_thread = {
25850+ .store = &posix_timer_task,
25851+ .thread_should_run = posix_cpu_kthread_should_run,
25852+ .thread_fn = posix_cpu_kthread_fn,
25853+ .thread_comm = "posixcputmr/%u",
25854+ .setup = posix_cpu_kthread_setup,
25855+ .park = posix_cpu_kthread_park,
25856+ .unpark = posix_cpu_kthread_unpark,
25857+};
25858+
25859+static int __init posix_cpu_thread_init(void)
1a6e0f06 25860+{
e4b2b4a8
JK
25861+ /* Start one for boot CPU. */
25862+ unsigned long cpu;
25863+ int ret;
25864+
25865+ /* init the per-cpu posix_timer_tasklets */
25866+ for_each_possible_cpu(cpu)
25867+ per_cpu(posix_timer_tasklist, cpu) = NULL;
25868+
25869+ ret = smpboot_register_percpu_thread(&posix_cpu_thread);
25870+ WARN_ON(ret);
25871+
1a6e0f06
JK
25872+ return 0;
25873+}
e4b2b4a8
JK
25874+early_initcall(posix_cpu_thread_init);
25875+#else /* CONFIG_PREEMPT_RT_BASE */
25876+void run_posix_cpu_timers(struct task_struct *tsk)
25877+{
25878+ __run_posix_cpu_timers(tsk);
25879+}
25880+#endif /* CONFIG_PREEMPT_RT_BASE */
25881+
25882 /*
25883 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25884 * The tsk->sighand->siglock must be held by the caller.
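The posix-cpu-timers block above only changes where expiry processing runs on PREEMPT_RT (the per-CPU posixcputmr/%u kthread instead of hard interrupt context); the POSIX CPU-clock timer API is unchanged. A small, purely illustrative consumer using standard calls (build with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <time.h>

static volatile sig_atomic_t fired;

static void on_cputime(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
	struct itimerspec its = { .it_value = { .tv_nsec = 50 * 1000 * 1000 } };

	signal(SIGRTMIN, on_cputime);
	/* Fires after this process has consumed ~50 ms of CPU time. */
	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid))
		return 1;
	timer_settime(tid, 0, &its, NULL);
	while (!fired)
		;	/* burn CPU so the process CPU clock advances */
	puts("cpu-time timer fired");
	timer_delete(tid);
	return 0;
}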
25885diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-timers.c linux-4.14/kernel/time/posix-timers.c
25886--- linux-4.14.orig/kernel/time/posix-timers.c 2018-09-05 11:03:22.000000000 +0200
25887+++ linux-4.14/kernel/time/posix-timers.c 2018-09-05 11:05:07.000000000 +0200
25888@@ -434,6 +434,7 @@
25889 static struct pid *good_sigevent(sigevent_t * event)
25890 {
25891 struct task_struct *rtn = current->group_leader;
25892+ int sig = event->sigev_signo;
25893
25894 switch (event->sigev_notify) {
25895 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
25896@@ -443,7 +444,8 @@
25897 /* FALLTHRU */
25898 case SIGEV_SIGNAL:
25899 case SIGEV_THREAD:
25900- if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
25901+ if (sig <= 0 || sig > SIGRTMAX ||
25902+ sig_kernel_only(sig) || sig_kernel_coredump(sig))
25903 return NULL;
25904 /* FALLTHRU */
25905 case SIGEV_NONE:
25906@@ -469,7 +471,7 @@
25907
25908 static void k_itimer_rcu_free(struct rcu_head *head)
25909 {
25910- struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
25911+ struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
25912
25913 kmem_cache_free(posix_timers_cache, tmr);
25914 }
25915@@ -486,7 +488,7 @@
25916 }
25917 put_pid(tmr->it_pid);
25918 sigqueue_free(tmr->sigq);
25919- call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
25920+ call_rcu(&tmr->rcu, k_itimer_rcu_free);
25921 }
25922
25923 static int common_timer_create(struct k_itimer *new_timer)
25924@@ -825,6 +827,22 @@
25925 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
25926 }
25927
25928+/*
25929+ * Protected by RCU!
25930+ */
25931+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
25932+{
25933+#ifdef CONFIG_PREEMPT_RT_FULL
25934+ if (kc->timer_arm == common_hrtimer_arm)
25935+ hrtimer_wait_for_timer(&timr->it.real.timer);
25936+ else if (kc == &alarm_clock)
25937+ hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
25938+ else
25939+ /* FIXME: Whacky hack for posix-cpu-timers */
25940+ schedule_timeout(1);
1a6e0f06 25941+#endif
e4b2b4a8 25942+}
1a6e0f06 25943+
e4b2b4a8
JK
25944 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
25945 {
25946 return hrtimer_try_to_cancel(&timr->it.real.timer);
25947@@ -889,6 +907,7 @@
25948 if (!timr)
25949 return -EINVAL;
25950
25951+ rcu_read_lock();
25952 kc = timr->kclock;
25953 if (WARN_ON_ONCE(!kc || !kc->timer_set))
25954 error = -EINVAL;
25955@@ -897,9 +916,12 @@
25956
25957 unlock_timer(timr, flag);
25958 if (error == TIMER_RETRY) {
25959+ timer_wait_for_callback(kc, timr);
25960 old_spec64 = NULL; // We already got the old time...
25961+ rcu_read_unlock();
25962 goto retry;
25963 }
25964+ rcu_read_unlock();
25965
25966 return error;
25967 }
25968@@ -981,10 +1003,15 @@
25969 if (!timer)
25970 return -EINVAL;
25971
25972+ rcu_read_lock();
25973 if (timer_delete_hook(timer) == TIMER_RETRY) {
25974 unlock_timer(timer, flags);
25975+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25976+ timer);
25977+ rcu_read_unlock();
25978 goto retry_delete;
25979 }
25980+ rcu_read_unlock();
25981
25982 spin_lock(&current->sighand->siglock);
25983 list_del(&timer->list);
25984@@ -1010,8 +1037,18 @@
25985 retry_delete:
25986 spin_lock_irqsave(&timer->it_lock, flags);
25987
25988+ /* On RT we can race with a deletion */
25989+ if (!timer->it_signal) {
25990+ unlock_timer(timer, flags);
25991+ return;
25992+ }
25993+
25994 if (timer_delete_hook(timer) == TIMER_RETRY) {
25995+ rcu_read_lock();
25996 unlock_timer(timer, flags);
25997+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25998+ timer);
25999+ rcu_read_unlock();
26000 goto retry_delete;
26001 }
26002 list_del(&timer->list);
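A user-visible side effect of the good_sigevent() hunk above: timers can no longer be created with signals the kernel reserves for itself (SIGKILL, SIGSTOP, or coredumping signals). Purely as an illustration, not taken from the patch, a userspace check of the new rejection:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGKILL };

	/* With the hunk above applied this fails with EINVAL instead of
	 * creating a timer that would deliver SIGKILL to the process. */
	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == -1)
		printf("timer_create: errno=%d (EINVAL=%d)\n", errno, EINVAL);
	return 0;
}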
26003diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c linux-4.14/kernel/time/tick-broadcast-hrtimer.c
26004--- linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c 2017-11-12 19:46:13.000000000 +0100
26005+++ linux-4.14/kernel/time/tick-broadcast-hrtimer.c 2018-09-05 11:05:07.000000000 +0200
26006@@ -106,7 +106,7 @@
26007
26008 void tick_setup_hrtimer_broadcast(void)
1a6e0f06 26009 {
e4b2b4a8
JK
26010- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26011+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26012 bctimer.function = bc_handler;
26013 clockevents_register_device(&ce_broadcast_hrtimer);
26014 }
26015diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-common.c linux-4.14/kernel/time/tick-common.c
26016--- linux-4.14.orig/kernel/time/tick-common.c 2017-11-12 19:46:13.000000000 +0100
26017+++ linux-4.14/kernel/time/tick-common.c 2018-09-05 11:05:07.000000000 +0200
26018@@ -79,13 +79,15 @@
26019 static void tick_periodic(int cpu)
26020 {
26021 if (tick_do_timer_cpu == cpu) {
26022- write_seqlock(&jiffies_lock);
26023+ raw_spin_lock(&jiffies_lock);
26024+ write_seqcount_begin(&jiffies_seq);
1a6e0f06 26025
e4b2b4a8
JK
26026 /* Keep track of the next tick event */
26027 tick_next_period = ktime_add(tick_next_period, tick_period);
1a6e0f06 26028
e4b2b4a8
JK
26029 do_timer(1);
26030- write_sequnlock(&jiffies_lock);
26031+ write_seqcount_end(&jiffies_seq);
26032+ raw_spin_unlock(&jiffies_lock);
26033 update_wall_time();
26034 }
1a6e0f06 26035
e4b2b4a8
JK
26036@@ -157,9 +159,9 @@
26037 ktime_t next;
1a6e0f06 26038
e4b2b4a8
JK
26039 do {
26040- seq = read_seqbegin(&jiffies_lock);
26041+ seq = read_seqcount_begin(&jiffies_seq);
26042 next = tick_next_period;
26043- } while (read_seqretry(&jiffies_lock, seq));
26044+ } while (read_seqcount_retry(&jiffies_seq, seq));
1a6e0f06 26045
e4b2b4a8
JK
26046 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26047
26048diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-internal.h linux-4.14/kernel/time/tick-internal.h
26049--- linux-4.14.orig/kernel/time/tick-internal.h 2017-11-12 19:46:13.000000000 +0100
26050+++ linux-4.14/kernel/time/tick-internal.h 2018-09-05 11:05:07.000000000 +0200
26051@@ -150,16 +150,15 @@
26052
26053 #ifdef CONFIG_NO_HZ_COMMON
26054 extern unsigned long tick_nohz_active;
26055-#else
26056+extern void timers_update_nohz(void);
26057+# ifdef CONFIG_SMP
26058+extern struct static_key_false timers_migration_enabled;
26059+# endif
26060+#else /* CONFIG_NO_HZ_COMMON */
26061+static inline void timers_update_nohz(void) { }
26062 #define tick_nohz_active (0)
26063 #endif
26064
26065-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26066-extern void timers_update_migration(bool update_nohz);
26067-#else
26068-static inline void timers_update_migration(bool update_nohz) { }
26069-#endif
26070-
26071 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26072
26073 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
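tick-internal.h now exposes timers_migration_enabled as a static key in place of the old timers_update_migration(bool) plumbing; get_target_base() earlier in this patch tests it with static_branch_unlikely(). The enable/disable side lives in kernel/time/timer.c and is not part of this excerpt; a hedged sketch of what such a flip site looks like (timers_set_migration() is an illustrative name):

#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/* Toggled when NOHZ is activated or the timer_migration sysctl changes
 * (sketch only; the real code sits in kernel/time/timer.c). */
static void timers_set_migration(bool on)
{
	if (on)
		static_branch_enable(&timers_migration_enabled);
	else
		static_branch_disable(&timers_migration_enabled);
}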
26074diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-sched.c linux-4.14/kernel/time/tick-sched.c
26075--- linux-4.14.orig/kernel/time/tick-sched.c 2018-09-05 11:03:22.000000000 +0200
26076+++ linux-4.14/kernel/time/tick-sched.c 2018-09-05 11:05:07.000000000 +0200
26077@@ -66,7 +66,8 @@
26078 return;
26079
26080 /* Reevaluate with jiffies_lock held */
26081- write_seqlock(&jiffies_lock);
26082+ raw_spin_lock(&jiffies_lock);
26083+ write_seqcount_begin(&jiffies_seq);
26084
26085 delta = ktime_sub(now, last_jiffies_update);
26086 if (delta >= tick_period) {
26087@@ -89,10 +90,12 @@
26088 /* Keep the tick_next_period variable up to date */
26089 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26090 } else {
26091- write_sequnlock(&jiffies_lock);
26092+ write_seqcount_end(&jiffies_seq);
26093+ raw_spin_unlock(&jiffies_lock);
26094 return;
26095 }
26096- write_sequnlock(&jiffies_lock);
26097+ write_seqcount_end(&jiffies_seq);
26098+ raw_spin_unlock(&jiffies_lock);
26099 update_wall_time();
26100 }
26101
26102@@ -103,12 +106,14 @@
26103 {
26104 ktime_t period;
26105
26106- write_seqlock(&jiffies_lock);
26107+ raw_spin_lock(&jiffies_lock);
26108+ write_seqcount_begin(&jiffies_seq);
26109 /* Did we start the jiffies update yet ? */
26110 if (last_jiffies_update == 0)
26111 last_jiffies_update = tick_next_period;
26112 period = last_jiffies_update;
26113- write_sequnlock(&jiffies_lock);
26114+ write_seqcount_end(&jiffies_seq);
26115+ raw_spin_unlock(&jiffies_lock);
26116 return period;
26117 }
26118
26119@@ -225,6 +230,7 @@
26120
26121 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26122 .func = nohz_full_kick_func,
26123+ .flags = IRQ_WORK_HARD_IRQ,
26124 };
26125
26126 /*
26127@@ -689,10 +695,10 @@
26128
26129 /* Read jiffies and the time when jiffies were updated last */
26130 do {
26131- seq = read_seqbegin(&jiffies_lock);
26132+ seq = read_seqcount_begin(&jiffies_seq);
26133 basemono = last_jiffies_update;
26134 basejiff = jiffies;
26135- } while (read_seqretry(&jiffies_lock, seq));
26136+ } while (read_seqcount_retry(&jiffies_seq, seq));
26137 ts->last_jiffies = basejiff;
26138
26139 /*
26140@@ -906,14 +912,7 @@
26141 return false;
26142
26143 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26144- static int ratelimit;
26145-
26146- if (ratelimit < 10 &&
26147- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26148- pr_warn("NOHZ: local_softirq_pending %02x\n",
26149- (unsigned int) local_softirq_pending());
26150- ratelimit++;
26151- }
26152+ softirq_check_pending_idle();
26153 return false;
26154 }
26155
26156@@ -1132,7 +1131,7 @@
26157 ts->nohz_mode = mode;
26158 /* One update is enough */
26159 if (!test_and_set_bit(0, &tick_nohz_active))
26160- timers_update_migration(true);
26161+ timers_update_nohz();
26162 }
26163
26164 /**
26165@@ -1250,7 +1249,7 @@
26166 /*
26167 * Emulate tick processing via per-CPU hrtimers:
26168 */
26169- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26170+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26171 ts->sched_timer.function = tick_sched_timer;
26172
26173 /* Get the next period (per-CPU) */
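Besides the jiffies_seq conversion, tick-sched.c marks the nohz_full kick irq_work with IRQ_WORK_HARD_IRQ so RT executes it straight from the hard interrupt rather than from the irq_work softirq thread, and moves the per-CPU sched_timer to HRTIMER_MODE_ABS_HARD. A sketch of the irq_work flagging with an invented work item (IRQ_WORK_HARD_IRQ is assumed to come from this patch set):

#include <linux/irq_work.h>
#include <linux/percpu.h>

static void example_kick(struct irq_work *work)
{
	/* runs in hard interrupt context even on PREEMPT_RT */
}

static DEFINE_PER_CPU(struct irq_work, example_kick_work) = {
	.func  = example_kick,
	.flags = IRQ_WORK_HARD_IRQ,
};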
26174diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.c linux-4.14/kernel/time/timekeeping.c
26175--- linux-4.14.orig/kernel/time/timekeeping.c 2017-11-12 19:46:13.000000000 +0100
26176+++ linux-4.14/kernel/time/timekeeping.c 2018-09-05 11:05:07.000000000 +0200
26177@@ -2326,8 +2326,10 @@
26178 */
26179 void xtime_update(unsigned long ticks)
26180 {
26181- write_seqlock(&jiffies_lock);
26182+ raw_spin_lock(&jiffies_lock);
26183+ write_seqcount_begin(&jiffies_seq);
26184 do_timer(ticks);
26185- write_sequnlock(&jiffies_lock);
26186+ write_seqcount_end(&jiffies_seq);
26187+ raw_spin_unlock(&jiffies_lock);
26188 update_wall_time();
26189 }
26190diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.h linux-4.14/kernel/time/timekeeping.h
26191--- linux-4.14.orig/kernel/time/timekeeping.h 2017-11-12 19:46:13.000000000 +0100
26192+++ linux-4.14/kernel/time/timekeeping.h 2018-09-05 11:05:07.000000000 +0200
26193@@ -18,7 +18,8 @@
26194 extern void do_timer(unsigned long ticks);
26195 extern void update_wall_time(void);
26196
26197-extern seqlock_t jiffies_lock;
26198+extern raw_spinlock_t jiffies_lock;
26199+extern seqcount_t jiffies_seq;
26200
26201 #define CS_NAME_LEN 32
26202
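The header now only declares the split lock; the matching definitions are outside this excerpt. Presumably (an assumption, not shown here; in later mainline kernels they live in kernel/time/jiffies.c) they look roughly like:

#include <linux/spinlock.h>
#include <linux/seqlock.h>

/* assumed definitions matching the externs above */
__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
seqcount_t jiffies_seq;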
26203diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timer.c linux-4.14/kernel/time/timer.c
26204--- linux-4.14.orig/kernel/time/timer.c 2018-09-05 11:03:22.000000000 +0200
26205+++ linux-4.14/kernel/time/timer.c 2018-09-05 11:05:07.000000000 +0200
26206@@ -44,6 +44,7 @@
26207 #include <linux/sched/debug.h>
26208 #include <linux/slab.h>
26209 #include <linux/compat.h>
26210+#include <linux/swait.h>
26211
26212 #include <linux/uaccess.h>
26213 #include <asm/unistd.h>
26214@@ -197,11 +198,12 @@
26215 struct timer_base {
26216 raw_spinlock_t lock;
26217 struct timer_list *running_timer;
26218+#ifdef CONFIG_PREEMPT_RT_FULL
26219+ struct swait_queue_head wait_for_running_timer;
26220+#endif
26221 unsigned long clk;
26222 unsigned long next_expiry;
26223 unsigned int cpu;
26224- bool migration_enabled;
26225- bool nohz_active;
26226 bool is_idle;
26227 bool must_forward_clk;
26228 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
26229@@ -210,45 +212,73 @@
26230
26231 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
26232
26233-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26234+#ifdef CONFIG_NO_HZ_COMMON
26235+
26236+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26237+static DEFINE_MUTEX(timer_keys_mutex);
26238+
26239+static struct swork_event timer_update_swork;
26240+
26241+#ifdef CONFIG_SMP
26242 unsigned int sysctl_timer_migration = 1;
26243
26244-void timers_update_migration(bool update_nohz)
26245+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
26246+
26247+static void timers_update_migration(void)
26248 {
26249 bool on = sysctl_timer_migration && tick_nohz_active;
26250- unsigned int cpu;
26251
26252- /* Avoid the loop, if nothing to update */
26253- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26254- return;
26255+ if (on)
26256+ static_branch_enable(&timers_migration_enabled);
26257+ else
26258+ static_branch_disable(&timers_migration_enabled);
26259+}
26260+#else
26261+static inline void timers_update_migration(void) { }
26262+#endif /* !CONFIG_SMP */
26263
26264- for_each_possible_cpu(cpu) {
26265- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26266- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26267- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26268- if (!update_nohz)
26269- continue;
26270- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26271- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26272- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26273- }
26274+static void timer_update_keys(struct swork_event *event)
26275+{
26276+ mutex_lock(&timer_keys_mutex);
26277+ timers_update_migration();
26278+ static_branch_enable(&timers_nohz_active);
26279+ mutex_unlock(&timer_keys_mutex);
26280 }
26281
26282+void timers_update_nohz(void)
26283+{
26284+ swork_queue(&timer_update_swork);
26285+}
26286+
26287+static __init int hrtimer_init_thread(void)
26288+{
26289+ WARN_ON(swork_get());
26290+ INIT_SWORK(&timer_update_swork, timer_update_keys);
26291+ return 0;
26292+}
26293+early_initcall(hrtimer_init_thread);
26294+
26295 int timer_migration_handler(struct ctl_table *table, int write,
26296 void __user *buffer, size_t *lenp,
26297 loff_t *ppos)
26298 {
26299- static DEFINE_MUTEX(mutex);
26300 int ret;
26301
26302- mutex_lock(&mutex);
26303+ mutex_lock(&timer_keys_mutex);
26304 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
26305 if (!ret && write)
26306- timers_update_migration(false);
26307- mutex_unlock(&mutex);
26308+ timers_update_migration();
26309+ mutex_unlock(&timer_keys_mutex);
26310 return ret;
26311 }
26312-#endif
26313+
26314+static inline bool is_timers_nohz_active(void)
26315+{
26316+ return static_branch_unlikely(&timers_nohz_active);
26317+}
26318+#else
26319+static inline bool is_timers_nohz_active(void) { return false; }
26320+#endif /* NO_HZ_COMMON */
26321
26322 static unsigned long round_jiffies_common(unsigned long j, int cpu,
26323 bool force_up)
26324@@ -534,7 +564,7 @@
26325 static void
26326 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
26327 {
26328- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
26329+ if (!is_timers_nohz_active())
26330 return;
26331
26332 /*
26333@@ -840,21 +870,20 @@
26334 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
26335 }
26336
26337-#ifdef CONFIG_NO_HZ_COMMON
26338 static inline struct timer_base *
26339 get_target_base(struct timer_base *base, unsigned tflags)
26340 {
26341-#ifdef CONFIG_SMP
26342- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
26343- return get_timer_this_cpu_base(tflags);
26344- return get_timer_cpu_base(tflags, get_nohz_timer_target());
26345-#else
26346- return get_timer_this_cpu_base(tflags);
26347+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26348+ if (static_branch_unlikely(&timers_migration_enabled) &&
26349+ !(tflags & TIMER_PINNED))
26350+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
26351 #endif
26352+ return get_timer_this_cpu_base(tflags);
26353 }
26354
26355 static inline void forward_timer_base(struct timer_base *base)
26356 {
26357+#ifdef CONFIG_NO_HZ_COMMON
26358 unsigned long jnow;
26359
26360 /*
26361@@ -878,16 +907,8 @@
26362 base->clk = jnow;
26363 else
26364 base->clk = base->next_expiry;
26365-}
26366-#else
26367-static inline struct timer_base *
26368-get_target_base(struct timer_base *base, unsigned tflags)
26369-{
26370- return get_timer_this_cpu_base(tflags);
26371-}
26372-
26373-static inline void forward_timer_base(struct timer_base *base) { }
26374 #endif
26375+}
26376
26377
26378 /*
26379@@ -1130,6 +1151,33 @@
26380 }
26381 EXPORT_SYMBOL_GPL(add_timer_on);
26382
26383+#ifdef CONFIG_PREEMPT_RT_FULL
26384+/*
26385+ * Wait for a running timer
26386+ */
26387+static void wait_for_running_timer(struct timer_list *timer)
26388+{
26389+ struct timer_base *base;
26390+ u32 tf = timer->flags;
26391+
26392+ if (tf & TIMER_MIGRATING)
26393+ return;
26394+
26395+ base = get_timer_base(tf);
26396+ swait_event(base->wait_for_running_timer,
26397+ base->running_timer != timer);
26398+}
26399+
26400+# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
26401+#else
26402+static inline void wait_for_running_timer(struct timer_list *timer)
26403+{
26404+ cpu_relax();
26405+}
26406+
26407+# define wakeup_timer_waiters(b) do { } while (0)
26408+#endif
26409+
26410 /**
26411 * del_timer - deactivate a timer.
26412 * @timer: the timer to be deactivated
26413@@ -1185,7 +1233,7 @@
26414 }
26415 EXPORT_SYMBOL(try_to_del_timer_sync);
26416
26417-#ifdef CONFIG_SMP
26418+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26419 /**
26420 * del_timer_sync - deactivate a timer and wait for the handler to finish.
26421 * @timer: the timer to be deactivated
26422@@ -1245,7 +1293,7 @@
26423 int ret = try_to_del_timer_sync(timer);
26424 if (ret >= 0)
26425 return ret;
26426- cpu_relax();
26427+ wait_for_running_timer(timer);
26428 }
26429 }
26430 EXPORT_SYMBOL(del_timer_sync);
26431@@ -1309,13 +1357,16 @@
26432 fn = timer->function;
26433 data = timer->data;
26434
26435- if (timer->flags & TIMER_IRQSAFE) {
26436+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
26437+ timer->flags & TIMER_IRQSAFE) {
26438 raw_spin_unlock(&base->lock);
26439 call_timer_fn(timer, fn, data);
26440+ base->running_timer = NULL;
26441 raw_spin_lock(&base->lock);
26442 } else {
26443 raw_spin_unlock_irq(&base->lock);
26444 call_timer_fn(timer, fn, data);
26445+ base->running_timer = NULL;
26446 raw_spin_lock_irq(&base->lock);
26447 }
26448 }
26449@@ -1584,13 +1635,13 @@
26450
26451 /* Note: this timer irq context must be accounted for as well. */
26452 account_process_tick(p, user_tick);
26453+ scheduler_tick();
26454 run_local_timers();
26455 rcu_check_callbacks(user_tick);
26456-#ifdef CONFIG_IRQ_WORK
26457+#if defined(CONFIG_IRQ_WORK)
26458 if (in_irq())
26459 irq_work_tick();
26460 #endif
26461- scheduler_tick();
26462 if (IS_ENABLED(CONFIG_POSIX_TIMERS))
26463 run_posix_cpu_timers(p);
26464 }
26465@@ -1617,8 +1668,8 @@
26466 while (levels--)
26467 expire_timers(base, heads + levels);
26468 }
26469- base->running_timer = NULL;
26470 raw_spin_unlock_irq(&base->lock);
26471+ wakeup_timer_waiters(base);
26472 }
26473
26474 /*
26475@@ -1628,6 +1679,7 @@
26476 {
26477 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
26478
26479+ irq_work_tick_soft();
26480 /*
26481 * must_forward_clk must be cleared before running timers so that any
26482 * timer functions that call mod_timer will not try to forward the
26483@@ -1864,6 +1916,9 @@
26484 base->cpu = cpu;
26485 raw_spin_lock_init(&base->lock);
26486 base->clk = jiffies;
26487+#ifdef CONFIG_PREEMPT_RT_FULL
26488+ init_swait_queue_head(&base->wait_for_running_timer);
26489+#endif
26490 }
26491 }
26492
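The timer.c changes give every timer base an swait queue so that, on PREEMPT_RT, del_timer_sync() sleeps in wait_for_running_timer() until the running callback has finished instead of spinning on cpu_relax(), and expiry clears base->running_timer before waking the waiters. The core of that pattern, reduced to a standalone sketch (the example_* names stand in for the base fields used by the patch):

#include <linux/swait.h>
#include <linux/timer.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_wait);	/* cf. base->wait_for_running_timer */
static struct timer_list *example_running;	/* cf. base->running_timer */

static void example_wait_for_callback(struct timer_list *timer)
{
	/* sleep instead of busy-waiting; stays preemptible on RT */
	swait_event(example_wait, example_running != timer);
}

static void example_callback_done(void)
{
	example_running = NULL;
	swake_up_all(&example_wait);	/* cf. wakeup_timer_waiters(base) */
}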
26493diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/Kconfig linux-4.14/kernel/trace/Kconfig
26494--- linux-4.14.orig/kernel/trace/Kconfig 2018-09-05 11:03:22.000000000 +0200
26495+++ linux-4.14/kernel/trace/Kconfig 2018-09-05 11:05:07.000000000 +0200
26496@@ -585,7 +585,10 @@
26497 event activity as an initial guide for further investigation
26498 using more advanced tools.
26499
26500- See Documentation/trace/events.txt.
26501+ Inter-event tracing of quantities such as latencies is also
26502+ supported using hist triggers under this option.
26503+
26504+ See Documentation/trace/histogram.txt.
26505 If in doubt, say N.
26506
26507 config MMIOTRACE_TEST
26508diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/ring_buffer.c linux-4.14/kernel/trace/ring_buffer.c
26509--- linux-4.14.orig/kernel/trace/ring_buffer.c 2018-09-05 11:03:22.000000000 +0200
26510+++ linux-4.14/kernel/trace/ring_buffer.c 2018-09-05 11:05:07.000000000 +0200
26511@@ -41,6 +41,8 @@
26512 RINGBUF_TYPE_PADDING);
26513 trace_seq_printf(s, "\ttime_extend : type == %d\n",
26514 RINGBUF_TYPE_TIME_EXTEND);
26515+ trace_seq_printf(s, "\ttime_stamp : type == %d\n",
26516+ RINGBUF_TYPE_TIME_STAMP);
26517 trace_seq_printf(s, "\tdata max type_len == %d\n",
26518 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26519
26520@@ -140,12 +142,15 @@
26521
26522 enum {
26523 RB_LEN_TIME_EXTEND = 8,
26524- RB_LEN_TIME_STAMP = 16,
26525+ RB_LEN_TIME_STAMP = 8,
26526 };
26527
26528 #define skip_time_extend(event) \
26529 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
26530
26531+#define extended_time(event) \
26532+ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
26533+
26534 static inline int rb_null_event(struct ring_buffer_event *event)
26535 {
26536 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
26537@@ -209,7 +214,7 @@
26538 {
26539 unsigned len = 0;
26540
26541- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26542+ if (extended_time(event)) {
26543 /* time extends include the data event after it */
26544 len = RB_LEN_TIME_EXTEND;
26545 event = skip_time_extend(event);
26546@@ -231,7 +236,7 @@
26547 {
26548 unsigned length;
26549
26550- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26551+ if (extended_time(event))
26552 event = skip_time_extend(event);
26553
26554 length = rb_event_length(event);
26555@@ -248,7 +253,7 @@
26556 static __always_inline void *
26557 rb_event_data(struct ring_buffer_event *event)
26558 {
26559- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26560+ if (extended_time(event))
26561 event = skip_time_extend(event);
26562 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26563 /* If length is in len field, then array[0] has the data */
26564@@ -275,6 +280,27 @@
26565 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
26566 #define TS_DELTA_TEST (~TS_MASK)
26567
26568+/**
26569+ * ring_buffer_event_time_stamp - return the event's extended timestamp
26570+ * @event: the event to get the timestamp of
26571+ *
26572+ * Returns the extended timestamp associated with a data event.
26573+ * An extended time_stamp is a 64-bit timestamp represented
26574+ * internally in a special way that makes the best use of space
26575+ * contained within a ring buffer event. This function decodes
26576+ * it and maps it to a straight u64 value.
26577+ */
26578+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
26579+{
26580+ u64 ts;
26581+
26582+ ts = event->array[0];
26583+ ts <<= TS_SHIFT;
26584+ ts += event->time_delta;
26585+
26586+ return ts;
26587+}
26588+
26589 /* Flag when events were overwritten */
26590 #define RB_MISSED_EVENTS (1 << 31)
26591 /* Missed count stored at end */
26592@@ -451,6 +477,7 @@
26593 struct buffer_page *reader_page;
26594 unsigned long lost_events;
26595 unsigned long last_overrun;
26596+ unsigned long nest;
26597 local_t entries_bytes;
26598 local_t entries;
26599 local_t overrun;
26600@@ -488,6 +515,7 @@
26601 u64 (*clock)(void);
26602
26603 struct rb_irq_work irq_work;
26604+ bool time_stamp_abs;
26605 };
26606
26607 struct ring_buffer_iter {
26608@@ -1387,6 +1415,16 @@
26609 buffer->clock = clock;
26610 }
26611
26612+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
26613+{
26614+ buffer->time_stamp_abs = abs;
26615+}
26616+
26617+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
26618+{
26619+ return buffer->time_stamp_abs;
26620+}
26621+
26622 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
26623
26624 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
26625@@ -2217,12 +2255,15 @@
26626
26627 /* Slow path, do not inline */
26628 static noinline struct ring_buffer_event *
26629-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
26630+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
26631 {
26632- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26633+ if (abs)
26634+ event->type_len = RINGBUF_TYPE_TIME_STAMP;
26635+ else
26636+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26637
26638- /* Not the first event on the page? */
26639- if (rb_event_index(event)) {
26640+ /* Not the first event on the page, or not delta? */
26641+ if (abs || rb_event_index(event)) {
26642 event->time_delta = delta & TS_MASK;
26643 event->array[0] = delta >> TS_SHIFT;
26644 } else {
26645@@ -2265,7 +2306,9 @@
26646 * add it to the start of the resevered space.
26647 */
26648 if (unlikely(info->add_timestamp)) {
26649- event = rb_add_time_stamp(event, delta);
26650+ bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
26651+
26652+ event = rb_add_time_stamp(event, info->delta, abs);
26653 length -= RB_LEN_TIME_EXTEND;
26654 delta = 0;
26655 }
26656@@ -2453,7 +2496,7 @@
26657
26658 static inline void rb_event_discard(struct ring_buffer_event *event)
26659 {
26660- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26661+ if (extended_time(event))
26662 event = skip_time_extend(event);
26663
26664 /* array[0] holds the actual length for the discarded event */
26665@@ -2497,10 +2540,11 @@
26666 cpu_buffer->write_stamp =
26667 cpu_buffer->commit_page->page->time_stamp;
26668 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26669- delta = event->array[0];
26670- delta <<= TS_SHIFT;
26671- delta += event->time_delta;
26672+ delta = ring_buffer_event_time_stamp(event);
26673 cpu_buffer->write_stamp += delta;
26674+ } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
26675+ delta = ring_buffer_event_time_stamp(event);
26676+ cpu_buffer->write_stamp = delta;
26677 } else
26678 cpu_buffer->write_stamp += event->time_delta;
26679 }
26680@@ -2583,22 +2627,19 @@
26681 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
26682 {
26683 unsigned int val = cpu_buffer->current_context;
26684+ unsigned long pc = preempt_count();
26685 int bit;
26686
26687- if (in_interrupt()) {
26688- if (in_nmi())
26689- bit = RB_CTX_NMI;
26690- else if (in_irq())
26691- bit = RB_CTX_IRQ;
26692- else
26693- bit = RB_CTX_SOFTIRQ;
26694- } else
26695+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
26696 bit = RB_CTX_NORMAL;
26697+ else
26698+ bit = pc & NMI_MASK ? RB_CTX_NMI :
26699+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
26700
26701- if (unlikely(val & (1 << bit)))
26702+ if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
26703 return 1;
26704
26705- val |= (1 << bit);
26706+ val |= (1 << (bit + cpu_buffer->nest));
26707 cpu_buffer->current_context = val;
26708
26709 return 0;
26710@@ -2607,7 +2648,57 @@
26711 static __always_inline void
26712 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
26713 {
26714- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
26715+ cpu_buffer->current_context &=
26716+ cpu_buffer->current_context - (1 << cpu_buffer->nest);
26717+}
26718+
26719+/* The recursive locking above uses 4 bits */
26720+#define NESTED_BITS 4
26721+
26722+/**
26723+ * ring_buffer_nest_start - Allow to trace while nested
26724+ * @buffer: The ring buffer to modify
26725+ *
26726+ * The ring buffer has a safty mechanism to prevent recursion.
26727+ * But there may be a case where a trace needs to be done while
26728+ * tracing something else. In this case, calling this function
26729+ * will allow this function to nest within a currently active
26730+ * ring_buffer_lock_reserve().
26731+ *
26732+ * Call this function before calling another ring_buffer_lock_reserve() and
26733+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
26734+ */
26735+void ring_buffer_nest_start(struct ring_buffer *buffer)
26736+{
26737+ struct ring_buffer_per_cpu *cpu_buffer;
26738+ int cpu;
26739+
26740+ /* Enabled by ring_buffer_nest_end() */
26741+ preempt_disable_notrace();
26742+ cpu = raw_smp_processor_id();
26743+ cpu_buffer = buffer->buffers[cpu];
26744+ /* This is the shift value for the above recusive locking */
26745+ cpu_buffer->nest += NESTED_BITS;
26746+}
26747+
26748+/**
26749+ * ring_buffer_nest_end - Allow to trace while nested
26750+ * @buffer: The ring buffer to modify
26751+ *
26752+ * Must be called after ring_buffer_nest_start() and after the
26753+ * ring_buffer_unlock_commit().
26754+ */
26755+void ring_buffer_nest_end(struct ring_buffer *buffer)
26756+{
26757+ struct ring_buffer_per_cpu *cpu_buffer;
26758+ int cpu;
26759+
26760+ /* disabled by ring_buffer_nest_start() */
26761+ cpu = raw_smp_processor_id();
26762+ cpu_buffer = buffer->buffers[cpu];
26763+ /* This is the shift value for the above recusive locking */
26764+ cpu_buffer->nest -= NESTED_BITS;
26765+ preempt_enable_notrace();
26766 }
26767
26768 /**
26769@@ -2683,7 +2774,7 @@
26770 * If this is the first commit on the page, then it has the same
26771 * timestamp as the page itself.
26772 */
26773- if (!tail)
26774+ if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
26775 info->delta = 0;
26776
26777 /* See if we shot pass the end of this buffer page */
26778@@ -2760,8 +2851,11 @@
26779 /* make sure this diff is calculated here */
26780 barrier();
26781
26782- /* Did the write stamp get updated already? */
26783- if (likely(info.ts >= cpu_buffer->write_stamp)) {
26784+ if (ring_buffer_time_stamp_abs(buffer)) {
26785+ info.delta = info.ts;
26786+ rb_handle_timestamp(cpu_buffer, &info);
26787+ } else /* Did the write stamp get updated already? */
26788+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
26789 info.delta = diff;
26790 if (unlikely(test_time_stamp(info.delta)))
26791 rb_handle_timestamp(cpu_buffer, &info);
26792@@ -3459,14 +3553,13 @@
26793 return;
26794
26795 case RINGBUF_TYPE_TIME_EXTEND:
26796- delta = event->array[0];
26797- delta <<= TS_SHIFT;
26798- delta += event->time_delta;
26799+ delta = ring_buffer_event_time_stamp(event);
26800 cpu_buffer->read_stamp += delta;
26801 return;
26802
26803 case RINGBUF_TYPE_TIME_STAMP:
26804- /* FIXME: not implemented */
26805+ delta = ring_buffer_event_time_stamp(event);
26806+ cpu_buffer->read_stamp = delta;
26807 return;
26808
26809 case RINGBUF_TYPE_DATA:
26810@@ -3490,14 +3583,13 @@
26811 return;
26812
26813 case RINGBUF_TYPE_TIME_EXTEND:
26814- delta = event->array[0];
26815- delta <<= TS_SHIFT;
26816- delta += event->time_delta;
26817+ delta = ring_buffer_event_time_stamp(event);
26818 iter->read_stamp += delta;
26819 return;
26820
26821 case RINGBUF_TYPE_TIME_STAMP:
26822- /* FIXME: not implemented */
26823+ delta = ring_buffer_event_time_stamp(event);
26824+ iter->read_stamp = delta;
26825 return;
26826
26827 case RINGBUF_TYPE_DATA:
26828@@ -3721,6 +3813,8 @@
26829 struct buffer_page *reader;
26830 int nr_loops = 0;
26831
26832+ if (ts)
26833+ *ts = 0;
26834 again:
26835 /*
26836 * We repeat when a time extend is encountered.
26837@@ -3757,12 +3851,17 @@
26838 goto again;
26839
26840 case RINGBUF_TYPE_TIME_STAMP:
26841- /* FIXME: not implemented */
26842+ if (ts) {
26843+ *ts = ring_buffer_event_time_stamp(event);
26844+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26845+ cpu_buffer->cpu, ts);
26846+ }
26847+ /* Internal data, OK to advance */
26848 rb_advance_reader(cpu_buffer);
26849 goto again;
26850
26851 case RINGBUF_TYPE_DATA:
26852- if (ts) {
26853+ if (ts && !(*ts)) {
26854 *ts = cpu_buffer->read_stamp + event->time_delta;
26855 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26856 cpu_buffer->cpu, ts);
26857@@ -3787,6 +3886,9 @@
26858 struct ring_buffer_event *event;
26859 int nr_loops = 0;
26860
26861+ if (ts)
26862+ *ts = 0;
26863+
26864 cpu_buffer = iter->cpu_buffer;
26865 buffer = cpu_buffer->buffer;
26866
26867@@ -3839,12 +3941,17 @@
26868 goto again;
26869
26870 case RINGBUF_TYPE_TIME_STAMP:
26871- /* FIXME: not implemented */
26872+ if (ts) {
26873+ *ts = ring_buffer_event_time_stamp(event);
26874+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26875+ cpu_buffer->cpu, ts);
26876+ }
26877+ /* Internal data, OK to advance */
26878 rb_advance_iter(iter);
26879 goto again;
26880
26881 case RINGBUF_TYPE_DATA:
26882- if (ts) {
26883+ if (ts && !(*ts)) {
26884 *ts = iter->read_stamp + event->time_delta;
26885 ring_buffer_normalize_time_stamp(buffer,
26886 cpu_buffer->cpu, ts);
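The ring-buffer hunks add absolute RINGBUF_TYPE_TIME_STAMP events (decoded by ring_buffer_event_time_stamp()) and a nesting interface so one event can be written from within another event's commit without triggering the recursion protection. Usage as described in the kerneldoc above, wrapped in an illustrative function with a made-up payload:

#include <linux/ring_buffer.h>

static void example_nested_write(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;

	ring_buffer_nest_start(buffer);			/* shift the recursion bits */
	event = ring_buffer_lock_reserve(buffer, sizeof(u64));
	if (event) {
		*(u64 *)ring_buffer_event_data(event) = 42;
		ring_buffer_unlock_commit(buffer, event);
	}
	ring_buffer_nest_end(buffer);			/* restore and re-enable preemption */
}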
26887diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.c linux-4.14/kernel/trace/trace.c
26888--- linux-4.14.orig/kernel/trace/trace.c 2018-09-05 11:03:22.000000000 +0200
26889+++ linux-4.14/kernel/trace/trace.c 2018-09-05 11:05:07.000000000 +0200
26890@@ -1170,6 +1170,14 @@
26891 ARCH_TRACE_CLOCKS
26892 };
26893
26894+bool trace_clock_in_ns(struct trace_array *tr)
26895+{
26896+ if (trace_clocks[tr->clock_id].in_ns)
26897+ return true;
26898+
26899+ return false;
26900+}
26901+
26902 /*
26903 * trace_parser_get_init - gets the buffer for trace parser
26904 */
26905@@ -2127,6 +2135,7 @@
26906 struct task_struct *tsk = current;
26907
26908 entry->preempt_count = pc & 0xff;
26909+ entry->preempt_lazy_count = preempt_lazy_count();
26910 entry->pid = (tsk) ? tsk->pid : 0;
26911 entry->flags =
26912 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
26913@@ -2137,8 +2146,11 @@
26914 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
26915 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
26916 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
26917- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
26918+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
26919+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
26920 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
26921+
26922+ entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
26923 }
26924 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
26925
26926@@ -2275,7 +2287,7 @@
26927
26928 *current_rb = trace_file->tr->trace_buffer.buffer;
26929
26930- if ((trace_file->flags &
26931+ if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
26932 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
26933 (entry = this_cpu_read(trace_buffered_event))) {
26934 /* Try to use the per cpu buffer first */
26935@@ -3342,14 +3354,17 @@
26936
26937 static void print_lat_help_header(struct seq_file *m)
26938 {
26939- seq_puts(m, "# _------=> CPU# \n"
26940- "# / _-----=> irqs-off \n"
26941- "# | / _----=> need-resched \n"
26942- "# || / _---=> hardirq/softirq \n"
26943- "# ||| / _--=> preempt-depth \n"
26944- "# |||| / delay \n"
26945- "# cmd pid ||||| time | caller \n"
26946- "# \\ / ||||| \\ | / \n");
26947+ seq_puts(m, "# _--------=> CPU# \n"
26948+ "# / _-------=> irqs-off \n"
26949+ "# | / _------=> need-resched \n"
26950+ "# || / _-----=> need-resched_lazy \n"
26951+ "# ||| / _----=> hardirq/softirq \n"
26952+ "# |||| / _---=> preempt-depth \n"
26953+ "# ||||| / _--=> preempt-lazy-depth\n"
26954+ "# |||||| / _-=> migrate-disable \n"
26955+ "# ||||||| / delay \n"
26956+ "# cmd pid |||||||| time | caller \n"
26957+ "# \\ / |||||||| \\ | / \n");
26958 }
26959
26960 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
26961@@ -3385,15 +3400,17 @@
26962 tgid ? tgid_space : space);
26963 seq_printf(m, "# %s / _----=> need-resched\n",
26964 tgid ? tgid_space : space);
26965- seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
26966+ seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
26967 tgid ? tgid_space : space);
26968- seq_printf(m, "# %s|| / _--=> preempt-depth\n",
26969+ seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
26970 tgid ? tgid_space : space);
26971- seq_printf(m, "# %s||| / delay\n",
26972+ seq_printf(m, "# %s||| / _--=> preempt-depth\n",
26973 tgid ? tgid_space : space);
26974- seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
26975+ seq_printf(m, "# %s|||| / delay\n",
26976+ tgid ? tgid_space : space);
26977+ seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
26978 tgid ? " TGID " : space);
26979- seq_printf(m, "# | | %s | |||| | |\n",
26980+ seq_printf(m, "# | | %s | ||||| | |\n",
26981 tgid ? " | " : space);
26982 }
26983
26984@@ -4531,6 +4548,9 @@
26985 #ifdef CONFIG_X86_64
26986 " x86-tsc: TSC cycle counter\n"
26987 #endif
26988+ "\n timestamp_mode\t-view the mode used to timestamp events\n"
26989+ " delta: Delta difference against a buffer-wide timestamp\n"
26990+ " absolute: Absolute (standalone) timestamp\n"
26991 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
26992 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
26993 " tracing_cpumask\t- Limit which CPUs to trace\n"
26994@@ -4707,8 +4727,9 @@
26995 "\t .sym display an address as a symbol\n"
26996 "\t .sym-offset display an address as a symbol and offset\n"
26997 "\t .execname display a common_pid as a program name\n"
26998- "\t .syscall display a syscall id as a syscall name\n\n"
26999- "\t .log2 display log2 value rather than raw number\n\n"
27000+ "\t .syscall display a syscall id as a syscall name\n"
27001+ "\t .log2 display log2 value rather than raw number\n"
27002+ "\t .usecs display a common_timestamp in microseconds\n\n"
27003 "\t The 'pause' parameter can be used to pause an existing hist\n"
27004 "\t trigger or to start a hist trigger but not log any events\n"
27005 "\t until told to do so. 'continue' can be used to start or\n"
27006@@ -6218,7 +6239,7 @@
27007 return 0;
27008 }
27009
27010-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27011+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27012 {
27013 int i;
27014
27015@@ -6298,6 +6319,71 @@
27016 return ret;
27017 }
27018
27019+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27020+{
27021+ struct trace_array *tr = m->private;
27022+
27023+ mutex_lock(&trace_types_lock);
27024+
27025+ if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27026+ seq_puts(m, "delta [absolute]\n");
27027+ else
27028+ seq_puts(m, "[delta] absolute\n");
27029+
27030+ mutex_unlock(&trace_types_lock);
27031+
27032+ return 0;
27033+}
27034+
27035+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27036+{
27037+ struct trace_array *tr = inode->i_private;
27038+ int ret;
27039+
27040+ if (tracing_disabled)
27041+ return -ENODEV;
27042+
27043+ if (trace_array_get(tr))
27044+ return -ENODEV;
27045+
27046+ ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27047+ if (ret < 0)
27048+ trace_array_put(tr);
27049+
27050+ return ret;
27051+}
27052+
27053+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27054+{
27055+ int ret = 0;
27056+
27057+ mutex_lock(&trace_types_lock);
27058+
27059+ if (abs && tr->time_stamp_abs_ref++)
27060+ goto out;
27061+
27062+ if (!abs) {
27063+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27064+ ret = -EINVAL;
27065+ goto out;
27066+ }
27067+
27068+ if (--tr->time_stamp_abs_ref)
27069+ goto out;
27070+ }
27071+
27072+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27073+
27074+#ifdef CONFIG_TRACER_MAX_TRACE
27075+ if (tr->max_buffer.buffer)
27076+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
27077+#endif
27078+ out:
27079+ mutex_unlock(&trace_types_lock);
27080+
27081+ return ret;
27082+}
27083+
27084 struct ftrace_buffer_info {
27085 struct trace_iterator iter;
27086 void *spare;
27087@@ -6545,6 +6631,13 @@
27088 .write = tracing_clock_write,
27089 };
27090
27091+static const struct file_operations trace_time_stamp_mode_fops = {
27092+ .open = tracing_time_stamp_mode_open,
27093+ .read = seq_read,
27094+ .llseek = seq_lseek,
27095+ .release = tracing_single_release_tr,
27096+};
27097+
27098 #ifdef CONFIG_TRACER_SNAPSHOT
27099 static const struct file_operations snapshot_fops = {
27100 .open = tracing_snapshot_open,
27101@@ -7682,6 +7775,7 @@
27102 struct trace_array *tr;
27103 int ret;
27104
27105+ mutex_lock(&event_mutex);
27106 mutex_lock(&trace_types_lock);
27107
27108 ret = -EEXIST;
27109@@ -7714,6 +7808,7 @@
27110
27111 INIT_LIST_HEAD(&tr->systems);
27112 INIT_LIST_HEAD(&tr->events);
27113+ INIT_LIST_HEAD(&tr->hist_vars);
27114
27115 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27116 goto out_free_tr;
27117@@ -7737,6 +7832,7 @@
27118 list_add(&tr->list, &ftrace_trace_arrays);
27119
27120 mutex_unlock(&trace_types_lock);
27121+ mutex_unlock(&event_mutex);
27122
27123 return 0;
27124
27125@@ -7748,6 +7844,7 @@
27126
27127 out_unlock:
27128 mutex_unlock(&trace_types_lock);
27129+ mutex_unlock(&event_mutex);
27130
27131 return ret;
27132
27133@@ -7760,6 +7857,7 @@
27134 int ret;
27135 int i;
27136
27137+ mutex_lock(&event_mutex);
27138 mutex_lock(&trace_types_lock);
27139
27140 ret = -ENODEV;
27141@@ -7805,6 +7903,7 @@
27142
27143 out_unlock:
27144 mutex_unlock(&trace_types_lock);
27145+ mutex_unlock(&event_mutex);
27146
27147 return ret;
27148 }
27149@@ -7862,6 +7961,9 @@
27150 trace_create_file("tracing_on", 0644, d_tracer,
27151 tr, &rb_simple_fops);
27152
27153+ trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27154+ &trace_time_stamp_mode_fops);
27155+
27156 create_trace_options_dir(tr);
27157
27158 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
27159@@ -8271,6 +8373,92 @@
27160 }
27161 EXPORT_SYMBOL_GPL(ftrace_dump);
27162
27163+int trace_run_command(const char *buf, int (*createfn)(int, char **))
27164+{
27165+ char **argv;
27166+ int argc, ret;
27167+
27168+ argc = 0;
27169+ ret = 0;
27170+ argv = argv_split(GFP_KERNEL, buf, &argc);
27171+ if (!argv)
27172+ return -ENOMEM;
27173+
27174+ if (argc)
27175+ ret = createfn(argc, argv);
27176+
27177+ argv_free(argv);
27178+
27179+ return ret;
27180+}
27181+
27182+#define WRITE_BUFSIZE 4096
27183+
27184+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27185+ size_t count, loff_t *ppos,
27186+ int (*createfn)(int, char **))
27187+{
27188+ char *kbuf, *buf, *tmp;
27189+ int ret = 0;
27190+ size_t done = 0;
27191+ size_t size;
27192+
27193+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27194+ if (!kbuf)
27195+ return -ENOMEM;
27196+
27197+ while (done < count) {
27198+ size = count - done;
27199+
27200+ if (size >= WRITE_BUFSIZE)
27201+ size = WRITE_BUFSIZE - 1;
27202+
27203+ if (copy_from_user(kbuf, buffer + done, size)) {
27204+ ret = -EFAULT;
27205+ goto out;
27206+ }
27207+ kbuf[size] = '\0';
27208+ buf = kbuf;
27209+ do {
27210+ tmp = strchr(buf, '\n');
27211+ if (tmp) {
27212+ *tmp = '\0';
27213+ size = tmp - buf + 1;
27214+ } else {
27215+ size = strlen(buf);
27216+ if (done + size < count) {
27217+ if (buf != kbuf)
27218+ break;
27219+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27220+ pr_warn("Line length is too long: Should be less than %d\n",
27221+ WRITE_BUFSIZE - 2);
27222+ ret = -EINVAL;
27223+ goto out;
27224+ }
27225+ }
27226+ done += size;
27227+
27228+ /* Remove comments */
27229+ tmp = strchr(buf, '#');
27230+
27231+ if (tmp)
27232+ *tmp = '\0';
27233+
27234+ ret = trace_run_command(buf, createfn);
27235+ if (ret)
27236+ goto out;
27237+ buf += size;
27238+
27239+ } while (done < count);
27240+ }
27241+ ret = done;
27242+
27243+out:
27244+ kfree(kbuf);
27245+
27246+ return ret;
27247+}
27248+
27249 __init static int tracer_alloc_buffers(void)
27250 {
27251 int ring_buf_size;
27252@@ -8371,6 +8559,7 @@
27253
27254 INIT_LIST_HEAD(&global_trace.systems);
27255 INIT_LIST_HEAD(&global_trace.events);
27256+ INIT_LIST_HEAD(&global_trace.hist_vars);
27257 list_add(&global_trace.list, &ftrace_trace_arrays);
27258
27259 apply_trace_boot_options();
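trace.c gains a per-instance timestamp_mode file plus a refcounted tracing_set_time_stamp_abs() helper that switches the trace buffers between delta and absolute timestamps (needed by the inter-event hist triggers added later in this series). A sketch of the enable/disable pairing a consumer would use (the example_* wrappers are invented; struct trace_array is the in-kernel tracing instance):

/* enable absolute timestamps while a trigger that needs them exists */
static int example_enable_abs(struct trace_array *tr)
{
	return tracing_set_time_stamp_abs(tr, true);
}

/* drop the reference again when the trigger is removed */
static void example_disable_abs(struct trace_array *tr)
{
	tracing_set_time_stamp_abs(tr, false);
}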
27260diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events.c linux-4.14/kernel/trace/trace_events.c
27261--- linux-4.14.orig/kernel/trace/trace_events.c 2018-09-05 11:03:22.000000000 +0200
27262+++ linux-4.14/kernel/trace/trace_events.c 2018-09-05 11:05:07.000000000 +0200
27263@@ -187,6 +187,8 @@
27264 __common_field(unsigned char, flags);
27265 __common_field(unsigned char, preempt_count);
27266 __common_field(int, pid);
27267+ __common_field(unsigned short, migrate_disable);
27268+ __common_field(unsigned short, padding);
27269
27270 return ret;
27271 }
27272@@ -1406,8 +1408,8 @@
27273 return -ENODEV;
27274
27275 /* Make sure the system still exists */
27276- mutex_lock(&trace_types_lock);
27277 mutex_lock(&event_mutex);
27278+ mutex_lock(&trace_types_lock);
27279 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
27280 list_for_each_entry(dir, &tr->systems, list) {
27281 if (dir == inode->i_private) {
27282@@ -1421,8 +1423,8 @@
27283 }
27284 }
27285 exit_loop:
27286- mutex_unlock(&event_mutex);
27287 mutex_unlock(&trace_types_lock);
27288+ mutex_unlock(&event_mutex);
27289
27290 if (!system)
27291 return -ENODEV;
27292@@ -2308,15 +2310,15 @@
27293 int trace_add_event_call(struct trace_event_call *call)
27294 {
27295 int ret;
27296- mutex_lock(&trace_types_lock);
27297 mutex_lock(&event_mutex);
27298+ mutex_lock(&trace_types_lock);
27299
27300 ret = __register_event(call, NULL);
27301 if (ret >= 0)
27302 __add_event_to_tracers(call);
27303
27304- mutex_unlock(&event_mutex);
27305 mutex_unlock(&trace_types_lock);
27306+ mutex_unlock(&event_mutex);
27307 return ret;
27308 }
27309
27310@@ -2370,13 +2372,13 @@
27311 {
27312 int ret;
27313
27314- mutex_lock(&trace_types_lock);
27315 mutex_lock(&event_mutex);
27316+ mutex_lock(&trace_types_lock);
27317 down_write(&trace_event_sem);
27318 ret = probe_remove_event_call(call);
27319 up_write(&trace_event_sem);
27320- mutex_unlock(&event_mutex);
27321 mutex_unlock(&trace_types_lock);
27322+ mutex_unlock(&event_mutex);
27323
27324 return ret;
27325 }
27326@@ -2438,8 +2440,8 @@
27327 {
27328 struct module *mod = data;
27329
27330- mutex_lock(&trace_types_lock);
27331 mutex_lock(&event_mutex);
27332+ mutex_lock(&trace_types_lock);
27333 switch (val) {
27334 case MODULE_STATE_COMING:
27335 trace_module_add_events(mod);
27336@@ -2448,8 +2450,8 @@
27337 trace_module_remove_events(mod);
27338 break;
27339 }
27340- mutex_unlock(&event_mutex);
27341 mutex_unlock(&trace_types_lock);
27342+ mutex_unlock(&event_mutex);
27343
27344 return 0;
27345 }
27346@@ -2964,24 +2966,24 @@
27347 * creates the event hierachry in the @parent/events directory.
27348 *
27349 * Returns 0 on success.
27350+ *
27351+ * Must be called with event_mutex held.
27352 */
27353 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
27354 {
27355 int ret;
27356
27357- mutex_lock(&event_mutex);
27358+ lockdep_assert_held(&event_mutex);
27359
27360 ret = create_event_toplevel_files(parent, tr);
27361 if (ret)
27362- goto out_unlock;
27363+ goto out;
27364
27365 down_write(&trace_event_sem);
27366 __trace_add_event_dirs(tr);
27367 up_write(&trace_event_sem);
27368
27369- out_unlock:
27370- mutex_unlock(&event_mutex);
27371-
27372+ out:
27373 return ret;
27374 }
27375
27376@@ -3010,9 +3012,10 @@
27377 return ret;
27378 }
27379
27380+/* Must be called with event_mutex held */
27381 int event_trace_del_tracer(struct trace_array *tr)
27382 {
27383- mutex_lock(&event_mutex);
27384+ lockdep_assert_held(&event_mutex);
1a6e0f06 27385
e4b2b4a8
JK
27386 /* Disable any event triggers and associated soft-disabled events */
27387 clear_event_triggers(tr);
27388@@ -3033,8 +3036,6 @@
27389
27390 tr->event_dir = NULL;
27391
27392- mutex_unlock(&event_mutex);
27393-
27394 return 0;
27395 }
27396
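The trace_events.c hunks invert the locking so that event_mutex is always taken before trace_types_lock, and event_trace_add_tracer()/event_trace_del_tracer() merely assert that the caller already holds event_mutex. The resulting ordering rule as a sketch (both mutexes are the internal ones declared in kernel/trace/trace.h):

#include <linux/mutex.h>

extern struct mutex event_mutex;
extern struct mutex trace_types_lock;

static void example_update_under_locks(void)
{
	/* ordering after this patch: event_mutex, then trace_types_lock */
	mutex_lock(&event_mutex);
	mutex_lock(&trace_types_lock);
	/* ... add or remove event files / trace arrays ... */
	mutex_unlock(&trace_types_lock);
	mutex_unlock(&event_mutex);
}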
27397diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_hist.c linux-4.14/kernel/trace/trace_events_hist.c
27398--- linux-4.14.orig/kernel/trace/trace_events_hist.c 2018-09-05 11:03:22.000000000 +0200
27399+++ linux-4.14/kernel/trace/trace_events_hist.c 2018-09-05 11:05:07.000000000 +0200
27400@@ -20,13 +20,39 @@
27401 #include <linux/slab.h>
27402 #include <linux/stacktrace.h>
27403 #include <linux/rculist.h>
27404+#include <linux/tracefs.h>
27405
27406 #include "tracing_map.h"
27407 #include "trace.h"
27408
27409+#define SYNTH_SYSTEM "synthetic"
27410+#define SYNTH_FIELDS_MAX 16
27411+
27412+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
27413+
27414 struct hist_field;
1a6e0f06 27415
e4b2b4a8
JK
27416-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
27417+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
27418+ struct tracing_map_elt *elt,
27419+ struct ring_buffer_event *rbe,
27420+ void *event);
27421+
27422+#define HIST_FIELD_OPERANDS_MAX 2
27423+#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
27424+#define HIST_ACTIONS_MAX 8
27425+
27426+enum field_op_id {
27427+ FIELD_OP_NONE,
27428+ FIELD_OP_PLUS,
27429+ FIELD_OP_MINUS,
27430+ FIELD_OP_UNARY_MINUS,
27431+};
27432+
27433+struct hist_var {
27434+ char *name;
27435+ struct hist_trigger_data *hist_data;
27436+ unsigned int idx;
27437+};
27438
27439 struct hist_field {
27440 struct ftrace_event_field *field;
27441@@ -34,26 +60,50 @@
27442 hist_field_fn_t fn;
27443 unsigned int size;
27444 unsigned int offset;
27445+ unsigned int is_signed;
27446+ const char *type;
27447+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
27448+ struct hist_trigger_data *hist_data;
27449+ struct hist_var var;
27450+ enum field_op_id operator;
27451+ char *system;
27452+ char *event_name;
27453+ char *name;
27454+ unsigned int var_idx;
27455+ unsigned int var_ref_idx;
27456+ bool read_once;
27457 };
27458
27459-static u64 hist_field_none(struct hist_field *field, void *event)
27460+static u64 hist_field_none(struct hist_field *field,
27461+ struct tracing_map_elt *elt,
27462+ struct ring_buffer_event *rbe,
27463+ void *event)
27464 {
27465 return 0;
27466 }
27467
27468-static u64 hist_field_counter(struct hist_field *field, void *event)
27469+static u64 hist_field_counter(struct hist_field *field,
27470+ struct tracing_map_elt *elt,
27471+ struct ring_buffer_event *rbe,
27472+ void *event)
27473 {
27474 return 1;
27475 }
27476
27477-static u64 hist_field_string(struct hist_field *hist_field, void *event)
27478+static u64 hist_field_string(struct hist_field *hist_field,
27479+ struct tracing_map_elt *elt,
27480+ struct ring_buffer_event *rbe,
27481+ void *event)
27482 {
27483 char *addr = (char *)(event + hist_field->field->offset);
27484
27485 return (u64)(unsigned long)addr;
27486 }
27487
27488-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
27489+static u64 hist_field_dynstring(struct hist_field *hist_field,
27490+ struct tracing_map_elt *elt,
27491+ struct ring_buffer_event *rbe,
27492+ void *event)
27493 {
27494 u32 str_item = *(u32 *)(event + hist_field->field->offset);
27495 int str_loc = str_item & 0xffff;
27496@@ -62,22 +112,74 @@
27497 return (u64)(unsigned long)addr;
1a6e0f06 27498 }
1a6e0f06 27499
e4b2b4a8
JK
27500-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
27501+static u64 hist_field_pstring(struct hist_field *hist_field,
27502+ struct tracing_map_elt *elt,
27503+ struct ring_buffer_event *rbe,
27504+ void *event)
27505 {
27506 char **addr = (char **)(event + hist_field->field->offset);
1a6e0f06 27507
e4b2b4a8 27508 return (u64)(unsigned long)*addr;
1a6e0f06
JK
27509 }
27510
e4b2b4a8
JK
27511-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
27512+static u64 hist_field_log2(struct hist_field *hist_field,
27513+ struct tracing_map_elt *elt,
27514+ struct ring_buffer_event *rbe,
27515+ void *event)
1a6e0f06 27516 {
e4b2b4a8
JK
27517- u64 val = *(u64 *)(event + hist_field->field->offset);
27518+ struct hist_field *operand = hist_field->operands[0];
27519+
27520+ u64 val = operand->fn(operand, elt, rbe, event);
1a6e0f06 27521
e4b2b4a8 27522 return (u64) ilog2(roundup_pow_of_two(val));
1a6e0f06
JK
27523 }
27524
e4b2b4a8
JK
27525+static u64 hist_field_plus(struct hist_field *hist_field,
27526+ struct tracing_map_elt *elt,
27527+ struct ring_buffer_event *rbe,
27528+ void *event)
1a6e0f06 27529+{
e4b2b4a8
JK
27530+ struct hist_field *operand1 = hist_field->operands[0];
27531+ struct hist_field *operand2 = hist_field->operands[1];
27532+
27533+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
27534+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
27535+
27536+ return val1 + val2;
27537+}
27538+
27539+static u64 hist_field_minus(struct hist_field *hist_field,
27540+ struct tracing_map_elt *elt,
27541+ struct ring_buffer_event *rbe,
27542+ void *event)
27543+{
27544+ struct hist_field *operand1 = hist_field->operands[0];
27545+ struct hist_field *operand2 = hist_field->operands[1];
27546+
27547+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
27548+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
27549+
27550+ return val1 - val2;
27551+}
27552+
27553+static u64 hist_field_unary_minus(struct hist_field *hist_field,
27554+ struct tracing_map_elt *elt,
27555+ struct ring_buffer_event *rbe,
27556+ void *event)
27557+{
27558+ struct hist_field *operand = hist_field->operands[0];
27559+
27560+ s64 sval = (s64)operand->fn(operand, elt, rbe, event);
27561+ u64 val = (u64)-sval;
27562+
27563+ return val;
27564+}
27565+
27566 #define DEFINE_HIST_FIELD_FN(type) \
27567-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
27568+ static u64 hist_field_##type(struct hist_field *hist_field, \
27569+ struct tracing_map_elt *elt, \
27570+ struct ring_buffer_event *rbe, \
27571+ void *event) \
27572 { \
27573 type *addr = (type *)(event + hist_field->field->offset); \
27574 \
27575@@ -110,16 +212,29 @@
27576 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
27577
27578 enum hist_field_flags {
27579- HIST_FIELD_FL_HITCOUNT = 1,
27580- HIST_FIELD_FL_KEY = 2,
27581- HIST_FIELD_FL_STRING = 4,
27582- HIST_FIELD_FL_HEX = 8,
27583- HIST_FIELD_FL_SYM = 16,
27584- HIST_FIELD_FL_SYM_OFFSET = 32,
27585- HIST_FIELD_FL_EXECNAME = 64,
27586- HIST_FIELD_FL_SYSCALL = 128,
27587- HIST_FIELD_FL_STACKTRACE = 256,
27588- HIST_FIELD_FL_LOG2 = 512,
27589+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
27590+ HIST_FIELD_FL_KEY = 1 << 1,
27591+ HIST_FIELD_FL_STRING = 1 << 2,
27592+ HIST_FIELD_FL_HEX = 1 << 3,
27593+ HIST_FIELD_FL_SYM = 1 << 4,
27594+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
27595+ HIST_FIELD_FL_EXECNAME = 1 << 6,
27596+ HIST_FIELD_FL_SYSCALL = 1 << 7,
27597+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
27598+ HIST_FIELD_FL_LOG2 = 1 << 9,
27599+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
27600+ HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
27601+ HIST_FIELD_FL_VAR = 1 << 12,
27602+ HIST_FIELD_FL_EXPR = 1 << 13,
27603+ HIST_FIELD_FL_VAR_REF = 1 << 14,
27604+ HIST_FIELD_FL_CPU = 1 << 15,
27605+ HIST_FIELD_FL_ALIAS = 1 << 16,
27606+};
27607+
27608+struct var_defs {
27609+ unsigned int n_vars;
27610+ char *name[TRACING_MAP_VARS_MAX];
27611+ char *expr[TRACING_MAP_VARS_MAX];
27612 };
27613
27614 struct hist_trigger_attrs {
27615@@ -127,25 +242,1474 @@
27616 char *vals_str;
27617 char *sort_key_str;
27618 char *name;
27619+ char *clock;
27620 bool pause;
27621 bool cont;
27622 bool clear;
27623+ bool ts_in_usecs;
27624 unsigned int map_bits;
27625+
27626+ char *assignment_str[TRACING_MAP_VARS_MAX];
27627+ unsigned int n_assignments;
27628+
27629+ char *action_str[HIST_ACTIONS_MAX];
27630+ unsigned int n_actions;
27631+
27632+ struct var_defs var_defs;
27633+};
27634+
27635+struct field_var {
27636+ struct hist_field *var;
27637+ struct hist_field *val;
27638+};
1a6e0f06 27639+
e4b2b4a8
JK
27640+struct field_var_hist {
27641+ struct hist_trigger_data *hist_data;
27642+ char *cmd;
27643 };
27644
27645 struct hist_trigger_data {
27646- struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
27647+ struct hist_field *fields[HIST_FIELDS_MAX];
27648 unsigned int n_vals;
27649 unsigned int n_keys;
27650 unsigned int n_fields;
27651+ unsigned int n_vars;
27652 unsigned int key_size;
27653 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
27654 unsigned int n_sort_keys;
27655 struct trace_event_file *event_file;
27656 struct hist_trigger_attrs *attrs;
27657 struct tracing_map *map;
27658+ bool enable_timestamps;
27659+ bool remove;
27660+ struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
27661+ unsigned int n_var_refs;
27662+
27663+ struct action_data *actions[HIST_ACTIONS_MAX];
27664+ unsigned int n_actions;
27665+
27666+ struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
27667+ unsigned int n_synth_var_refs;
27668+ struct field_var *field_vars[SYNTH_FIELDS_MAX];
27669+ unsigned int n_field_vars;
27670+ unsigned int n_field_var_str;
27671+ struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
27672+ unsigned int n_field_var_hists;
27673+
27674+ struct field_var *max_vars[SYNTH_FIELDS_MAX];
27675+ unsigned int n_max_vars;
27676+ unsigned int n_max_var_str;
27677+};
1a6e0f06 27678+
e4b2b4a8
JK
27679+struct synth_field {
27680+ char *type;
27681+ char *name;
27682+ size_t size;
27683+ bool is_signed;
27684+ bool is_string;
27685+};
1a6e0f06 27686+
e4b2b4a8
JK
27687+struct synth_event {
27688+ struct list_head list;
27689+ int ref;
27690+ char *name;
27691+ struct synth_field **fields;
27692+ unsigned int n_fields;
27693+ unsigned int n_u64;
27694+ struct trace_event_class class;
27695+ struct trace_event_call call;
27696+ struct tracepoint *tp;
27697+};
1a6e0f06 27698+
e4b2b4a8 27699+struct action_data;
1a6e0f06 27700+
e4b2b4a8
JK
27701+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
27702+ struct tracing_map_elt *elt, void *rec,
27703+ struct ring_buffer_event *rbe,
27704+ struct action_data *data, u64 *var_ref_vals);
1a6e0f06 27705+
e4b2b4a8
JK
27706+struct action_data {
27707+ action_fn_t fn;
27708+ unsigned int n_params;
27709+ char *params[SYNTH_FIELDS_MAX];
27710+
27711+ union {
27712+ struct {
27713+ unsigned int var_ref_idx;
27714+ char *match_event;
27715+ char *match_event_system;
27716+ char *synth_event_name;
27717+ struct synth_event *synth_event;
27718+ } onmatch;
27719+
27720+ struct {
27721+ char *var_str;
27722+ char *fn_name;
27723+ unsigned int max_var_ref_idx;
27724+ struct hist_field *max_var;
27725+ struct hist_field *var;
27726+ } onmax;
27727+ };
27728+};
27729+
27730+
27731+static char last_hist_cmd[MAX_FILTER_STR_VAL];
27732+static char hist_err_str[MAX_FILTER_STR_VAL];
27733+
27734+static void last_cmd_set(char *str)
27735+{
27736+ if (!str)
1a6e0f06
JK
27737+ return;
27738+
e4b2b4a8 27739+ strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
1a6e0f06 27740+}
1a6e0f06 27741+
e4b2b4a8
JK
27742+static void hist_err(char *str, char *var)
27743+{
27744+ int maxlen = MAX_FILTER_STR_VAL - 1;
1a6e0f06 27745+
e4b2b4a8 27746+ if (!str)
1a6e0f06 27747+ return;
1a6e0f06 27748+
e4b2b4a8
JK
27749+ if (strlen(hist_err_str))
27750+ return;
27751+
27752+ if (!var)
27753+ var = "";
27754+
27755+ if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
27756+ return;
1a6e0f06 27757+
e4b2b4a8
JK
27758+ strcat(hist_err_str, str);
27759+ strcat(hist_err_str, var);
27760+}
27761+
27762+static void hist_err_event(char *str, char *system, char *event, char *var)
1a6e0f06 27763+{
e4b2b4a8
JK
27764+ char err[MAX_FILTER_STR_VAL];
27765+
27766+ if (system && var)
27767+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
27768+ else if (system)
27769+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
27770+ else
27771+ strncpy(err, var, MAX_FILTER_STR_VAL);
27772+
27773+ hist_err(str, err);
1a6e0f06
JK
27774+}
27775+
e4b2b4a8 27776+static void hist_err_clear(void)
1a6e0f06 27777+{
e4b2b4a8 27778+ hist_err_str[0] = '\0';
1a6e0f06
JK
27779+}
27780+
e4b2b4a8 27781+static bool have_hist_err(void)
1a6e0f06 27782+{
e4b2b4a8
JK
27783+ if (strlen(hist_err_str))
27784+ return true;
1a6e0f06 27785+
e4b2b4a8
JK
27786+ return false;
27787+}
1a6e0f06 27788+
e4b2b4a8
JK
27789+static LIST_HEAD(synth_event_list);
27790+static DEFINE_MUTEX(synth_event_mutex);
1a6e0f06 27791+
e4b2b4a8
JK
27792+struct synth_trace_event {
27793+ struct trace_entry ent;
27794+ u64 fields[];
27795+};
1a6e0f06 27796+
e4b2b4a8
JK
27797+static int synth_event_define_fields(struct trace_event_call *call)
27798+{
27799+ struct synth_trace_event trace;
27800+ int offset = offsetof(typeof(trace), fields);
27801+ struct synth_event *event = call->data;
27802+ unsigned int i, size, n_u64;
27803+ char *name, *type;
27804+ bool is_signed;
27805+ int ret = 0;
27806+
27807+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
27808+ size = event->fields[i]->size;
27809+ is_signed = event->fields[i]->is_signed;
27810+ type = event->fields[i]->type;
27811+ name = event->fields[i]->name;
27812+ ret = trace_define_field(call, type, name, offset, size,
27813+ is_signed, FILTER_OTHER);
27814+ if (ret)
27815+ break;
1a6e0f06 27816+
e4b2b4a8
JK
27817+ if (event->fields[i]->is_string) {
27818+ offset += STR_VAR_LEN_MAX;
27819+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27820+ } else {
27821+ offset += sizeof(u64);
27822+ n_u64++;
27823+ }
1a6e0f06
JK
27824+ }
27825+
e4b2b4a8
JK
27826+ event->n_u64 = n_u64;
27827+
27828+ return ret;
27829+}
1a6e0f06 27830+
e4b2b4a8
JK
27831+static bool synth_field_signed(char *type)
27832+{
27833+ if (strncmp(type, "u", 1) == 0)
27834+ return false;
1a6e0f06 27835+
e4b2b4a8
JK
27836+ return true;
27837+}
1a6e0f06 27838+
e4b2b4a8
JK
27839+static int synth_field_is_string(char *type)
27840+{
27841+ if (strstr(type, "char[") != NULL)
27842+ return true;
1a6e0f06 27843+
e4b2b4a8 27844+ return false;
1a6e0f06
JK
27845+}
27846+
e4b2b4a8 27847+static int synth_field_string_size(char *type)
1a6e0f06 27848+{
e4b2b4a8
JK
27849+ char buf[4], *end, *start;
27850+ unsigned int len;
27851+ int size, err;
1a6e0f06 27852+
e4b2b4a8
JK
27853+ start = strstr(type, "char[");
27854+ if (start == NULL)
27855+ return -EINVAL;
27856+ start += strlen("char[");
1a6e0f06 27857+
e4b2b4a8
JK
27858+ end = strchr(type, ']');
27859+ if (!end || end < start)
27860+ return -EINVAL;
27861+
27862+ len = end - start;
27863+ if (len > 3)
27864+ return -EINVAL;
27865+
27866+ strncpy(buf, start, len);
27867+ buf[len] = '\0';
27868+
27869+ err = kstrtouint(buf, 0, &size);
27870+ if (err)
27871+ return err;
27872+
27873+ if (size > STR_VAR_LEN_MAX)
27874+ return -EINVAL;
27875+
27876+ return size;
1a6e0f06
JK
27877+}
27878+
e4b2b4a8
JK
27879+static int synth_field_size(char *type)
27880+{
27881+ int size = 0;
27882+
27883+ if (strcmp(type, "s64") == 0)
27884+ size = sizeof(s64);
27885+ else if (strcmp(type, "u64") == 0)
27886+ size = sizeof(u64);
27887+ else if (strcmp(type, "s32") == 0)
27888+ size = sizeof(s32);
27889+ else if (strcmp(type, "u32") == 0)
27890+ size = sizeof(u32);
27891+ else if (strcmp(type, "s16") == 0)
27892+ size = sizeof(s16);
27893+ else if (strcmp(type, "u16") == 0)
27894+ size = sizeof(u16);
27895+ else if (strcmp(type, "s8") == 0)
27896+ size = sizeof(s8);
27897+ else if (strcmp(type, "u8") == 0)
27898+ size = sizeof(u8);
27899+ else if (strcmp(type, "char") == 0)
27900+ size = sizeof(char);
27901+ else if (strcmp(type, "unsigned char") == 0)
27902+ size = sizeof(unsigned char);
27903+ else if (strcmp(type, "int") == 0)
27904+ size = sizeof(int);
27905+ else if (strcmp(type, "unsigned int") == 0)
27906+ size = sizeof(unsigned int);
27907+ else if (strcmp(type, "long") == 0)
27908+ size = sizeof(long);
27909+ else if (strcmp(type, "unsigned long") == 0)
27910+ size = sizeof(unsigned long);
27911+ else if (strcmp(type, "pid_t") == 0)
27912+ size = sizeof(pid_t);
27913+ else if (synth_field_is_string(type))
27914+ size = synth_field_string_size(type);
1a6e0f06 27915+
e4b2b4a8
JK
27916+ return size;
27917+}
27918+
27919+static const char *synth_field_fmt(char *type)
27920+{
27921+ const char *fmt = "%llu";
27922+
27923+ if (strcmp(type, "s64") == 0)
27924+ fmt = "%lld";
27925+ else if (strcmp(type, "u64") == 0)
27926+ fmt = "%llu";
27927+ else if (strcmp(type, "s32") == 0)
27928+ fmt = "%d";
27929+ else if (strcmp(type, "u32") == 0)
27930+ fmt = "%u";
27931+ else if (strcmp(type, "s16") == 0)
27932+ fmt = "%d";
27933+ else if (strcmp(type, "u16") == 0)
27934+ fmt = "%u";
27935+ else if (strcmp(type, "s8") == 0)
27936+ fmt = "%d";
27937+ else if (strcmp(type, "u8") == 0)
27938+ fmt = "%u";
27939+ else if (strcmp(type, "char") == 0)
27940+ fmt = "%d";
27941+ else if (strcmp(type, "unsigned char") == 0)
27942+ fmt = "%u";
27943+ else if (strcmp(type, "int") == 0)
27944+ fmt = "%d";
27945+ else if (strcmp(type, "unsigned int") == 0)
27946+ fmt = "%u";
27947+ else if (strcmp(type, "long") == 0)
27948+ fmt = "%ld";
27949+ else if (strcmp(type, "unsigned long") == 0)
27950+ fmt = "%lu";
27951+ else if (strcmp(type, "pid_t") == 0)
27952+ fmt = "%d";
27953+ else if (synth_field_is_string(type))
27954+ fmt = "%s";
27955+
27956+ return fmt;
27957+}
27958+
27959+static enum print_line_t print_synth_event(struct trace_iterator *iter,
27960+ int flags,
27961+ struct trace_event *event)
27962+{
27963+ struct trace_array *tr = iter->tr;
27964+ struct trace_seq *s = &iter->seq;
27965+ struct synth_trace_event *entry;
27966+ struct synth_event *se;
27967+ unsigned int i, n_u64;
27968+ char print_fmt[32];
27969+ const char *fmt;
27970+
27971+ entry = (struct synth_trace_event *)iter->ent;
27972+ se = container_of(event, struct synth_event, call.event);
27973+
27974+ trace_seq_printf(s, "%s: ", se->name);
27975+
27976+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
27977+ if (trace_seq_has_overflowed(s))
27978+ goto end;
27979+
27980+ fmt = synth_field_fmt(se->fields[i]->type);
27981+
27982+ /* parameter types */
27983+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
27984+ trace_seq_printf(s, "%s ", fmt);
27985+
27986+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
27987+
27988+ /* parameter values */
27989+ if (se->fields[i]->is_string) {
27990+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
27991+ (char *)&entry->fields[n_u64],
27992+ i == se->n_fields - 1 ? "" : " ");
27993+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27994+ } else {
27995+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
27996+ entry->fields[n_u64],
27997+ i == se->n_fields - 1 ? "" : " ");
27998+ n_u64++;
27999+ }
28000+ }
28001+end:
28002+ trace_seq_putc(s, '\n');
28003+
28004+ return trace_handle_return(s);
1a6e0f06
JK
28005+}
28006+
e4b2b4a8
JK
28007+static struct trace_event_functions synth_event_funcs = {
28008+ .trace = print_synth_event
28009+};
1a6e0f06 28010+
e4b2b4a8
JK
28011+static notrace void trace_event_raw_event_synth(void *__data,
28012+ u64 *var_ref_vals,
28013+ unsigned int var_ref_idx)
1a6e0f06 28014+{
e4b2b4a8
JK
28015+ struct trace_event_file *trace_file = __data;
28016+ struct synth_trace_event *entry;
28017+ struct trace_event_buffer fbuffer;
28018+ struct ring_buffer *buffer;
28019+ struct synth_event *event;
28020+ unsigned int i, n_u64;
28021+ int fields_size = 0;
1a6e0f06 28022+
e4b2b4a8
JK
28023+ event = trace_file->event_call->data;
28024+
28025+ if (trace_trigger_soft_disabled(trace_file))
1a6e0f06 28026+ return;
1a6e0f06 28027+
e4b2b4a8 28028+ fields_size = event->n_u64 * sizeof(u64);
1a6e0f06 28029+
e4b2b4a8
JK
28030+ /*
28031+ * Avoid ring buffer recursion detection, as this event
28032+ * is being performed within another event.
28033+ */
28034+ buffer = trace_file->tr->trace_buffer.buffer;
28035+ ring_buffer_nest_start(buffer);
28036+
28037+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28038+ sizeof(*entry) + fields_size);
28039+ if (!entry)
28040+ goto out;
28041+
28042+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28043+ if (event->fields[i]->is_string) {
28044+ char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28045+ char *str_field = (char *)&entry->fields[n_u64];
28046+
28047+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28048+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28049+ } else {
28050+ entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
28051+ n_u64++;
28052+ }
1a6e0f06
JK
28053+ }
28054+
e4b2b4a8
JK
28055+ trace_event_buffer_commit(&fbuffer);
28056+out:
28057+ ring_buffer_nest_end(buffer);
1a6e0f06 28058+}
1a6e0f06 28059+
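A note on the record layout used by synth_event_define_fields() and trace_event_raw_event_synth() above: every numeric field is widened to one u64 slot and every string field occupies STR_VAR_LEN_MAX bytes, i.e. a whole number of u64 slots, so fields_size is simply n_u64 * sizeof(u64). As a worked example (assuming STR_VAR_LEN_MAX is 32, as in the upstream version of this code, since the constant is defined elsewhere in the patch), a synthetic event declared as "u64 lat; char[16] comm" consumes 1 + 4 = 5 slots, giving a 40-byte payload after the common trace_entry header.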
e4b2b4a8 28060+static void free_synth_event_print_fmt(struct trace_event_call *call)
1a6e0f06 28061+{
e4b2b4a8
JK
28062+ if (call) {
28063+ kfree(call->print_fmt);
28064+ call->print_fmt = NULL;
1a6e0f06 28065+ }
e4b2b4a8 28066+}
1a6e0f06 28067+
e4b2b4a8
JK
28068+static int __set_synth_event_print_fmt(struct synth_event *event,
28069+ char *buf, int len)
28070+{
28071+ const char *fmt;
28072+ int pos = 0;
28073+ int i;
28074+
28075+ /* When len=0, we just calculate the needed length */
28076+#define LEN_OR_ZERO (len ? len - pos : 0)
28077+
28078+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
28079+ for (i = 0; i < event->n_fields; i++) {
28080+ fmt = synth_field_fmt(event->fields[i]->type);
28081+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
28082+ event->fields[i]->name, fmt,
28083+ i == event->n_fields - 1 ? "" : ", ");
1a6e0f06 28084+ }
e4b2b4a8 28085+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
1a6e0f06 28086+
e4b2b4a8
JK
28087+ for (i = 0; i < event->n_fields; i++) {
28088+ pos += snprintf(buf + pos, LEN_OR_ZERO,
28089+ ", REC->%s", event->fields[i]->name);
1a6e0f06
JK
28090+ }
28091+
e4b2b4a8 28092+#undef LEN_OR_ZERO
1a6e0f06 28093+
e4b2b4a8
JK
28094+ /* return the length of print_fmt */
28095+ return pos;
1a6e0f06 28096+}
1a6e0f06 28097+
e4b2b4a8
JK
28098+static int set_synth_event_print_fmt(struct trace_event_call *call)
28099+{
28100+ struct synth_event *event = call->data;
28101+ char *print_fmt;
28102+ int len;
1a6e0f06 28103+
e4b2b4a8
JK
28104+ /* First: called with 0 length to calculate the needed length */
28105+ len = __set_synth_event_print_fmt(event, NULL, 0);
1a6e0f06 28106+
e4b2b4a8
JK
28107+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
28108+ if (!print_fmt)
28109+ return -ENOMEM;
1a6e0f06 28110+
e4b2b4a8
JK
28111+ /* Second: actually write the @print_fmt */
28112+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
28113+ call->print_fmt = print_fmt;
28114+
28115+ return 0;
1a6e0f06
JK
28116+}
28117+
e4b2b4a8 28118+static void free_synth_field(struct synth_field *field)
1a6e0f06 28119+{
e4b2b4a8
JK
28120+ kfree(field->type);
28121+ kfree(field->name);
28122+ kfree(field);
1a6e0f06
JK
28123+}
28124+
e4b2b4a8
JK
28125+static struct synth_field *parse_synth_field(char *field_type,
28126+ char *field_name)
1a6e0f06 28127+{
e4b2b4a8
JK
28128+ struct synth_field *field;
28129+ int len, ret = 0;
28130+ char *array;
1a6e0f06 28131+
e4b2b4a8
JK
28132+ if (field_type[0] == ';')
28133+ field_type++;
1a6e0f06 28134+
e4b2b4a8
JK
28135+ len = strlen(field_name);
28136+ if (field_name[len - 1] == ';')
28137+ field_name[len - 1] = '\0';
1a6e0f06 28138+
e4b2b4a8
JK
28139+ field = kzalloc(sizeof(*field), GFP_KERNEL);
28140+ if (!field)
28141+ return ERR_PTR(-ENOMEM);
1a6e0f06 28142+
e4b2b4a8
JK
28143+ len = strlen(field_type) + 1;
28144+ array = strchr(field_name, '[');
28145+ if (array)
28146+ len += strlen(array);
28147+ field->type = kzalloc(len, GFP_KERNEL);
28148+ if (!field->type) {
28149+ ret = -ENOMEM;
28150+ goto free;
28151+ }
28152+ strcat(field->type, field_type);
28153+ if (array) {
28154+ strcat(field->type, array);
28155+ *array = '\0';
28156+ }
1a6e0f06 28157+
e4b2b4a8
JK
28158+ field->size = synth_field_size(field->type);
28159+ if (!field->size) {
28160+ ret = -EINVAL;
28161+ goto free;
1a6e0f06 28162+ }
1a6e0f06 28163+
e4b2b4a8
JK
28164+ if (synth_field_is_string(field->type))
28165+ field->is_string = true;
28166+
28167+ field->is_signed = synth_field_signed(field->type);
28168+
28169+ field->name = kstrdup(field_name, GFP_KERNEL);
28170+ if (!field->name) {
28171+ ret = -ENOMEM;
28172+ goto free;
28173+ }
28174+ out:
28175+ return field;
28176+ free:
28177+ free_synth_field(field);
28178+ field = ERR_PTR(ret);
28179+ goto out;
28180+}
28181+
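The array handling in parse_synth_field() above is easy to miss: when a field is declared as, say, "char comm[16]", the "[16]" suffix arrives attached to the field name, and it is moved over to the type string before the name is duplicated. A minimal userspace sketch of that split (illustrative only, not part of the patch; the buffer sizes and field strings are arbitrary):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char type[32] = "char";        /* field_type as passed in */
            char name[32] = "comm[16]";    /* field_name as passed in */
            char *array = strchr(name, '[');

            if (array) {
                    strcat(type, array);   /* type becomes "char[16]" */
                    *array = '\0';         /* name becomes "comm"     */
            }

            printf("type=%s name=%s\n", type, name);  /* type=char[16] name=comm */
            return 0;
    }

With the type rewritten this way, synth_field_size() and synth_field_string_size() above can treat "char[16]" as a 16-byte string field.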
28182+static void free_synth_tracepoint(struct tracepoint *tp)
1a6e0f06 28183+{
e4b2b4a8
JK
28184+ if (!tp)
28185+ return;
28186+
28187+ kfree(tp->name);
28188+ kfree(tp);
1a6e0f06 28189+}
1a6e0f06 28190+
e4b2b4a8 28191+static struct tracepoint *alloc_synth_tracepoint(char *name)
1a6e0f06 28192+{
e4b2b4a8 28193+ struct tracepoint *tp;
1a6e0f06 28194+
e4b2b4a8
JK
28195+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
28196+ if (!tp)
28197+ return ERR_PTR(-ENOMEM);
1a6e0f06 28198+
e4b2b4a8
JK
28199+ tp->name = kstrdup(name, GFP_KERNEL);
28200+ if (!tp->name) {
28201+ kfree(tp);
28202+ return ERR_PTR(-ENOMEM);
1a6e0f06 28203+ }
e4b2b4a8
JK
28204+
28205+ return tp;
1a6e0f06 28206+}
1a6e0f06 28207+
e4b2b4a8
JK
28208+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
28209+ unsigned int var_ref_idx);
1a6e0f06 28210+
e4b2b4a8
JK
28211+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
28212+ unsigned int var_ref_idx)
28213+{
28214+ struct tracepoint *tp = event->tp;
28215+
28216+ if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
28217+ struct tracepoint_func *probe_func_ptr;
28218+ synth_probe_func_t probe_func;
28219+ void *__data;
28220+
28221+ if (!(cpu_online(raw_smp_processor_id())))
28222+ return;
28223+
28224+ probe_func_ptr = rcu_dereference_sched((tp)->funcs);
28225+ if (probe_func_ptr) {
28226+ do {
28227+ probe_func = probe_func_ptr->func;
28228+ __data = probe_func_ptr->data;
28229+ probe_func(__data, var_ref_vals, var_ref_idx);
28230+ } while ((++probe_func_ptr)->func);
28231+ }
28232+ }
28233+}
28234+
28235+static struct synth_event *find_synth_event(const char *name)
28236+{
28237+ struct synth_event *event;
28238+
28239+ list_for_each_entry(event, &synth_event_list, list) {
28240+ if (strcmp(event->name, name) == 0)
28241+ return event;
28242+ }
28243+
28244+ return NULL;
28245+}
28246+
28247+static int register_synth_event(struct synth_event *event)
28248+{
28249+ struct trace_event_call *call = &event->call;
28250+ int ret = 0;
28251+
28252+ event->call.class = &event->class;
28253+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
28254+ if (!event->class.system) {
28255+ ret = -ENOMEM;
28256+ goto out;
28257+ }
28258+
28259+ event->tp = alloc_synth_tracepoint(event->name);
28260+ if (IS_ERR(event->tp)) {
28261+ ret = PTR_ERR(event->tp);
28262+ event->tp = NULL;
28263+ goto out;
28264+ }
28265+
28266+ INIT_LIST_HEAD(&call->class->fields);
28267+ call->event.funcs = &synth_event_funcs;
28268+ call->class->define_fields = synth_event_define_fields;
28269+
28270+ ret = register_trace_event(&call->event);
28271+ if (!ret) {
28272+ ret = -ENODEV;
28273+ goto out;
28274+ }
28275+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
28276+ call->class->reg = trace_event_reg;
28277+ call->class->probe = trace_event_raw_event_synth;
28278+ call->data = event;
28279+ call->tp = event->tp;
28280+
28281+ ret = trace_add_event_call(call);
28282+ if (ret) {
28283+ pr_warn("Failed to register synthetic event: %s\n",
28284+ trace_event_name(call));
28285+ goto err;
28286+ }
28287+
28288+ ret = set_synth_event_print_fmt(call);
28289+ if (ret < 0) {
28290+ trace_remove_event_call(call);
28291+ goto err;
28292+ }
28293+ out:
28294+ return ret;
28295+ err:
28296+ unregister_trace_event(&call->event);
28297+ goto out;
28298+}
28299+
28300+static int unregister_synth_event(struct synth_event *event)
28301+{
28302+ struct trace_event_call *call = &event->call;
28303+ int ret;
28304+
28305+ ret = trace_remove_event_call(call);
28306+
28307+ return ret;
28308+}
28309+
28310+static void free_synth_event(struct synth_event *event)
28311+{
28312+ unsigned int i;
28313+
28314+ if (!event)
28315+ return;
28316+
28317+ for (i = 0; i < event->n_fields; i++)
28318+ free_synth_field(event->fields[i]);
28319+
28320+ kfree(event->fields);
28321+ kfree(event->name);
28322+ kfree(event->class.system);
28323+ free_synth_tracepoint(event->tp);
28324+ free_synth_event_print_fmt(&event->call);
28325+ kfree(event);
28326+}
28327+
28328+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
28329+ struct synth_field **fields)
28330+{
28331+ struct synth_event *event;
28332+ unsigned int i;
1a6e0f06 28333+
e4b2b4a8
JK
28334+ event = kzalloc(sizeof(*event), GFP_KERNEL);
28335+ if (!event) {
28336+ event = ERR_PTR(-ENOMEM);
28337+ goto out;
28338+ }
1a6e0f06 28339+
e4b2b4a8
JK
28340+ event->name = kstrdup(event_name, GFP_KERNEL);
28341+ if (!event->name) {
28342+ kfree(event);
28343+ event = ERR_PTR(-ENOMEM);
28344+ goto out;
28345+ }
1a6e0f06 28346+
e4b2b4a8
JK
28347+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
28348+ if (!event->fields) {
28349+ free_synth_event(event);
28350+ event = ERR_PTR(-ENOMEM);
28351+ goto out;
28352+ }
1a6e0f06 28353+
e4b2b4a8
JK
28354+ for (i = 0; i < n_fields; i++)
28355+ event->fields[i] = fields[i];
1a6e0f06 28356+
e4b2b4a8
JK
28357+ event->n_fields = n_fields;
28358+ out:
28359+ return event;
28360+}
1a6e0f06 28361+
e4b2b4a8
JK
28362+static void action_trace(struct hist_trigger_data *hist_data,
28363+ struct tracing_map_elt *elt, void *rec,
28364+ struct ring_buffer_event *rbe,
28365+ struct action_data *data, u64 *var_ref_vals)
1a6e0f06 28366+{
e4b2b4a8 28367+ struct synth_event *event = data->onmatch.synth_event;
1a6e0f06 28368+
e4b2b4a8
JK
28369+ trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
28370+}
1a6e0f06 28371+
e4b2b4a8
JK
28372+struct hist_var_data {
28373+ struct list_head list;
28374+ struct hist_trigger_data *hist_data;
28375+};
1a6e0f06 28376+
e4b2b4a8
JK
28377+static void add_or_delete_synth_event(struct synth_event *event, int delete)
28378+{
28379+ if (delete)
28380+ free_synth_event(event);
28381+ else {
28382+ mutex_lock(&synth_event_mutex);
28383+ if (!find_synth_event(event->name))
28384+ list_add(&event->list, &synth_event_list);
28385+ else
28386+ free_synth_event(event);
28387+ mutex_unlock(&synth_event_mutex);
28388+ }
1a6e0f06
JK
28389+}
28390+
e4b2b4a8 28391+static int create_synth_event(int argc, char **argv)
1a6e0f06 28392+{
e4b2b4a8
JK
28393+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
28394+ struct synth_event *event = NULL;
28395+ bool delete_event = false;
28396+ int i, n_fields = 0, ret = 0;
28397+ char *name;
1a6e0f06 28398+
e4b2b4a8 28399+ mutex_lock(&synth_event_mutex);
1a6e0f06 28400+
e4b2b4a8
JK
28401+ /*
28402+ * Argument syntax:
28403+ * - Add synthetic event: <event_name> field[;field] ...
28404+ * - Remove synthetic event: !<event_name> field[;field] ...
28405+ * where 'field' = type field_name
28406+ */
28407+ if (argc < 1) {
28408+ ret = -EINVAL;
28409+ goto out;
28410+ }
1a6e0f06 28411+
e4b2b4a8
JK
28412+ name = argv[0];
28413+ if (name[0] == '!') {
28414+ delete_event = true;
28415+ name++;
28416+ }
1a6e0f06 28417+
e4b2b4a8
JK
28418+ event = find_synth_event(name);
28419+ if (event) {
28420+ if (delete_event) {
28421+ if (event->ref) {
28422+ event = NULL;
28423+ ret = -EBUSY;
28424+ goto out;
28425+ }
28426+ list_del(&event->list);
28427+ goto out;
1a6e0f06 28428+ }
e4b2b4a8
JK
28429+ event = NULL;
28430+ ret = -EEXIST;
28431+ goto out;
28432+ } else if (delete_event)
28433+ goto out;
28434+
28435+ if (argc < 2) {
28436+ ret = -EINVAL;
28437+ goto out;
1a6e0f06 28438+ }
1a6e0f06 28439+
e4b2b4a8
JK
28440+ for (i = 1; i < argc - 1; i++) {
28441+ if (strcmp(argv[i], ";") == 0)
28442+ continue;
28443+ if (n_fields == SYNTH_FIELDS_MAX) {
28444+ ret = -EINVAL;
28445+ goto err;
28446+ }
1a6e0f06 28447+
e4b2b4a8
JK
28448+ field = parse_synth_field(argv[i], argv[i + 1]);
28449+ if (IS_ERR(field)) {
28450+ ret = PTR_ERR(field);
28451+ goto err;
28452+ }
28453+ fields[n_fields] = field;
28454+ i++; n_fields++;
28455+ }
1a6e0f06 28456+
e4b2b4a8
JK
28457+ if (i < argc) {
28458+ ret = -EINVAL;
28459+ goto err;
28460+ }
1a6e0f06 28461+
e4b2b4a8
JK
28462+ event = alloc_synth_event(name, n_fields, fields);
28463+ if (IS_ERR(event)) {
28464+ ret = PTR_ERR(event);
28465+ event = NULL;
28466+ goto err;
1a6e0f06 28467+ }
e4b2b4a8
JK
28468+ out:
28469+ mutex_unlock(&synth_event_mutex);
1a6e0f06 28470+
e4b2b4a8
JK
28471+ if (event) {
28472+ if (delete_event) {
28473+ ret = unregister_synth_event(event);
28474+ add_or_delete_synth_event(event, !ret);
28475+ } else {
28476+ ret = register_synth_event(event);
28477+ add_or_delete_synth_event(event, ret);
28478+ }
28479+ }
28480+
28481+ return ret;
28482+ err:
28483+ mutex_unlock(&synth_event_mutex);
28484+
28485+ for (i = 0; i < n_fields; i++)
28486+ free_synth_field(fields[i]);
28487+ free_synth_event(event);
28488+
28489+ return ret;
1a6e0f06
JK
28490+}
28491+
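create_synth_event() is the parser hooked up to the synth_events control file this patch adds (via synth_events_write()/synth_events_fops further down), so the syntax described in the comment above is what user space writes into that file. A hedged usage sketch, with assumed paths and event names (the file typically lives under /sys/kernel/debug/tracing or /sys/kernel/tracing depending on where tracefs is mounted):

    echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> synth_events
    echo '!wakeup_latency' >> synth_events

The first line defines a synthetic event with three fields; the leading '!' on the second line takes the delete path in the function above, which refuses with -EBUSY while the event is still referenced by a hist trigger.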
e4b2b4a8 28492+static int release_all_synth_events(void)
1a6e0f06 28493+{
e4b2b4a8
JK
28494+ struct list_head release_events;
28495+ struct synth_event *event, *e;
28496+ int ret = 0;
1a6e0f06 28497+
e4b2b4a8
JK
28498+ INIT_LIST_HEAD(&release_events);
28499+
28500+ mutex_lock(&synth_event_mutex);
28501+
28502+ list_for_each_entry(event, &synth_event_list, list) {
28503+ if (event->ref) {
28504+ mutex_unlock(&synth_event_mutex);
28505+ return -EBUSY;
28506+ }
28507+ }
28508+
28509+ list_splice_init(&event->list, &release_events);
28510+
28511+ mutex_unlock(&synth_event_mutex);
28512+
28513+ list_for_each_entry_safe(event, e, &release_events, list) {
28514+ list_del(&event->list);
28515+
28516+ ret = unregister_synth_event(event);
28517+ add_or_delete_synth_event(event, !ret);
28518+ }
28519+
28520+ return ret;
1a6e0f06
JK
28521+}
28522+
e4b2b4a8
JK
28523+
28524+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
1a6e0f06 28525+{
e4b2b4a8 28526+ mutex_lock(&synth_event_mutex);
1a6e0f06 28527+
e4b2b4a8
JK
28528+ return seq_list_start(&synth_event_list, *pos);
28529+}
1a6e0f06 28530+
e4b2b4a8
JK
28531+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
28532+{
28533+ return seq_list_next(v, &synth_event_list, pos);
28534+}
1a6e0f06 28535+
e4b2b4a8
JK
28536+static void synth_events_seq_stop(struct seq_file *m, void *v)
28537+{
28538+ mutex_unlock(&synth_event_mutex);
1a6e0f06 28539+}
1a6e0f06 28540+
e4b2b4a8 28541+static int synth_events_seq_show(struct seq_file *m, void *v)
1a6e0f06 28542+{
e4b2b4a8
JK
28543+ struct synth_field *field;
28544+ struct synth_event *event = v;
28545+ unsigned int i;
1a6e0f06 28546+
e4b2b4a8 28547+ seq_printf(m, "%s\t", event->name);
1a6e0f06 28548+
e4b2b4a8
JK
28549+ for (i = 0; i < event->n_fields; i++) {
28550+ field = event->fields[i];
28551+
28552+ /* parameter values */
28553+ seq_printf(m, "%s %s%s", field->type, field->name,
28554+ i == event->n_fields - 1 ? "" : "; ");
1a6e0f06
JK
28555+ }
28556+
e4b2b4a8 28557+ seq_putc(m, '\n');
1a6e0f06
JK
28558+
28559+ return 0;
28560+}
1a6e0f06 28561+
e4b2b4a8
JK
28562+static const struct seq_operations synth_events_seq_op = {
28563+ .start = synth_events_seq_start,
28564+ .next = synth_events_seq_next,
28565+ .stop = synth_events_seq_stop,
28566+ .show = synth_events_seq_show
28567+};
28568+
28569+static int synth_events_open(struct inode *inode, struct file *file)
1a6e0f06 28570+{
e4b2b4a8 28571+ int ret;
1a6e0f06 28572+
e4b2b4a8
JK
28573+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
28574+ ret = release_all_synth_events();
28575+ if (ret < 0)
28576+ return ret;
28577+ }
1a6e0f06 28578+
e4b2b4a8 28579+ return seq_open(file, &synth_events_seq_op);
1a6e0f06 28580+}
e4b2b4a8
JK
28581+
28582+static ssize_t synth_events_write(struct file *file,
28583+ const char __user *buffer,
28584+ size_t count, loff_t *ppos)
1a6e0f06 28585+{
e4b2b4a8
JK
28586+ return trace_parse_run_command(file, buffer, count, ppos,
28587+ create_synth_event);
28588+}
1a6e0f06 28589+
e4b2b4a8
JK
28590+static const struct file_operations synth_events_fops = {
28591+ .open = synth_events_open,
28592+ .write = synth_events_write,
28593+ .read = seq_read,
28594+ .llseek = seq_lseek,
28595+ .release = seq_release,
28596+};
28597+
28598+static u64 hist_field_timestamp(struct hist_field *hist_field,
28599+ struct tracing_map_elt *elt,
28600+ struct ring_buffer_event *rbe,
28601+ void *event)
28602+{
28603+ struct hist_trigger_data *hist_data = hist_field->hist_data;
28604+ struct trace_array *tr = hist_data->event_file->tr;
28605+
28606+ u64 ts = ring_buffer_event_time_stamp(rbe);
28607+
28608+ if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
28609+ ts = ns2usecs(ts);
28610+
28611+ return ts;
1a6e0f06
JK
28612+}
28613+
e4b2b4a8
JK
28614+static u64 hist_field_cpu(struct hist_field *hist_field,
28615+ struct tracing_map_elt *elt,
28616+ struct ring_buffer_event *rbe,
28617+ void *event)
1a6e0f06 28618+{
e4b2b4a8
JK
28619+ int cpu = smp_processor_id();
28620+
28621+ return cpu;
1a6e0f06
JK
28622+}
28623+
e4b2b4a8
JK
28624+static struct hist_field *
28625+check_field_for_var_ref(struct hist_field *hist_field,
28626+ struct hist_trigger_data *var_data,
28627+ unsigned int var_idx)
1a6e0f06 28628+{
e4b2b4a8
JK
28629+ struct hist_field *found = NULL;
28630+
28631+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
28632+ if (hist_field->var.idx == var_idx &&
28633+ hist_field->var.hist_data == var_data) {
28634+ found = hist_field;
28635+ }
28636+ }
28637+
28638+ return found;
1a6e0f06
JK
28639+}
28640+
e4b2b4a8
JK
28641+static struct hist_field *
28642+check_field_for_var_refs(struct hist_trigger_data *hist_data,
28643+ struct hist_field *hist_field,
28644+ struct hist_trigger_data *var_data,
28645+ unsigned int var_idx,
28646+ unsigned int level)
28647+{
28648+ struct hist_field *found = NULL;
28649+ unsigned int i;
28650+
28651+ if (level > 3)
28652+ return found;
28653+
28654+ if (!hist_field)
28655+ return found;
28656+
28657+ found = check_field_for_var_ref(hist_field, var_data, var_idx);
28658+ if (found)
28659+ return found;
28660+
28661+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28662+ struct hist_field *operand;
28663+
28664+ operand = hist_field->operands[i];
28665+ found = check_field_for_var_refs(hist_data, operand, var_data,
28666+ var_idx, level + 1);
28667+ if (found)
28668+ return found;
28669+ }
28670+
28671+ return found;
28672+}
28673+
28674+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
28675+ struct hist_trigger_data *var_data,
28676+ unsigned int var_idx)
28677+{
28678+ struct hist_field *hist_field, *found = NULL;
28679+ unsigned int i;
28680+
28681+ for_each_hist_field(i, hist_data) {
28682+ hist_field = hist_data->fields[i];
28683+ found = check_field_for_var_refs(hist_data, hist_field,
28684+ var_data, var_idx, 0);
28685+ if (found)
28686+ return found;
28687+ }
28688+
28689+ for (i = 0; i < hist_data->n_synth_var_refs; i++) {
28690+ hist_field = hist_data->synth_var_refs[i];
28691+ found = check_field_for_var_refs(hist_data, hist_field,
28692+ var_data, var_idx, 0);
28693+ if (found)
28694+ return found;
28695+ }
28696+
28697+ return found;
28698+}
28699+
28700+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
28701+ unsigned int var_idx)
1a6e0f06 28702+{
e4b2b4a8
JK
28703+ struct trace_array *tr = hist_data->event_file->tr;
28704+ struct hist_field *found = NULL;
28705+ struct hist_var_data *var_data;
1a6e0f06 28706+
e4b2b4a8
JK
28707+ list_for_each_entry(var_data, &tr->hist_vars, list) {
28708+ if (var_data->hist_data == hist_data)
28709+ continue;
28710+ found = find_var_ref(var_data->hist_data, hist_data, var_idx);
28711+ if (found)
28712+ break;
28713+ }
1a6e0f06 28714+
e4b2b4a8 28715+ return found;
1a6e0f06
JK
28716+}
28717+
e4b2b4a8 28718+static bool check_var_refs(struct hist_trigger_data *hist_data)
1a6e0f06 28719+{
e4b2b4a8
JK
28720+ struct hist_field *field;
28721+ bool found = false;
28722+ int i;
1a6e0f06 28723+
e4b2b4a8
JK
28724+ for_each_hist_field(i, hist_data) {
28725+ field = hist_data->fields[i];
28726+ if (field && field->flags & HIST_FIELD_FL_VAR) {
28727+ if (find_any_var_ref(hist_data, field->var.idx)) {
28728+ found = true;
28729+ break;
28730+ }
28731+ }
28732+ }
1a6e0f06 28733+
e4b2b4a8 28734+ return found;
1a6e0f06
JK
28735+}
28736+
e4b2b4a8 28737+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 28738+{
e4b2b4a8
JK
28739+ struct trace_array *tr = hist_data->event_file->tr;
28740+ struct hist_var_data *var_data, *found = NULL;
1a6e0f06 28741+
e4b2b4a8
JK
28742+ list_for_each_entry(var_data, &tr->hist_vars, list) {
28743+ if (var_data->hist_data == hist_data) {
28744+ found = var_data;
28745+ break;
1a6e0f06 28746+ }
e4b2b4a8 28747+ }
1a6e0f06 28748+
e4b2b4a8
JK
28749+ return found;
28750+}
28751+
28752+static bool field_has_hist_vars(struct hist_field *hist_field,
28753+ unsigned int level)
28754+{
28755+ int i;
28756+
28757+ if (level > 3)
28758+ return false;
28759+
28760+ if (!hist_field)
28761+ return false;
28762+
28763+ if (hist_field->flags & HIST_FIELD_FL_VAR ||
28764+ hist_field->flags & HIST_FIELD_FL_VAR_REF)
28765+ return true;
28766+
28767+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28768+ struct hist_field *operand;
28769+
28770+ operand = hist_field->operands[i];
28771+ if (field_has_hist_vars(operand, level + 1))
28772+ return true;
1a6e0f06 28773+ }
e4b2b4a8
JK
28774+
28775+ return false;
1a6e0f06
JK
28776+}
28777+
e4b2b4a8
JK
28778+static bool has_hist_vars(struct hist_trigger_data *hist_data)
28779+{
28780+ struct hist_field *hist_field;
28781+ int i;
1a6e0f06 28782+
e4b2b4a8
JK
28783+ for_each_hist_field(i, hist_data) {
28784+ hist_field = hist_data->fields[i];
28785+ if (field_has_hist_vars(hist_field, 0))
28786+ return true;
28787+ }
1a6e0f06 28788+
e4b2b4a8
JK
28789+ return false;
28790+}
1a6e0f06 28791+
e4b2b4a8 28792+static int save_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 28793+{
e4b2b4a8
JK
28794+ struct trace_array *tr = hist_data->event_file->tr;
28795+ struct hist_var_data *var_data;
1a6e0f06 28796+
e4b2b4a8
JK
28797+ var_data = find_hist_vars(hist_data);
28798+ if (var_data)
28799+ return 0;
28800+
28801+ if (trace_array_get(tr) < 0)
28802+ return -ENODEV;
28803+
28804+ var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
28805+ if (!var_data) {
28806+ trace_array_put(tr);
28807+ return -ENOMEM;
28808+ }
28809+
28810+ var_data->hist_data = hist_data;
28811+ list_add(&var_data->list, &tr->hist_vars);
28812+
28813+ return 0;
1a6e0f06
JK
28814+}
28815+
e4b2b4a8 28816+static void remove_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 28817+{
e4b2b4a8
JK
28818+ struct trace_array *tr = hist_data->event_file->tr;
28819+ struct hist_var_data *var_data;
1a6e0f06 28820+
e4b2b4a8
JK
28821+ var_data = find_hist_vars(hist_data);
28822+ if (!var_data)
28823+ return;
28824+
28825+ if (WARN_ON(check_var_refs(hist_data)))
28826+ return;
28827+
28828+ list_del(&var_data->list);
28829+
28830+ kfree(var_data);
28831+
28832+ trace_array_put(tr);
1a6e0f06
JK
28833+}
28834+
e4b2b4a8
JK
28835+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
28836+ const char *var_name)
1a6e0f06 28837+{
e4b2b4a8 28838+ struct hist_field *hist_field, *found = NULL;
1a6e0f06
JK
28839+ int i;
28840+
e4b2b4a8
JK
28841+ for_each_hist_field(i, hist_data) {
28842+ hist_field = hist_data->fields[i];
28843+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
28844+ strcmp(hist_field->var.name, var_name) == 0) {
28845+ found = hist_field;
28846+ break;
28847+ }
28848+ }
1a6e0f06 28849+
e4b2b4a8
JK
28850+ return found;
28851+}
1a6e0f06 28852+
e4b2b4a8
JK
28853+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
28854+ struct trace_event_file *file,
28855+ const char *var_name)
28856+{
28857+ struct hist_trigger_data *test_data;
28858+ struct event_trigger_data *test;
28859+ struct hist_field *hist_field;
28860+
28861+ hist_field = find_var_field(hist_data, var_name);
28862+ if (hist_field)
28863+ return hist_field;
28864+
28865+ list_for_each_entry_rcu(test, &file->triggers, list) {
28866+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28867+ test_data = test->private_data;
28868+ hist_field = find_var_field(test_data, var_name);
28869+ if (hist_field)
28870+ return hist_field;
28871+ }
28872+ }
28873+
28874+ return NULL;
28875+}
28876+
28877+static struct trace_event_file *find_var_file(struct trace_array *tr,
28878+ char *system,
28879+ char *event_name,
28880+ char *var_name)
28881+{
28882+ struct hist_trigger_data *var_hist_data;
28883+ struct hist_var_data *var_data;
28884+ struct trace_event_file *file, *found = NULL;
28885+
28886+ if (system)
28887+ return find_event_file(tr, system, event_name);
28888+
28889+ list_for_each_entry(var_data, &tr->hist_vars, list) {
28890+ var_hist_data = var_data->hist_data;
28891+ file = var_hist_data->event_file;
28892+ if (file == found)
28893+ continue;
28894+
28895+ if (find_var_field(var_hist_data, var_name)) {
28896+ if (found) {
28897+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28898+ return NULL;
1a6e0f06 28899+ }
e4b2b4a8
JK
28900+
28901+ found = file;
1a6e0f06
JK
28902+ }
28903+ }
28904+
e4b2b4a8
JK
28905+ return found;
28906+}
28907+
28908+static struct hist_field *find_file_var(struct trace_event_file *file,
28909+ const char *var_name)
28910+{
28911+ struct hist_trigger_data *test_data;
28912+ struct event_trigger_data *test;
28913+ struct hist_field *hist_field;
28914+
28915+ list_for_each_entry_rcu(test, &file->triggers, list) {
28916+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28917+ test_data = test->private_data;
28918+ hist_field = find_var_field(test_data, var_name);
28919+ if (hist_field)
28920+ return hist_field;
28921+ }
1a6e0f06 28922+ }
e4b2b4a8
JK
28923+
28924+ return NULL;
1a6e0f06 28925+}
e4b2b4a8
JK
28926+
28927+static struct hist_field *
28928+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
1a6e0f06 28929+{
e4b2b4a8
JK
28930+ struct trace_array *tr = hist_data->event_file->tr;
28931+ struct hist_field *hist_field, *found = NULL;
28932+ struct trace_event_file *file;
28933+ unsigned int i;
1a6e0f06 28934+
e4b2b4a8
JK
28935+ for (i = 0; i < hist_data->n_actions; i++) {
28936+ struct action_data *data = hist_data->actions[i];
28937+
28938+ if (data->fn == action_trace) {
28939+ char *system = data->onmatch.match_event_system;
28940+ char *event_name = data->onmatch.match_event;
28941+
28942+ file = find_var_file(tr, system, event_name, var_name);
28943+ if (!file)
28944+ continue;
28945+ hist_field = find_file_var(file, var_name);
28946+ if (hist_field) {
28947+ if (found) {
28948+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28949+ return ERR_PTR(-EINVAL);
28950+ }
28951+
28952+ found = hist_field;
28953+ }
28954+ }
1a6e0f06 28955+ }
e4b2b4a8 28956+ return found;
1a6e0f06 28957+}
1a6e0f06 28958+
e4b2b4a8
JK
28959+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
28960+ char *system,
28961+ char *event_name,
28962+ char *var_name)
28963+{
28964+ struct trace_array *tr = hist_data->event_file->tr;
28965+ struct hist_field *hist_field = NULL;
28966+ struct trace_event_file *file;
1a6e0f06 28967+
e4b2b4a8
JK
28968+ if (!system || !event_name) {
28969+ hist_field = find_match_var(hist_data, var_name);
28970+ if (IS_ERR(hist_field))
28971+ return NULL;
28972+ if (hist_field)
28973+ return hist_field;
28974+ }
28975+
28976+ file = find_var_file(tr, system, event_name, var_name);
28977+ if (!file)
28978+ return NULL;
28979+
28980+ hist_field = find_file_var(file, var_name);
28981+
28982+ return hist_field;
28983+}
28984+
28985+struct hist_elt_data {
28986+ char *comm;
28987+ u64 *var_ref_vals;
28988+ char *field_var_str[SYNTH_FIELDS_MAX];
28989 };
1a6e0f06 28990
e4b2b4a8
JK
28991+static u64 hist_field_var_ref(struct hist_field *hist_field,
28992+ struct tracing_map_elt *elt,
28993+ struct ring_buffer_event *rbe,
28994+ void *event)
1a6e0f06 28995+{
e4b2b4a8
JK
28996+ struct hist_elt_data *elt_data;
28997+ u64 var_val = 0;
1a6e0f06 28998+
e4b2b4a8
JK
28999+ elt_data = elt->private_data;
29000+ var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29001+
29002+ return var_val;
1a6e0f06 29003+}
1a6e0f06 29004+
e4b2b4a8
JK
29005+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29006+ u64 *var_ref_vals, bool self)
1a6e0f06 29007+{
e4b2b4a8
JK
29008+ struct hist_trigger_data *var_data;
29009+ struct tracing_map_elt *var_elt;
29010+ struct hist_field *hist_field;
29011+ unsigned int i, var_idx;
29012+ bool resolved = true;
29013+ u64 var_val = 0;
1a6e0f06 29014+
e4b2b4a8
JK
29015+ for (i = 0; i < hist_data->n_var_refs; i++) {
29016+ hist_field = hist_data->var_refs[i];
29017+ var_idx = hist_field->var.idx;
29018+ var_data = hist_field->var.hist_data;
1a6e0f06 29019+
e4b2b4a8
JK
29020+ if (var_data == NULL) {
29021+ resolved = false;
29022+ break;
29023+ }
1a6e0f06 29024+
e4b2b4a8
JK
29025+ if ((self && var_data != hist_data) ||
29026+ (!self && var_data == hist_data))
29027+ continue;
29028+
29029+ var_elt = tracing_map_lookup(var_data->map, key);
29030+ if (!var_elt) {
29031+ resolved = false;
29032+ break;
29033+ }
29034+
29035+ if (!tracing_map_var_set(var_elt, var_idx)) {
29036+ resolved = false;
29037+ break;
29038+ }
29039+
29040+ if (self || !hist_field->read_once)
29041+ var_val = tracing_map_read_var(var_elt, var_idx);
29042+ else
29043+ var_val = tracing_map_read_var_once(var_elt, var_idx);
29044+
29045+ var_ref_vals[i] = var_val;
1a6e0f06 29046+ }
e4b2b4a8
JK
29047+
29048+ return resolved;
1a6e0f06
JK
29049+}
29050+
e4b2b4a8
JK
29051+static const char *hist_field_name(struct hist_field *field,
29052+ unsigned int level)
1a6e0f06 29053+{
e4b2b4a8
JK
29054+ const char *field_name = "";
29055+
29056+ if (level > 1)
29057+ return field_name;
29058+
29059+ if (field->field)
29060+ field_name = field->field->name;
29061+ else if (field->flags & HIST_FIELD_FL_LOG2 ||
29062+ field->flags & HIST_FIELD_FL_ALIAS)
29063+ field_name = hist_field_name(field->operands[0], ++level);
29064+ else if (field->flags & HIST_FIELD_FL_CPU)
29065+ field_name = "cpu";
29066+ else if (field->flags & HIST_FIELD_FL_EXPR ||
29067+ field->flags & HIST_FIELD_FL_VAR_REF) {
29068+ if (field->system) {
29069+ static char full_name[MAX_FILTER_STR_VAL];
29070+
29071+ strcat(full_name, field->system);
29072+ strcat(full_name, ".");
29073+ strcat(full_name, field->event_name);
29074+ strcat(full_name, ".");
29075+ strcat(full_name, field->name);
29076+ field_name = full_name;
29077+ } else
29078+ field_name = field->name;
29079+ } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
29080+ field_name = "common_timestamp";
29081+
29082+ if (field_name == NULL)
29083+ field_name = "";
29084+
29085+ return field_name;
1a6e0f06
JK
29086+}
29087+
e4b2b4a8
JK
29088 static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
29089 {
29090 hist_field_fn_t fn = NULL;
29091@@ -207,16 +1771,119 @@
29092
29093 static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
29094 {
29095+ unsigned int i;
1a6e0f06 29096+
e4b2b4a8
JK
29097 if (!attrs)
29098 return;
29099
29100+ for (i = 0; i < attrs->n_assignments; i++)
29101+ kfree(attrs->assignment_str[i]);
1a6e0f06 29102+
e4b2b4a8
JK
29103+ for (i = 0; i < attrs->n_actions; i++)
29104+ kfree(attrs->action_str[i]);
1a6e0f06 29105+
e4b2b4a8
JK
29106 kfree(attrs->name);
29107 kfree(attrs->sort_key_str);
29108 kfree(attrs->keys_str);
29109 kfree(attrs->vals_str);
29110+ kfree(attrs->clock);
29111 kfree(attrs);
29112 }
29113
29114+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
29115+{
29116+ int ret = -EINVAL;
1a6e0f06 29117+
e4b2b4a8
JK
29118+ if (attrs->n_actions >= HIST_ACTIONS_MAX)
29119+ return ret;
1a6e0f06 29120+
e4b2b4a8
JK
29121+ if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
29122+ (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
29123+ attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
29124+ if (!attrs->action_str[attrs->n_actions]) {
29125+ ret = -ENOMEM;
29126+ return ret;
29127+ }
29128+ attrs->n_actions++;
29129+ ret = 0;
1a6e0f06
JK
29130+ }
29131+
e4b2b4a8 29132+ return ret;
1a6e0f06
JK
29133+}
29134+
e4b2b4a8 29135+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
1a6e0f06 29136+{
e4b2b4a8
JK
29137+ int ret = 0;
29138+
29139+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29140+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
29141+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
29142+ if (!attrs->keys_str) {
29143+ ret = -ENOMEM;
29144+ goto out;
29145+ }
29146+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29147+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29148+ (strncmp(str, "values=", strlen("values=")) == 0)) {
29149+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
29150+ if (!attrs->vals_str) {
29151+ ret = -ENOMEM;
29152+ goto out;
29153+ }
29154+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
29155+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29156+ if (!attrs->sort_key_str) {
29157+ ret = -ENOMEM;
29158+ goto out;
29159+ }
29160+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
29161+ attrs->name = kstrdup(str, GFP_KERNEL);
29162+ if (!attrs->name) {
29163+ ret = -ENOMEM;
29164+ goto out;
29165+ }
29166+ } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
29167+ strsep(&str, "=");
29168+ if (!str) {
29169+ ret = -EINVAL;
29170+ goto out;
29171+ }
29172+
29173+ str = strstrip(str);
29174+ attrs->clock = kstrdup(str, GFP_KERNEL);
29175+ if (!attrs->clock) {
29176+ ret = -ENOMEM;
29177+ goto out;
29178+ }
29179+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
29180+ int map_bits = parse_map_size(str);
29181+
29182+ if (map_bits < 0) {
29183+ ret = map_bits;
29184+ goto out;
29185+ }
29186+ attrs->map_bits = map_bits;
29187+ } else {
29188+ char *assignment;
29189+
29190+ if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
29191+ hist_err("Too many variables defined: ", str);
29192+ ret = -EINVAL;
29193+ goto out;
29194+ }
29195+
29196+ assignment = kstrdup(str, GFP_KERNEL);
29197+ if (!assignment) {
29198+ ret = -ENOMEM;
29199+ goto out;
29200+ }
29201+
29202+ attrs->assignment_str[attrs->n_assignments++] = assignment;
1a6e0f06 29203+ }
e4b2b4a8
JK
29204+ out:
29205+ return ret;
1a6e0f06
JK
29206+}
29207+
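parse_assignment() is what gives hist triggers named variables: any "key=value" token that is not one of the recognized keywords (key=/keys=, val=/vals=/values=, sort=, name=, clock=, size=) is kept verbatim in assignment_str[] and parsed later as a variable definition. For example (the event path is illustrative), a trigger such as

    echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger

stores "ts0=common_timestamp.usecs" as an assignment, making $ts0 available to expressions and to the onmatch()/onmax() actions handled by parse_action() above, while clock= and size= are still consumed here as ordinary attributes.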
e4b2b4a8
JK
29208 static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
29209 {
29210 struct hist_trigger_attrs *attrs;
29211@@ -229,35 +1896,21 @@
29212 while (trigger_str) {
29213 char *str = strsep(&trigger_str, ":");
29214
29215- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29216- (strncmp(str, "keys=", strlen("keys=")) == 0))
29217- attrs->keys_str = kstrdup(str, GFP_KERNEL);
29218- else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29219- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29220- (strncmp(str, "values=", strlen("values=")) == 0))
29221- attrs->vals_str = kstrdup(str, GFP_KERNEL);
29222- else if (strncmp(str, "sort=", strlen("sort=")) == 0)
29223- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29224- else if (strncmp(str, "name=", strlen("name=")) == 0)
29225- attrs->name = kstrdup(str, GFP_KERNEL);
29226- else if (strcmp(str, "pause") == 0)
29227+ if (strchr(str, '=')) {
29228+ ret = parse_assignment(str, attrs);
29229+ if (ret)
29230+ goto free;
29231+ } else if (strcmp(str, "pause") == 0)
29232 attrs->pause = true;
29233 else if ((strcmp(str, "cont") == 0) ||
29234 (strcmp(str, "continue") == 0))
29235 attrs->cont = true;
29236 else if (strcmp(str, "clear") == 0)
29237 attrs->clear = true;
29238- else if (strncmp(str, "size=", strlen("size=")) == 0) {
29239- int map_bits = parse_map_size(str);
29240-
29241- if (map_bits < 0) {
29242- ret = map_bits;
29243+ else {
29244+ ret = parse_action(str, attrs);
29245+ if (ret)
29246 goto free;
29247- }
29248- attrs->map_bits = map_bits;
29249- } else {
29250- ret = -EINVAL;
29251- goto free;
29252 }
29253 }
1a6e0f06 29254
e4b2b4a8
JK
29255@@ -266,6 +1919,14 @@
29256 goto free;
29257 }
1a6e0f06 29258
e4b2b4a8
JK
29259+ if (!attrs->clock) {
29260+ attrs->clock = kstrdup("global", GFP_KERNEL);
29261+ if (!attrs->clock) {
29262+ ret = -ENOMEM;
29263+ goto free;
29264+ }
29265+ }
29266+
29267 return attrs;
29268 free:
29269 destroy_hist_trigger_attrs(attrs);
29270@@ -288,65 +1949,222 @@
29271 memcpy(comm, task->comm, TASK_COMM_LEN);
29272 }
1a6e0f06 29273
e4b2b4a8
JK
29274-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
29275+static void hist_elt_data_free(struct hist_elt_data *elt_data)
29276 {
29277- kfree((char *)elt->private_data);
29278+ unsigned int i;
29279+
29280+ for (i = 0; i < SYNTH_FIELDS_MAX; i++)
29281+ kfree(elt_data->field_var_str[i]);
29282+
29283+ kfree(elt_data->comm);
29284+ kfree(elt_data);
1a6e0f06
JK
29285 }
29286
e4b2b4a8
JK
29287-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
29288+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
1a6e0f06 29289+{
e4b2b4a8 29290+ struct hist_elt_data *elt_data = elt->private_data;
1a6e0f06 29291+
e4b2b4a8 29292+ hist_elt_data_free(elt_data);
1a6e0f06
JK
29293+}
29294+
e4b2b4a8
JK
29295+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
29296 {
29297 struct hist_trigger_data *hist_data = elt->map->private_data;
29298+ unsigned int size = TASK_COMM_LEN;
29299+ struct hist_elt_data *elt_data;
29300 struct hist_field *key_field;
29301- unsigned int i;
29302+ unsigned int i, n_str;
1a6e0f06 29303+
e4b2b4a8
JK
29304+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
29305+ if (!elt_data)
29306+ return -ENOMEM;
29307
29308 for_each_hist_key_field(i, hist_data) {
29309 key_field = hist_data->fields[i];
29310
29311 if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
29312- unsigned int size = TASK_COMM_LEN + 1;
29313-
29314- elt->private_data = kzalloc(size, GFP_KERNEL);
29315- if (!elt->private_data)
29316+ elt_data->comm = kzalloc(size, GFP_KERNEL);
29317+ if (!elt_data->comm) {
29318+ kfree(elt_data);
29319 return -ENOMEM;
29320+ }
29321 break;
29322 }
29323 }
29324
29325+ n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
29326+
29327+ size = STR_VAR_LEN_MAX;
1a6e0f06 29328+
e4b2b4a8
JK
29329+ for (i = 0; i < n_str; i++) {
29330+ elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
29331+ if (!elt_data->field_var_str[i]) {
29332+ hist_elt_data_free(elt_data);
29333+ return -ENOMEM;
29334+ }
29335+ }
1a6e0f06 29336+
e4b2b4a8 29337+ elt->private_data = elt_data;
1a6e0f06 29338+
e4b2b4a8
JK
29339 return 0;
29340 }
29341
29342-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
29343- struct tracing_map_elt *from)
29344+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
29345 {
29346- char *comm_from = from->private_data;
29347- char *comm_to = to->private_data;
29348+ struct hist_elt_data *elt_data = elt->private_data;
29349
29350- if (comm_from)
29351- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
29352+ if (elt_data->comm)
29353+ save_comm(elt_data->comm, current);
29354 }
29355
29356-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
29357+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
29358+ .elt_alloc = hist_trigger_elt_data_alloc,
29359+ .elt_free = hist_trigger_elt_data_free,
29360+ .elt_init = hist_trigger_elt_data_init,
29361+};
29362+
29363+static const char *get_hist_field_flags(struct hist_field *hist_field)
29364 {
29365- char *comm = elt->private_data;
29366+ const char *flags_str = NULL;
29367
29368- if (comm)
29369- save_comm(comm, current);
29370+ if (hist_field->flags & HIST_FIELD_FL_HEX)
29371+ flags_str = "hex";
29372+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
29373+ flags_str = "sym";
29374+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
29375+ flags_str = "sym-offset";
29376+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
29377+ flags_str = "execname";
29378+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
29379+ flags_str = "syscall";
29380+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
29381+ flags_str = "log2";
29382+ else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29383+ flags_str = "usecs";
29384+
29385+ return flags_str;
29386 }
29387
29388-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29389- .elt_alloc = hist_trigger_elt_comm_alloc,
29390- .elt_copy = hist_trigger_elt_comm_copy,
29391- .elt_free = hist_trigger_elt_comm_free,
29392- .elt_init = hist_trigger_elt_comm_init,
29393-};
29394+static void expr_field_str(struct hist_field *field, char *expr)
1a6e0f06 29395+{
e4b2b4a8
JK
29396+ if (field->flags & HIST_FIELD_FL_VAR_REF)
29397+ strcat(expr, "$");
29398
29399-static void destroy_hist_field(struct hist_field *hist_field)
29400+ strcat(expr, hist_field_name(field, 0));
1a6e0f06 29401+
e4b2b4a8
JK
29402+ if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
29403+ const char *flags_str = get_hist_field_flags(field);
1a6e0f06 29404+
e4b2b4a8
JK
29405+ if (flags_str) {
29406+ strcat(expr, ".");
29407+ strcat(expr, flags_str);
29408+ }
29409+ }
1a6e0f06
JK
29410+}
29411+
e4b2b4a8 29412+static char *expr_str(struct hist_field *field, unsigned int level)
1a6e0f06 29413+{
e4b2b4a8 29414+ char *expr;
1a6e0f06 29415+
e4b2b4a8
JK
29416+ if (level > 1)
29417+ return NULL;
1a6e0f06 29418+
e4b2b4a8
JK
29419+ expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
29420+ if (!expr)
29421+ return NULL;
1a6e0f06 29422+
e4b2b4a8
JK
29423+ if (!field->operands[0]) {
29424+ expr_field_str(field, expr);
29425+ return expr;
29426+ }
1a6e0f06 29427+
e4b2b4a8
JK
29428+ if (field->operator == FIELD_OP_UNARY_MINUS) {
29429+ char *subexpr;
1a6e0f06 29430+
e4b2b4a8
JK
29431+ strcat(expr, "-(");
29432+ subexpr = expr_str(field->operands[0], ++level);
29433+ if (!subexpr) {
29434+ kfree(expr);
29435+ return NULL;
1a6e0f06 29436+ }
e4b2b4a8
JK
29437+ strcat(expr, subexpr);
29438+ strcat(expr, ")");
1a6e0f06 29439+
e4b2b4a8 29440+ kfree(subexpr);
1a6e0f06 29441+
e4b2b4a8
JK
29442+ return expr;
29443+ }
1a6e0f06 29444+
e4b2b4a8 29445+ expr_field_str(field->operands[0], expr);
1a6e0f06 29446+
e4b2b4a8
JK
29447+ switch (field->operator) {
29448+ case FIELD_OP_MINUS:
29449+ strcat(expr, "-");
29450+ break;
29451+ case FIELD_OP_PLUS:
29452+ strcat(expr, "+");
29453+ break;
29454+ default:
29455+ kfree(expr);
29456+ return NULL;
29457+ }
1a6e0f06 29458+
e4b2b4a8 29459+ expr_field_str(field->operands[1], expr);
1a6e0f06 29460+
e4b2b4a8 29461+ return expr;
1a6e0f06 29462+}
1a6e0f06 29463+
e4b2b4a8 29464+static int contains_operator(char *str)
1a6e0f06 29465+{
e4b2b4a8
JK
29466+ enum field_op_id field_op = FIELD_OP_NONE;
29467+ char *op;
1a6e0f06 29468+
e4b2b4a8
JK
29469+ op = strpbrk(str, "+-");
29470+ if (!op)
29471+ return FIELD_OP_NONE;
1a6e0f06 29472+
e4b2b4a8
JK
29473+ switch (*op) {
29474+ case '-':
29475+ if (*str == '-')
29476+ field_op = FIELD_OP_UNARY_MINUS;
29477+ else
29478+ field_op = FIELD_OP_MINUS;
29479+ break;
29480+ case '+':
29481+ field_op = FIELD_OP_PLUS;
29482+ break;
29483+ default:
29484+ break;
1a6e0f06 29485+ }
1a6e0f06 29486+
e4b2b4a8
JK
29487+ return field_op;
29488+}
1a6e0f06 29489+
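contains_operator() above only looks at the first '+' or '-' located by strpbrk() and classifies the whole string from that single hit: if the expression itself begins with '-' it is reported as a unary minus, otherwise the hit decides between FIELD_OP_MINUS and FIELD_OP_PLUS. So, as examples, "common_timestamp.usecs-$ts0" is classified as a subtraction and "-$lat" as a unary minus; only this one operator level is detected here, with any further decomposition left to the expression-parsing code later in the patch.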
e4b2b4a8
JK
29490+static void destroy_hist_field(struct hist_field *hist_field,
29491+ unsigned int level)
29492 {
29493+ unsigned int i;
1a6e0f06 29494+
e4b2b4a8
JK
29495+ if (level > 3)
29496+ return;
1a6e0f06 29497+
e4b2b4a8 29498+ if (!hist_field)
1a6e0f06
JK
29499+ return;
29500+
e4b2b4a8
JK
29501+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
29502+ destroy_hist_field(hist_field->operands[i], level + 1);
1a6e0f06 29503+
e4b2b4a8
JK
29504+ kfree(hist_field->var.name);
29505+ kfree(hist_field->name);
29506+ kfree(hist_field->type);
1a6e0f06 29507+
e4b2b4a8
JK
29508 kfree(hist_field);
29509 }
29510
29511-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29512- unsigned long flags)
29513+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
29514+ struct ftrace_event_field *field,
29515+ unsigned long flags,
29516+ char *var_name)
29517 {
29518 struct hist_field *hist_field;
29519
29520@@ -357,8 +2175,22 @@
29521 if (!hist_field)
29522 return NULL;
29523
29524+ hist_field->hist_data = hist_data;
1a6e0f06 29525+
e4b2b4a8
JK
29526+ if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
29527+ goto out; /* caller will populate */
1a6e0f06 29528+
e4b2b4a8
JK
29529+ if (flags & HIST_FIELD_FL_VAR_REF) {
29530+ hist_field->fn = hist_field_var_ref;
29531+ goto out;
29532+ }
1a6e0f06 29533+
e4b2b4a8
JK
29534 if (flags & HIST_FIELD_FL_HITCOUNT) {
29535 hist_field->fn = hist_field_counter;
29536+ hist_field->size = sizeof(u64);
29537+ hist_field->type = kstrdup("u64", GFP_KERNEL);
29538+ if (!hist_field->type)
29539+ goto free;
29540 goto out;
29541 }
29542
29543@@ -368,7 +2200,31 @@
29544 }
29545
29546 if (flags & HIST_FIELD_FL_LOG2) {
29547+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
29548 hist_field->fn = hist_field_log2;
29549+ hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
29550+ hist_field->size = hist_field->operands[0]->size;
29551+ hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
29552+ if (!hist_field->type)
29553+ goto free;
29554+ goto out;
29555+ }
1a6e0f06 29556+
e4b2b4a8
JK
29557+ if (flags & HIST_FIELD_FL_TIMESTAMP) {
29558+ hist_field->fn = hist_field_timestamp;
29559+ hist_field->size = sizeof(u64);
29560+ hist_field->type = kstrdup("u64", GFP_KERNEL);
29561+ if (!hist_field->type)
29562+ goto free;
29563+ goto out;
29564+ }
1a6e0f06 29565+
e4b2b4a8
JK
29566+ if (flags & HIST_FIELD_FL_CPU) {
29567+ hist_field->fn = hist_field_cpu;
29568+ hist_field->size = sizeof(int);
29569+ hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
29570+ if (!hist_field->type)
29571+ goto free;
29572 goto out;
29573 }
29574
29575@@ -378,6 +2234,11 @@
29576 if (is_string_field(field)) {
29577 flags |= HIST_FIELD_FL_STRING;
29578
29579+ hist_field->size = MAX_FILTER_STR_VAL;
29580+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
29581+ if (!hist_field->type)
29582+ goto free;
29583+
29584 if (field->filter_type == FILTER_STATIC_STRING)
29585 hist_field->fn = hist_field_string;
29586 else if (field->filter_type == FILTER_DYN_STRING)
29587@@ -385,10 +2246,16 @@
29588 else
29589 hist_field->fn = hist_field_pstring;
29590 } else {
29591+ hist_field->size = field->size;
29592+ hist_field->is_signed = field->is_signed;
29593+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
29594+ if (!hist_field->type)
29595+ goto free;
29596+
29597 hist_field->fn = select_value_fn(field->size,
29598 field->is_signed);
29599 if (!hist_field->fn) {
29600- destroy_hist_field(hist_field);
29601+ destroy_hist_field(hist_field, 0);
29602 return NULL;
29603 }
29604 }
29605@@ -396,84 +2263,1636 @@
29606 hist_field->field = field;
29607 hist_field->flags = flags;
29608
29609+ if (var_name) {
29610+ hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
29611+ if (!hist_field->var.name)
29612+ goto free;
29613+ }
29614+
29615 return hist_field;
29616+ free:
29617+ destroy_hist_field(hist_field, 0);
29618+ return NULL;
29619 }
29620
29621 static void destroy_hist_fields(struct hist_trigger_data *hist_data)
29622 {
29623 unsigned int i;
29624
29625- for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
29626+ for (i = 0; i < HIST_FIELDS_MAX; i++) {
29627 if (hist_data->fields[i]) {
29628- destroy_hist_field(hist_data->fields[i]);
29629+ destroy_hist_field(hist_data->fields[i], 0);
29630 hist_data->fields[i] = NULL;
29631 }
29632 }
29633 }
29634
29635-static int create_hitcount_val(struct hist_trigger_data *hist_data)
29636+static int init_var_ref(struct hist_field *ref_field,
29637+ struct hist_field *var_field,
29638+ char *system, char *event_name)
29639 {
29640- hist_data->fields[HITCOUNT_IDX] =
29641- create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
29642- if (!hist_data->fields[HITCOUNT_IDX])
29643- return -ENOMEM;
29644+ int err = 0;
29645
29646- hist_data->n_vals++;
29647+ ref_field->var.idx = var_field->var.idx;
29648+ ref_field->var.hist_data = var_field->hist_data;
29649+ ref_field->size = var_field->size;
29650+ ref_field->is_signed = var_field->is_signed;
29651+ ref_field->flags |= var_field->flags &
29652+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29653
29654- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
29655+ if (system) {
29656+ ref_field->system = kstrdup(system, GFP_KERNEL);
29657+ if (!ref_field->system)
29658+ return -ENOMEM;
29659+ }
1a6e0f06 29660+
e4b2b4a8
JK
29661+ if (event_name) {
29662+ ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
29663+ if (!ref_field->event_name) {
29664+ err = -ENOMEM;
29665+ goto free;
29666+ }
29667+ }
1a6e0f06 29668+
e4b2b4a8
JK
29669+ if (var_field->var.name) {
29670+ ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
29671+ if (!ref_field->name) {
29672+ err = -ENOMEM;
29673+ goto free;
29674+ }
29675+ } else if (var_field->name) {
29676+ ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
29677+ if (!ref_field->name) {
29678+ err = -ENOMEM;
29679+ goto free;
29680+ }
29681+ }
1a6e0f06 29682+
e4b2b4a8
JK
29683+ ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
29684+ if (!ref_field->type) {
29685+ err = -ENOMEM;
29686+ goto free;
29687+ }
29688+ out:
29689+ return err;
29690+ free:
29691+ kfree(ref_field->system);
29692+ kfree(ref_field->event_name);
29693+ kfree(ref_field->name);
29694+
29695+ goto out;
1a6e0f06
JK
29696+}
29697+
e4b2b4a8
JK
29698+static struct hist_field *create_var_ref(struct hist_field *var_field,
29699+ char *system, char *event_name)
1a6e0f06 29700+{
e4b2b4a8
JK
29701+ unsigned long flags = HIST_FIELD_FL_VAR_REF;
29702+ struct hist_field *ref_field;
1a6e0f06 29703+
e4b2b4a8
JK
29704+ ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
29705+ if (ref_field) {
29706+ if (init_var_ref(ref_field, var_field, system, event_name)) {
29707+ destroy_hist_field(ref_field, 0);
29708+ return NULL;
29709+ }
29710+ }
1a6e0f06 29711+
e4b2b4a8 29712+ return ref_field;
1a6e0f06
JK
29713+}
29714+
e4b2b4a8 29715+static bool is_var_ref(char *var_name)
1a6e0f06 29716+{
e4b2b4a8
JK
29717+ if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
29718+ return false;
1a6e0f06 29719+
e4b2b4a8 29720+ return true;
1a6e0f06
JK
29721+}
29722+
e4b2b4a8
JK
29723+static char *field_name_from_var(struct hist_trigger_data *hist_data,
29724+ char *var_name)
1a6e0f06 29725+{
e4b2b4a8
JK
29726+ char *name, *field;
29727+ unsigned int i;
1a6e0f06 29728+
e4b2b4a8
JK
29729+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
29730+ name = hist_data->attrs->var_defs.name[i];
1a6e0f06 29731+
e4b2b4a8
JK
29732+ if (strcmp(var_name, name) == 0) {
29733+ field = hist_data->attrs->var_defs.expr[i];
29734+ if (contains_operator(field) || is_var_ref(field))
29735+ continue;
29736+ return field;
1a6e0f06
JK
29737+ }
29738+ }
e4b2b4a8
JK
29739+
29740+ return NULL;
1a6e0f06
JK
29741+}
29742+
e4b2b4a8
JK
29743+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
29744+ char *system, char *event_name,
29745+ char *var_name)
29746+{
29747+ struct trace_event_call *call;
29748+
29749+ if (system && event_name) {
29750+ call = hist_data->event_file->event_call;
29751+
29752+ if (strcmp(system, call->class->system) != 0)
29753+ return NULL;
29754+
29755+ if (strcmp(event_name, trace_event_name(call)) != 0)
29756+ return NULL;
29757+ }
29758+
29759+ if (!!system != !!event_name)
29760+ return NULL;
29761+
29762+ if (!is_var_ref(var_name))
29763+ return NULL;
29764+
29765+ var_name++;
29766+
29767+ return field_name_from_var(hist_data, var_name);
1a6e0f06 29768+}
e4b2b4a8
JK
29769+
29770+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
29771+ char *system, char *event_name,
29772+ char *var_name)
1a6e0f06 29773+{
e4b2b4a8
JK
29774+ struct hist_field *var_field = NULL, *ref_field = NULL;
29775+
29776+ if (!is_var_ref(var_name))
29777+ return NULL;
29778+
29779+ var_name++;
29780+
29781+ var_field = find_event_var(hist_data, system, event_name, var_name);
29782+ if (var_field)
29783+ ref_field = create_var_ref(var_field, system, event_name);
29784+
29785+ if (!ref_field)
29786+ hist_err_event("Couldn't find variable: $",
29787+ system, event_name, var_name);
29788+
29789+ return ref_field;
29790+}
29791+
29792+static struct ftrace_event_field *
29793+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
29794+ char *field_str, unsigned long *flags)
29795+{
29796+ struct ftrace_event_field *field = NULL;
29797+ char *field_name, *modifier, *str;
29798+
29799+ modifier = str = kstrdup(field_str, GFP_KERNEL);
29800+ if (!modifier)
29801+ return ERR_PTR(-ENOMEM);
1a6e0f06 29802+
e4b2b4a8
JK
29803+ field_name = strsep(&modifier, ".");
29804+ if (modifier) {
29805+ if (strcmp(modifier, "hex") == 0)
29806+ *flags |= HIST_FIELD_FL_HEX;
29807+ else if (strcmp(modifier, "sym") == 0)
29808+ *flags |= HIST_FIELD_FL_SYM;
29809+ else if (strcmp(modifier, "sym-offset") == 0)
29810+ *flags |= HIST_FIELD_FL_SYM_OFFSET;
29811+ else if ((strcmp(modifier, "execname") == 0) &&
29812+ (strcmp(field_name, "common_pid") == 0))
29813+ *flags |= HIST_FIELD_FL_EXECNAME;
29814+ else if (strcmp(modifier, "syscall") == 0)
29815+ *flags |= HIST_FIELD_FL_SYSCALL;
29816+ else if (strcmp(modifier, "log2") == 0)
29817+ *flags |= HIST_FIELD_FL_LOG2;
29818+ else if (strcmp(modifier, "usecs") == 0)
29819+ *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
29820+ else {
29821+ hist_err("Invalid field modifier: ", modifier);
29822+ field = ERR_PTR(-EINVAL);
29823+ goto out;
1a6e0f06 29824+ }
e4b2b4a8 29825+ }
1a6e0f06 29826+
e4b2b4a8
JK
29827+ if (strcmp(field_name, "common_timestamp") == 0) {
29828+ *flags |= HIST_FIELD_FL_TIMESTAMP;
29829+ hist_data->enable_timestamps = true;
29830+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29831+ hist_data->attrs->ts_in_usecs = true;
29832+ } else if (strcmp(field_name, "cpu") == 0)
29833+ *flags |= HIST_FIELD_FL_CPU;
29834+ else {
29835+ field = trace_find_event_field(file->event_call, field_name);
29836+ if (!field || !field->size) {
29837+ hist_err("Couldn't find field: ", field_name);
29838+ field = ERR_PTR(-EINVAL);
29839+ goto out;
29840+ }
29841+ }
29842+ out:
29843+ kfree(str);
1a6e0f06 29844+
e4b2b4a8
JK
29845+ return field;
29846+}
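For reference, the ".modifier" handling in parse_field() above splits the token on its first dot with strsep() and maps the suffix onto a flag bit. A minimal user-space sketch of that tokenization, with made-up FL_* names standing in for the patch's HIST_FIELD_FL_* constants:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FL_HEX      (1 << 0)	/* stand-ins for HIST_FIELD_FL_* */
#define FL_EXECNAME (1 << 1)

int main(void)
{
	char *str = strdup("common_pid.execname");
	char *modifier = str;
	char *field_name = strsep(&modifier, ".");
	unsigned long flags = 0;

	if (modifier) {
		if (strcmp(modifier, "hex") == 0)
			flags |= FL_HEX;
		else if (strcmp(modifier, "execname") == 0 &&
			 strcmp(field_name, "common_pid") == 0)
			flags |= FL_EXECNAME;
	}

	/* prints: field=common_pid modifier=execname flags=0x2 */
	printf("field=%s modifier=%s flags=%#lx\n",
	       field_name, modifier ? modifier : "(none)", flags);
	free(str);
	return 0;
}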
1a6e0f06 29847+
e4b2b4a8
JK
29848+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
29849+ struct hist_field *var_ref,
29850+ char *var_name)
1a6e0f06 29851+{
e4b2b4a8
JK
29852+ struct hist_field *alias = NULL;
29853+ unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
1a6e0f06 29854+
e4b2b4a8
JK
29855+ alias = create_hist_field(hist_data, NULL, flags, var_name);
29856+ if (!alias)
29857+ return NULL;
1a6e0f06 29858+
e4b2b4a8
JK
29859+ alias->fn = var_ref->fn;
29860+ alias->operands[0] = var_ref;
1a6e0f06 29861+
e4b2b4a8
JK
29862+ if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
29863+ destroy_hist_field(alias, 0);
29864+ return NULL;
29865+ }
1a6e0f06 29866+
e4b2b4a8 29867+ return alias;
1a6e0f06
JK
29868+}
29869+
e4b2b4a8
JK
29870+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
29871+ struct trace_event_file *file, char *str,
29872+ unsigned long *flags, char *var_name)
29873+{
29874+ char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
29875+ struct ftrace_event_field *field = NULL;
29876+ struct hist_field *hist_field = NULL;
29877+ int ret = 0;
1a6e0f06 29878+
e4b2b4a8
JK
29879+ s = strchr(str, '.');
29880+ if (s) {
29881+ s = strchr(++s, '.');
29882+ if (s) {
29883+ ref_system = strsep(&str, ".");
29884+ if (!str) {
29885+ ret = -EINVAL;
29886+ goto out;
29887+ }
29888+ ref_event = strsep(&str, ".");
29889+ if (!str) {
29890+ ret = -EINVAL;
29891+ goto out;
29892+ }
29893+ ref_var = str;
29894+ }
29895+ }
1a6e0f06 29896+
e4b2b4a8
JK
29897+ s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
29898+ if (!s) {
29899+ hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
29900+ if (hist_field) {
29901+ hist_data->var_refs[hist_data->n_var_refs] = hist_field;
29902+ hist_field->var_ref_idx = hist_data->n_var_refs++;
29903+ if (var_name) {
29904+ hist_field = create_alias(hist_data, hist_field, var_name);
29905+ if (!hist_field) {
29906+ ret = -ENOMEM;
29907+ goto out;
29908+ }
29909+ }
29910+ return hist_field;
29911+ }
29912+ } else
29913+ str = s;
29914+
29915+ field = parse_field(hist_data, file, str, flags);
29916+ if (IS_ERR(field)) {
29917+ ret = PTR_ERR(field);
29918+ goto out;
29919+ }
29920+
29921+ hist_field = create_hist_field(hist_data, field, *flags, var_name);
29922+ if (!hist_field) {
29923+ ret = -ENOMEM;
29924+ goto out;
29925+ }
29926+
29927+ return hist_field;
29928+ out:
29929+ return ERR_PTR(ret);
29930+}
29931+
29932+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
29933+ struct trace_event_file *file,
29934+ char *str, unsigned long flags,
29935+ char *var_name, unsigned int level);
29936+
29937+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
29938+ struct trace_event_file *file,
29939+ char *str, unsigned long flags,
29940+ char *var_name, unsigned int level)
1a6e0f06 29941+{
e4b2b4a8
JK
29942+ struct hist_field *operand1, *expr = NULL;
29943+ unsigned long operand_flags;
29944+ int ret = 0;
29945+ char *s;
29946+
29947+ // we support only -(xxx) i.e. explicit parens required
29948+
29949+ if (level > 3) {
29950+ hist_err("Too many subexpressions (3 max): ", str);
29951+ ret = -EINVAL;
29952+ goto free;
1a6e0f06 29953+ }
e4b2b4a8
JK
29954+
29955+ str++; // skip leading '-'
29956+
29957+ s = strchr(str, '(');
29958+ if (s)
29959+ str++;
29960+ else {
29961+ ret = -EINVAL;
29962+ goto free;
29963+ }
29964+
29965+ s = strrchr(str, ')');
29966+ if (s)
29967+ *s = '\0';
29968+ else {
29969+ ret = -EINVAL; // no closing ')'
29970+ goto free;
29971+ }
29972+
29973+ flags |= HIST_FIELD_FL_EXPR;
29974+ expr = create_hist_field(hist_data, NULL, flags, var_name);
29975+ if (!expr) {
29976+ ret = -ENOMEM;
29977+ goto free;
29978+ }
29979+
29980+ operand_flags = 0;
29981+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
29982+ if (IS_ERR(operand1)) {
29983+ ret = PTR_ERR(operand1);
29984+ goto free;
29985+ }
29986+
29987+ expr->flags |= operand1->flags &
29988+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29989+ expr->fn = hist_field_unary_minus;
29990+ expr->operands[0] = operand1;
29991+ expr->operator = FIELD_OP_UNARY_MINUS;
29992+ expr->name = expr_str(expr, 0);
29993+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
29994+ if (!expr->type) {
29995+ ret = -ENOMEM;
29996+ goto free;
29997+ }
29998+
29999+ return expr;
30000+ free:
30001+ destroy_hist_field(expr, 0);
30002+ return ERR_PTR(ret);
1a6e0f06 30003+}
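parse_unary() above only accepts a negated sub-expression in the explicit form "-(expr)": it skips the '-', requires a '(' and truncates at the last ')'. A standalone sketch of that unwrapping step, assuming (as the trigger syntax requires) that the '(' directly follows the '-'; the input string is an arbitrary example:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "-($lat-$ts0)";
	char *str = buf;

	if (str[0] != '-')
		return 1;
	str++;					/* skip leading '-' */

	if (!strchr(str, '('))			/* explicit parens required */
		return 1;
	str++;					/* skip '(' */

	char *paren = strrchr(str, ')');
	if (!paren)				/* no closing ')' */
		return 1;
	*paren = '\0';

	printf("inner expression: %s\n", str);	/* "$lat-$ts0" */
	return 0;
}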
1a6e0f06 30004+
e4b2b4a8
JK
30005+static int check_expr_operands(struct hist_field *operand1,
30006+ struct hist_field *operand2)
30007+{
30008+ unsigned long operand1_flags = operand1->flags;
30009+ unsigned long operand2_flags = operand2->flags;
1a6e0f06 30010+
e4b2b4a8
JK
30011+ if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
30012+ (operand1_flags & HIST_FIELD_FL_ALIAS)) {
30013+ struct hist_field *var;
30014+
30015+ var = find_var_field(operand1->var.hist_data, operand1->name);
30016+ if (!var)
30017+ return -EINVAL;
30018+ operand1_flags = var->flags;
30019+ }
30020+
30021+ if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
30022+ (operand2_flags & HIST_FIELD_FL_ALIAS)) {
30023+ struct hist_field *var;
30024+
30025+ var = find_var_field(operand2->var.hist_data, operand2->name);
30026+ if (!var)
30027+ return -EINVAL;
30028+ operand2_flags = var->flags;
30029+ }
30030+
30031+ if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
30032+ (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
30033+ hist_err("Timestamp units in expression don't match", NULL);
30034 return -EINVAL;
30035+ }
1a6e0f06 30036
e4b2b4a8 30037 return 0;
1a6e0f06
JK
30038 }
30039
e4b2b4a8
JK
30040-static int create_val_field(struct hist_trigger_data *hist_data,
30041- unsigned int val_idx,
30042- struct trace_event_file *file,
30043- char *field_str)
30044+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30045+ struct trace_event_file *file,
30046+ char *str, unsigned long flags,
30047+ char *var_name, unsigned int level)
30048 {
30049- struct ftrace_event_field *field = NULL;
30050- unsigned long flags = 0;
30051- char *field_name;
30052+ struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
30053+ unsigned long operand_flags;
30054+ int field_op, ret = -EINVAL;
30055+ char *sep, *operand1_str;
30056+
30057+ if (level > 3) {
30058+ hist_err("Too many subexpressions (3 max): ", str);
30059+ return ERR_PTR(-EINVAL);
30060+ }
30061+
30062+ field_op = contains_operator(str);
30063+
30064+ if (field_op == FIELD_OP_NONE)
30065+ return parse_atom(hist_data, file, str, &flags, var_name);
30066+
30067+ if (field_op == FIELD_OP_UNARY_MINUS)
30068+ return parse_unary(hist_data, file, str, flags, var_name, ++level);
30069+
30070+ switch (field_op) {
30071+ case FIELD_OP_MINUS:
30072+ sep = "-";
30073+ break;
30074+ case FIELD_OP_PLUS:
30075+ sep = "+";
30076+ break;
30077+ default:
30078+ goto free;
30079+ }
30080+
30081+ operand1_str = strsep(&str, sep);
30082+ if (!operand1_str || !str)
30083+ goto free;
30084+
30085+ operand_flags = 0;
30086+ operand1 = parse_atom(hist_data, file, operand1_str,
30087+ &operand_flags, NULL);
30088+ if (IS_ERR(operand1)) {
30089+ ret = PTR_ERR(operand1);
30090+ operand1 = NULL;
30091+ goto free;
30092+ }
30093+
30094+ // rest of string could be another expression e.g. b+c in a+b+c
30095+ operand_flags = 0;
30096+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30097+ if (IS_ERR(operand2)) {
30098+ ret = PTR_ERR(operand2);
30099+ operand2 = NULL;
30100+ goto free;
30101+ }
30102+
30103+ ret = check_expr_operands(operand1, operand2);
30104+ if (ret)
30105+ goto free;
30106+
30107+ flags |= HIST_FIELD_FL_EXPR;
30108+
30109+ flags |= operand1->flags &
30110+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
1a6e0f06 30111+
e4b2b4a8
JK
30112+ expr = create_hist_field(hist_data, NULL, flags, var_name);
30113+ if (!expr) {
30114+ ret = -ENOMEM;
30115+ goto free;
30116+ }
1a6e0f06 30117+
e4b2b4a8
JK
30118+ operand1->read_once = true;
30119+ operand2->read_once = true;
30120+
30121+ expr->operands[0] = operand1;
30122+ expr->operands[1] = operand2;
30123+ expr->operator = field_op;
30124+ expr->name = expr_str(expr, 0);
30125+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
30126+ if (!expr->type) {
30127+ ret = -ENOMEM;
30128+ goto free;
30129+ }
1a6e0f06 30130+
e4b2b4a8
JK
30131+ switch (field_op) {
30132+ case FIELD_OP_MINUS:
30133+ expr->fn = hist_field_minus;
30134+ break;
30135+ case FIELD_OP_PLUS:
30136+ expr->fn = hist_field_plus;
30137+ break;
30138+ default:
30139+ ret = -EINVAL;
30140+ goto free;
30141+ }
30142+
30143+ return expr;
30144+ free:
30145+ destroy_hist_field(operand1, 0);
30146+ destroy_hist_field(operand2, 0);
30147+ destroy_hist_field(expr, 0);
30148+
30149+ return ERR_PTR(ret);
30150+}
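parse_expr() above resolves a binary '+' or '-' by cutting off the first operand with strsep() and re-parsing everything after the operator as one sub-expression, so a chain such as a-b+c groups as a-(b+c), with nesting capped at three levels. A rough numeric analogue of that right-hand recursion (plain integers stand in for event fields and variables; not the patch's code):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long eval(char *str, unsigned int level)
{
	if (level > 3 || !str)			/* give up past the 3-level cap */
		return 0;

	char *sep = strpbrk(str, "+-");
	if (!sep)
		return strtol(str, NULL, 0);	/* atom */

	int op = *sep;
	char *left = strsep(&str, "+-");	/* first operand */
	long lhs = strtol(left, NULL, 0);
	long rhs = eval(str, level + 1);	/* remainder is one sub-expression */

	return op == '-' ? lhs - rhs : lhs + rhs;
}

int main(void)
{
	char buf[] = "10-3+2";
	printf("%ld\n", eval(buf, 0));		/* 10 - (3 + 2) = 5 */
	return 0;
}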
30151+
30152+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
30153+ struct trace_event_file *file)
1a6e0f06 30154+{
e4b2b4a8
JK
30155+ struct event_trigger_data *test;
30156+
30157+ list_for_each_entry_rcu(test, &file->triggers, list) {
30158+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30159+ if (test->private_data == hist_data)
30160+ return test->filter_str;
30161+ }
30162+ }
30163+
30164+ return NULL;
1a6e0f06
JK
30165+}
30166+
e4b2b4a8
JK
30167+static struct event_command trigger_hist_cmd;
30168+static int event_hist_trigger_func(struct event_command *cmd_ops,
30169+ struct trace_event_file *file,
30170+ char *glob, char *cmd, char *param);
30171+
30172+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
30173+ struct hist_trigger_data *hist_data,
30174+ unsigned int n_keys)
1a6e0f06 30175+{
e4b2b4a8
JK
30176+ struct hist_field *target_hist_field, *hist_field;
30177+ unsigned int n, i, j;
30178+
30179+ if (hist_data->n_fields - hist_data->n_vals != n_keys)
30180+ return false;
30181+
30182+ i = hist_data->n_vals;
30183+ j = target_hist_data->n_vals;
30184+
30185+ for (n = 0; n < n_keys; n++) {
30186+ hist_field = hist_data->fields[i + n];
30187+ target_hist_field = target_hist_data->fields[j + n];
30188+
30189+ if (strcmp(hist_field->type, target_hist_field->type) != 0)
30190+ return false;
30191+ if (hist_field->size != target_hist_field->size)
30192+ return false;
30193+ if (hist_field->is_signed != target_hist_field->is_signed)
30194+ return false;
30195+ }
30196+
30197+ return true;
1a6e0f06
JK
30198+}
30199+
e4b2b4a8
JK
30200+static struct hist_trigger_data *
30201+find_compatible_hist(struct hist_trigger_data *target_hist_data,
30202+ struct trace_event_file *file)
1a6e0f06 30203+{
e4b2b4a8
JK
30204+ struct hist_trigger_data *hist_data;
30205+ struct event_trigger_data *test;
30206+ unsigned int n_keys;
30207+
30208+ n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
30209+
30210+ list_for_each_entry_rcu(test, &file->triggers, list) {
30211+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30212+ hist_data = test->private_data;
30213+
30214+ if (compatible_keys(target_hist_data, hist_data, n_keys))
30215+ return hist_data;
30216+ }
30217+ }
30218+
30219+ return NULL;
1a6e0f06 30220+}
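find_compatible_hist() reuses compatible_keys() above: another histogram on the same event can host an auto-created field variable only if every key matches the target's keys in type string, size and signedness. A small sketch of that comparison with invented key descriptors:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct key_desc {
	const char *type;
	unsigned int size;
	bool is_signed;
};

static bool compatible(const struct key_desc *a, const struct key_desc *b,
		       unsigned int n_keys)
{
	for (unsigned int i = 0; i < n_keys; i++) {
		if (strcmp(a[i].type, b[i].type) != 0)
			return false;
		if (a[i].size != b[i].size)
			return false;
		if (a[i].is_signed != b[i].is_signed)
			return false;
	}
	return true;
}

int main(void)
{
	struct key_desc target[] = { { "pid_t", 4, true } };
	struct key_desc other[]  = { { "pid_t", 4, true } };

	printf("%s\n", compatible(target, other, 1) ?
	       "compatible" : "not compatible");
	return 0;
}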
1a6e0f06 30221+
e4b2b4a8
JK
30222+static struct trace_event_file *event_file(struct trace_array *tr,
30223+ char *system, char *event_name)
30224+{
30225+ struct trace_event_file *file;
30226+
30227+ file = find_event_file(tr, system, event_name);
30228+ if (!file)
30229+ return ERR_PTR(-EINVAL);
30230+
30231+ return file;
30232+}
30233+
30234+static struct hist_field *
30235+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
30236+ char *system, char *event_name, char *field_name)
30237+{
30238+ struct hist_field *event_var;
30239+ char *synthetic_name;
30240+
30241+ synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30242+ if (!synthetic_name)
30243+ return ERR_PTR(-ENOMEM);
30244+
30245+ strcpy(synthetic_name, "synthetic_");
30246+ strcat(synthetic_name, field_name);
30247+
30248+ event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
30249+
30250+ kfree(synthetic_name);
30251+
30252+ return event_var;
30253+}
1a6e0f06
JK
30254+
30255+/**
e4b2b4a8
JK
30256+ * create_field_var_hist - Automatically create a histogram and var for a field
30257+ * @target_hist_data: The target hist trigger
30258+ * @subsys_name: Optional subsystem name
30259+ * @event_name: Optional event name
30260+ * @field_name: The name of the field (and the resulting variable)
1a6e0f06 30261+ *
e4b2b4a8
JK
30262+ * Hist trigger actions fetch data from variables, not directly from
30263+ * events. However, for convenience, users are allowed to directly
30264+ * specify an event field in an action, which will be automatically
30265+ * converted into a variable on their behalf.
30266+
30267+ * If a user specifies a field on an event that isn't the event the
30268+ * histogram currently being defined (the target event histogram), the
30269+ * only way that can be accomplished is if a new hist trigger is
30270+ * created and the field variable defined on that.
1a6e0f06 30271+ *
e4b2b4a8
JK
30272+ * This function creates a new histogram compatible with the target
30273+ * event (meaning a histogram with the same key as the target
30274+ * histogram), and creates a variable for the specified field, but
30275+ * with 'synthetic_' prepended to the variable name in order to avoid
30276+ * collision with normal field variables.
30277+ *
30278+ * Return: The variable created for the field.
1a6e0f06 30279+ */
e4b2b4a8
JK
30280+static struct hist_field *
30281+create_field_var_hist(struct hist_trigger_data *target_hist_data,
30282+ char *subsys_name, char *event_name, char *field_name)
30283+{
30284+ struct trace_array *tr = target_hist_data->event_file->tr;
30285+ struct hist_field *event_var = ERR_PTR(-EINVAL);
30286+ struct hist_trigger_data *hist_data;
30287+ unsigned int i, n, first = true;
30288+ struct field_var_hist *var_hist;
30289+ struct trace_event_file *file;
30290+ struct hist_field *key_field;
30291+ char *saved_filter;
30292+ char *cmd;
30293+ int ret;
1a6e0f06 30294+
e4b2b4a8
JK
30295+ if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
30296+ hist_err_event("onmatch: Too many field variables defined: ",
30297+ subsys_name, event_name, field_name);
30298+ return ERR_PTR(-EINVAL);
30299+ }
1a6e0f06 30300+
e4b2b4a8 30301+ file = event_file(tr, subsys_name, event_name);
1a6e0f06 30302+
e4b2b4a8
JK
30303+ if (IS_ERR(file)) {
30304+ hist_err_event("onmatch: Event file not found: ",
30305+ subsys_name, event_name, field_name);
30306+ ret = PTR_ERR(file);
30307+ return ERR_PTR(ret);
1a6e0f06
JK
30308+ }
30309+
e4b2b4a8
JK
30310+ /*
30311+ * Look for a histogram compatible with target. We'll use the
30312+ * found histogram specification to create a new matching
30313+ * histogram with our variable on it. target_hist_data is not
30314+ * yet a registered histogram so we can't use that.
30315+ */
30316+ hist_data = find_compatible_hist(target_hist_data, file);
30317+ if (!hist_data) {
30318+ hist_err_event("onmatch: Matching event histogram not found: ",
30319+ subsys_name, event_name, field_name);
30320+ return ERR_PTR(-EINVAL);
1a6e0f06 30321+ }
1a6e0f06 30322+
e4b2b4a8
JK
30323+ /* See if a synthetic field variable has already been created */
30324+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30325+ event_name, field_name);
30326+ if (!IS_ERR_OR_NULL(event_var))
30327+ return event_var;
1a6e0f06 30328+
e4b2b4a8
JK
30329+ var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
30330+ if (!var_hist)
30331+ return ERR_PTR(-ENOMEM);
1a6e0f06 30332+
e4b2b4a8
JK
30333+ cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30334+ if (!cmd) {
30335+ kfree(var_hist);
30336+ return ERR_PTR(-ENOMEM);
30337+ }
30338+
30339+ /* Use the same keys as the compatible histogram */
30340+ strcat(cmd, "keys=");
30341+
30342+ for_each_hist_key_field(i, hist_data) {
30343+ key_field = hist_data->fields[i];
30344+ if (!first)
30345+ strcat(cmd, ",");
30346+ strcat(cmd, key_field->field->name);
30347+ first = false;
30348+ }
30349+
30350+ /* Create the synthetic field variable specification */
30351+ strcat(cmd, ":synthetic_");
30352+ strcat(cmd, field_name);
30353+ strcat(cmd, "=");
30354+ strcat(cmd, field_name);
30355+
30356+ /* Use the same filter as the compatible histogram */
30357+ saved_filter = find_trigger_filter(hist_data, file);
30358+ if (saved_filter) {
30359+ strcat(cmd, " if ");
30360+ strcat(cmd, saved_filter);
30361+ }
30362+
30363+ var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
30364+ if (!var_hist->cmd) {
30365+ kfree(cmd);
30366+ kfree(var_hist);
30367+ return ERR_PTR(-ENOMEM);
30368+ }
30369+
30370+ /* Save the compatible histogram information */
30371+ var_hist->hist_data = hist_data;
30372+
30373+ /* Create the new histogram with our variable */
30374+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
30375+ "", "hist", cmd);
30376+ if (ret) {
30377+ kfree(cmd);
30378+ kfree(var_hist->cmd);
30379+ kfree(var_hist);
30380+ hist_err_event("onmatch: Couldn't create histogram for field: ",
30381+ subsys_name, event_name, field_name);
30382+ return ERR_PTR(ret);
30383+ }
30384+
30385+ kfree(cmd);
30386+
30387+ /* If we can't find the variable, something went wrong */
30388+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30389+ event_name, field_name);
30390+ if (IS_ERR_OR_NULL(event_var)) {
30391+ kfree(var_hist->cmd);
30392+ kfree(var_hist);
30393+ hist_err_event("onmatch: Couldn't find synthetic variable: ",
30394+ subsys_name, event_name, field_name);
30395+ return ERR_PTR(-EINVAL);
1a6e0f06 30396+ }
e4b2b4a8
JK
30397+
30398+ n = target_hist_data->n_field_var_hists;
30399+ target_hist_data->field_var_hists[n] = var_hist;
30400+ target_hist_data->n_field_var_hists++;
30401+
30402+ return event_var;
1a6e0f06
JK
30403+}
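The command string assembled by create_field_var_hist() above has the shape "keys=<target keys>:synthetic_<field>=<field>[ if <filter>]" and is handed to event_hist_trigger_func() as a brand-new hist trigger. A user-space sketch of that string building, with made-up key, field and filter values:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *keys[] = { "pid", "prio" };		/* compatible hist's keys */
	const char *field_name = "ts0";			/* field to expose as a var */
	const char *saved_filter = "prio < 100";	/* compatible hist's filter */
	char cmd[256] = "keys=";

	for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		if (i)
			strcat(cmd, ",");
		strcat(cmd, keys[i]);
	}

	strcat(cmd, ":synthetic_");
	strcat(cmd, field_name);
	strcat(cmd, "=");
	strcat(cmd, field_name);

	if (saved_filter) {
		strcat(cmd, " if ");
		strcat(cmd, saved_filter);
	}

	/* e.g. "keys=pid,prio:synthetic_ts0=ts0 if prio < 100" */
	printf("%s\n", cmd);
	return 0;
}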
30404+
e4b2b4a8
JK
30405+static struct hist_field *
30406+find_target_event_var(struct hist_trigger_data *hist_data,
30407+ char *subsys_name, char *event_name, char *var_name)
1a6e0f06 30408+{
e4b2b4a8
JK
30409+ struct trace_event_file *file = hist_data->event_file;
30410+ struct hist_field *hist_field = NULL;
1a6e0f06 30411+
e4b2b4a8
JK
30412+ if (subsys_name) {
30413+ struct trace_event_call *call;
1a6e0f06 30414+
e4b2b4a8
JK
30415+ if (!event_name)
30416+ return NULL;
1a6e0f06 30417+
e4b2b4a8 30418+ call = file->event_call;
1a6e0f06 30419+
e4b2b4a8
JK
30420+ if (strcmp(subsys_name, call->class->system) != 0)
30421+ return NULL;
1a6e0f06 30422+
e4b2b4a8
JK
30423+ if (strcmp(event_name, trace_event_name(call)) != 0)
30424+ return NULL;
30425+ }
30426+
30427+ hist_field = find_var_field(hist_data, var_name);
30428+
30429+ return hist_field;
30430+}
30431+
30432+static inline void __update_field_vars(struct tracing_map_elt *elt,
30433+ struct ring_buffer_event *rbe,
30434+ void *rec,
30435+ struct field_var **field_vars,
30436+ unsigned int n_field_vars,
30437+ unsigned int field_var_str_start)
30438+{
30439+ struct hist_elt_data *elt_data = elt->private_data;
30440+ unsigned int i, j, var_idx;
30441+ u64 var_val;
1a6e0f06 30442+
e4b2b4a8
JK
30443+ for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
30444+ struct field_var *field_var = field_vars[i];
30445+ struct hist_field *var = field_var->var;
30446+ struct hist_field *val = field_var->val;
1a6e0f06 30447+
e4b2b4a8
JK
30448+ var_val = val->fn(val, elt, rbe, rec);
30449+ var_idx = var->var.idx;
1a6e0f06 30450+
e4b2b4a8
JK
30451+ if (val->flags & HIST_FIELD_FL_STRING) {
30452+ char *str = elt_data->field_var_str[j++];
30453+ char *val_str = (char *)(uintptr_t)var_val;
1a6e0f06 30454+
e4b2b4a8
JK
30455+ strscpy(str, val_str, STR_VAR_LEN_MAX);
30456+ var_val = (u64)(uintptr_t)str;
1a6e0f06 30457+ }
e4b2b4a8 30458+ tracing_map_set_var(elt, var_idx, var_val);
1a6e0f06 30459+ }
1a6e0f06
JK
30460+}
30461+
e4b2b4a8
JK
30462+static void update_field_vars(struct hist_trigger_data *hist_data,
30463+ struct tracing_map_elt *elt,
30464+ struct ring_buffer_event *rbe,
30465+ void *rec)
1a6e0f06 30466+{
e4b2b4a8
JK
30467+ __update_field_vars(elt, rbe, rec, hist_data->field_vars,
30468+ hist_data->n_field_vars, 0);
30469+}
1a6e0f06 30470+
e4b2b4a8
JK
30471+static void update_max_vars(struct hist_trigger_data *hist_data,
30472+ struct tracing_map_elt *elt,
30473+ struct ring_buffer_event *rbe,
30474+ void *rec)
30475+{
30476+ __update_field_vars(elt, rbe, rec, hist_data->max_vars,
30477+ hist_data->n_max_vars, hist_data->n_field_var_str);
1a6e0f06
JK
30478+}
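In __update_field_vars() above, a string value cannot live directly in the u64 variable slot, so it is copied into a per-entry buffer and the buffer's address is stored instead. A minimal user-space illustration of that trick (strncpy stands in for the kernel's strscpy; buffer size and field value are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STR_LEN_MAX 32

int main(void)
{
	char slot_buf[STR_LEN_MAX];		/* like elt_data->field_var_str[j] */
	const char *event_str = "cyclictest";	/* value read from the record */
	uint64_t var_val;

	/* bounded copy, then park the buffer address in the u64 variable */
	strncpy(slot_buf, event_str, STR_LEN_MAX - 1);
	slot_buf[STR_LEN_MAX - 1] = '\0';
	var_val = (uint64_t)(uintptr_t)slot_buf;

	printf("%s\n", (const char *)(uintptr_t)var_val);
	return 0;
}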
30479+
e4b2b4a8
JK
30480+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
30481+ struct trace_event_file *file,
30482+ char *name, int size, const char *type)
30483+{
30484+ struct hist_field *var;
30485+ int idx;
1a6e0f06 30486+
e4b2b4a8
JK
30487+ if (find_var(hist_data, file, name) && !hist_data->remove) {
30488+ var = ERR_PTR(-EINVAL);
30489+ goto out;
30490+ }
1a6e0f06 30491+
e4b2b4a8
JK
30492+ var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30493+ if (!var) {
30494+ var = ERR_PTR(-ENOMEM);
30495+ goto out;
30496+ }
1a6e0f06 30497+
e4b2b4a8
JK
30498+ idx = tracing_map_add_var(hist_data->map);
30499+ if (idx < 0) {
30500+ kfree(var);
30501+ var = ERR_PTR(-EINVAL);
30502+ goto out;
30503+ }
1a6e0f06 30504+
e4b2b4a8
JK
30505+ var->flags = HIST_FIELD_FL_VAR;
30506+ var->var.idx = idx;
30507+ var->var.hist_data = var->hist_data = hist_data;
30508+ var->size = size;
30509+ var->var.name = kstrdup(name, GFP_KERNEL);
30510+ var->type = kstrdup(type, GFP_KERNEL);
30511+ if (!var->var.name || !var->type) {
30512+ kfree(var->var.name);
30513+ kfree(var->type);
30514+ kfree(var);
30515+ var = ERR_PTR(-ENOMEM);
30516+ }
30517+ out:
30518+ return var;
30519+}
1a6e0f06 30520+
e4b2b4a8
JK
30521+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
30522+ struct trace_event_file *file,
30523+ char *field_name)
1a6e0f06 30524+{
e4b2b4a8
JK
30525+ struct hist_field *val = NULL, *var = NULL;
30526+ unsigned long flags = HIST_FIELD_FL_VAR;
30527+ struct field_var *field_var;
30528 int ret = 0;
30529
30530- if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
30531+ if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
30532+ hist_err("Too many field variables defined: ", field_name);
30533+ ret = -EINVAL;
30534+ goto err;
30535+ }
30536+
30537+ val = parse_atom(hist_data, file, field_name, &flags, NULL);
30538+ if (IS_ERR(val)) {
30539+ hist_err("Couldn't parse field variable: ", field_name);
30540+ ret = PTR_ERR(val);
30541+ goto err;
30542+ }
30543+
30544+ var = create_var(hist_data, file, field_name, val->size, val->type);
30545+ if (IS_ERR(var)) {
30546+ hist_err("Couldn't create or find variable: ", field_name);
30547+ kfree(val);
30548+ ret = PTR_ERR(var);
30549+ goto err;
30550+ }
30551+
30552+ field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
30553+ if (!field_var) {
30554+ kfree(val);
30555+ kfree(var);
30556+ ret = -ENOMEM;
30557+ goto err;
30558+ }
30559+
30560+ field_var->var = var;
30561+ field_var->val = val;
30562+ out:
30563+ return field_var;
30564+ err:
30565+ field_var = ERR_PTR(ret);
30566+ goto out;
1a6e0f06
JK
30567+}
30568+
e4b2b4a8
JK
30569+/**
30570+ * create_target_field_var - Automatically create a variable for a field
30571+ * @target_hist_data: The target hist trigger
30572+ * @subsys_name: Optional subsystem name
30573+ * @event_name: Optional event name
30574+ * @var_name: The name of the field (and the resulting variable)
30575+ *
30576+ * Hist trigger actions fetch data from variables, not directly from
30577+ * events. However, for convenience, users are allowed to directly
30578+ * specify an event field in an action, which will be automatically
30579+ * converted into a variable on their behalf.
30580+
30581+ * This function creates a field variable with the name var_name on
30582+ * the hist trigger currently being defined on the target event. If
30583+ * subsys_name and event_name are specified, this function simply
30584+ * verifies that they do in fact match the target event subsystem and
30585+ * event name.
30586+ *
30587+ * Return: The variable created for the field.
1a6e0f06 30588+ */
e4b2b4a8
JK
30589+static struct field_var *
30590+create_target_field_var(struct hist_trigger_data *target_hist_data,
30591+ char *subsys_name, char *event_name, char *var_name)
1a6e0f06 30592+{
e4b2b4a8 30593+ struct trace_event_file *file = target_hist_data->event_file;
1a6e0f06 30594+
e4b2b4a8
JK
30595+ if (subsys_name) {
30596+ struct trace_event_call *call;
1a6e0f06 30597+
e4b2b4a8
JK
30598+ if (!event_name)
30599+ return NULL;
1a6e0f06 30600+
e4b2b4a8
JK
30601+ call = file->event_call;
30602+
30603+ if (strcmp(subsys_name, call->class->system) != 0)
30604+ return NULL;
30605+
30606+ if (strcmp(event_name, trace_event_name(call)) != 0)
30607+ return NULL;
30608+ }
30609+
30610+ return create_field_var(target_hist_data, file, var_name);
1a6e0f06
JK
30611+}
30612+
e4b2b4a8
JK
30613+static void onmax_print(struct seq_file *m,
30614+ struct hist_trigger_data *hist_data,
30615+ struct tracing_map_elt *elt,
30616+ struct action_data *data)
1a6e0f06 30617+{
e4b2b4a8 30618+ unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
1a6e0f06 30619+
e4b2b4a8 30620+ seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
1a6e0f06 30621+
e4b2b4a8
JK
30622+ for (i = 0; i < hist_data->n_max_vars; i++) {
30623+ struct hist_field *save_val = hist_data->max_vars[i]->val;
30624+ struct hist_field *save_var = hist_data->max_vars[i]->var;
30625+ u64 val;
1a6e0f06 30626+
e4b2b4a8 30627+ save_var_idx = save_var->var.idx;
1a6e0f06 30628+
e4b2b4a8 30629+ val = tracing_map_read_var(elt, save_var_idx);
1a6e0f06 30630+
e4b2b4a8
JK
30631+ if (save_val->flags & HIST_FIELD_FL_STRING) {
30632+ seq_printf(m, " %s: %-32s", save_var->var.name,
30633+ (char *)(uintptr_t)(val));
30634+ } else
30635+ seq_printf(m, " %s: %10llu", save_var->var.name, val);
30636+ }
1a6e0f06
JK
30637+}
30638+
e4b2b4a8
JK
30639+static void onmax_save(struct hist_trigger_data *hist_data,
30640+ struct tracing_map_elt *elt, void *rec,
30641+ struct ring_buffer_event *rbe,
30642+ struct action_data *data, u64 *var_ref_vals)
1a6e0f06 30643+{
e4b2b4a8
JK
30644+ unsigned int max_idx = data->onmax.max_var->var.idx;
30645+ unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
1a6e0f06 30646+
e4b2b4a8 30647+ u64 var_val, max_val;
1a6e0f06 30648+
e4b2b4a8
JK
30649+ var_val = var_ref_vals[max_var_ref_idx];
30650+ max_val = tracing_map_read_var(elt, max_idx);
30651+
30652+ if (var_val <= max_val)
30653+ return;
30654+
30655+ tracing_map_set_var(elt, max_idx, var_val);
30656+
30657+ update_max_vars(hist_data, elt, rbe, rec);
30658+}
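onmax_save() above implements the onmax() rule: the tracked variable is compared against the stored maximum, and the associated save() fields are snapshotted only when a new maximum is seen. A compact sketch of that update rule with a hypothetical saved pid:

#include <stdio.h>

struct max_entry {
	unsigned long long max;
	int saved_pid;			/* stands in for the onmax().save() vars */
};

static void onmax_update(struct max_entry *e, unsigned long long val, int pid)
{
	if (val <= e->max)
		return;			/* not a new max: nothing is saved */
	e->max = val;
	e->saved_pid = pid;
}

int main(void)
{
	struct max_entry e = { 0, -1 };

	onmax_update(&e, 120, 42);
	onmax_update(&e, 80, 43);	/* ignored, 80 <= 120 */
	onmax_update(&e, 450, 44);

	/* prints: max=450 pid=44 */
	printf("max=%llu pid=%d\n", e.max, e.saved_pid);
	return 0;
}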
1a6e0f06 30659+
e4b2b4a8 30660+static void onmax_destroy(struct action_data *data)
1a6e0f06 30661+{
e4b2b4a8 30662+ unsigned int i;
1a6e0f06 30663+
e4b2b4a8
JK
30664+ destroy_hist_field(data->onmax.max_var, 0);
30665+ destroy_hist_field(data->onmax.var, 0);
1a6e0f06 30666+
e4b2b4a8
JK
30667+ kfree(data->onmax.var_str);
30668+ kfree(data->onmax.fn_name);
1a6e0f06 30669+
e4b2b4a8
JK
30670+ for (i = 0; i < data->n_params; i++)
30671+ kfree(data->params[i]);
1a6e0f06 30672+
e4b2b4a8
JK
30673+ kfree(data);
30674+}
1a6e0f06 30675+
e4b2b4a8
JK
30676+static int onmax_create(struct hist_trigger_data *hist_data,
30677+ struct action_data *data)
30678+{
30679+ struct trace_event_file *file = hist_data->event_file;
30680+ struct hist_field *var_field, *ref_field, *max_var;
30681+ unsigned int var_ref_idx = hist_data->n_var_refs;
30682+ struct field_var *field_var;
30683+ char *onmax_var_str, *param;
30684+ unsigned long flags;
30685+ unsigned int i;
30686+ int ret = 0;
1a6e0f06 30687+
e4b2b4a8
JK
30688+ onmax_var_str = data->onmax.var_str;
30689+ if (onmax_var_str[0] != '$') {
30690+ hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
30691 return -EINVAL;
30692+ }
30693+ onmax_var_str++;
30694
30695- field_name = strsep(&field_str, ".");
30696- if (field_str) {
30697- if (strcmp(field_str, "hex") == 0)
30698- flags |= HIST_FIELD_FL_HEX;
30699- else {
30700+ var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
30701+ if (!var_field) {
30702+ hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
30703+ return -EINVAL;
30704+ }
1a6e0f06 30705+
e4b2b4a8
JK
30706+ flags = HIST_FIELD_FL_VAR_REF;
30707+ ref_field = create_hist_field(hist_data, NULL, flags, NULL);
30708+ if (!ref_field)
30709+ return -ENOMEM;
1a6e0f06 30710+
e4b2b4a8
JK
30711+ if (init_var_ref(ref_field, var_field, NULL, NULL)) {
30712+ destroy_hist_field(ref_field, 0);
30713+ ret = -ENOMEM;
30714+ goto out;
1a6e0f06 30715+ }
e4b2b4a8
JK
30716+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
30717+ ref_field->var_ref_idx = hist_data->n_var_refs++;
30718+ data->onmax.var = ref_field;
30719+
30720+ data->fn = onmax_save;
30721+ data->onmax.max_var_ref_idx = var_ref_idx;
30722+ max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
30723+ if (IS_ERR(max_var)) {
30724+ hist_err("onmax: Couldn't create onmax variable: ", "max");
30725+ ret = PTR_ERR(max_var);
30726+ goto out;
30727+ }
30728+ data->onmax.max_var = max_var;
1a6e0f06 30729+
e4b2b4a8
JK
30730+ for (i = 0; i < data->n_params; i++) {
30731+ param = kstrdup(data->params[i], GFP_KERNEL);
30732+ if (!param) {
30733+ ret = -ENOMEM;
30734+ goto out;
30735+ }
30736+
30737+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
30738+ if (IS_ERR(field_var)) {
30739+ hist_err("onmax: Couldn't create field variable: ", param);
30740+ ret = PTR_ERR(field_var);
30741+ kfree(param);
30742+ goto out;
30743+ }
30744+
30745+ hist_data->max_vars[hist_data->n_max_vars++] = field_var;
30746+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
30747+ hist_data->n_max_var_str++;
30748+
30749+ kfree(param);
1a6e0f06 30750+ }
e4b2b4a8
JK
30751+ out:
30752+ return ret;
1a6e0f06
JK
30753+}
30754+
e4b2b4a8 30755+static int parse_action_params(char *params, struct action_data *data)
1a6e0f06 30756+{
e4b2b4a8
JK
30757+ char *param, *saved_param;
30758+ int ret = 0;
1a6e0f06 30759+
e4b2b4a8
JK
30760+ while (params) {
30761+ if (data->n_params >= SYNTH_FIELDS_MAX)
30762+ goto out;
1a6e0f06 30763+
e4b2b4a8
JK
30764+ param = strsep(&params, ",");
30765+ if (!param) {
30766+ ret = -EINVAL;
30767+ goto out;
30768+ }
1a6e0f06 30769+
e4b2b4a8
JK
30770+ param = strstrip(param);
30771+ if (strlen(param) < 2) {
30772+ hist_err("Invalid action param: ", param);
30773 ret = -EINVAL;
30774 goto out;
30775 }
1a6e0f06 30776+
e4b2b4a8
JK
30777+ saved_param = kstrdup(param, GFP_KERNEL);
30778+ if (!saved_param) {
30779+ ret = -ENOMEM;
30780+ goto out;
30781+ }
30782+
30783+ data->params[data->n_params++] = saved_param;
30784 }
30785+ out:
30786+ return ret;
30787+}
30788
30789- field = trace_find_event_field(file->event_call, field_name);
30790- if (!field || !field->size) {
30791+static struct action_data *onmax_parse(char *str)
1a6e0f06 30792+{
e4b2b4a8
JK
30793+ char *onmax_fn_name, *onmax_var_str;
30794+ struct action_data *data;
30795+ int ret = -EINVAL;
1a6e0f06 30796+
e4b2b4a8
JK
30797+ data = kzalloc(sizeof(*data), GFP_KERNEL);
30798+ if (!data)
30799+ return ERR_PTR(-ENOMEM);
1a6e0f06 30800+
e4b2b4a8
JK
30801+ onmax_var_str = strsep(&str, ")");
30802+ if (!onmax_var_str || !str) {
30803 ret = -EINVAL;
30804- goto out;
30805+ goto free;
30806 }
30807
30808- hist_data->fields[val_idx] = create_hist_field(field, flags);
30809- if (!hist_data->fields[val_idx]) {
30810+ data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
30811+ if (!data->onmax.var_str) {
30812+ ret = -ENOMEM;
30813+ goto free;
30814+ }
30815+
30816+ strsep(&str, ".");
30817+ if (!str)
30818+ goto free;
30819+
30820+ onmax_fn_name = strsep(&str, "(");
30821+ if (!onmax_fn_name || !str)
30822+ goto free;
30823+
30824+ if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
30825+ char *params = strsep(&str, ")");
30826+
30827+ if (!params) {
30828+ ret = -EINVAL;
30829+ goto free;
1a6e0f06 30830+ }
1a6e0f06 30831+
e4b2b4a8
JK
30832+ ret = parse_action_params(params, data);
30833+ if (ret)
30834+ goto free;
30835+ } else
30836+ goto free;
30837+
30838+ data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
30839+ if (!data->onmax.fn_name) {
30840+ ret = -ENOMEM;
30841+ goto free;
1a6e0f06 30842+ }
e4b2b4a8
JK
30843+ out:
30844+ return data;
30845+ free:
30846+ onmax_destroy(data);
30847+ data = ERR_PTR(ret);
30848+ goto out;
1a6e0f06
JK
30849+}
30850+
e4b2b4a8
JK
30851+static void onmatch_destroy(struct action_data *data)
30852+{
30853+ unsigned int i;
30854+
30855+ mutex_lock(&synth_event_mutex);
30856+
30857+ kfree(data->onmatch.match_event);
30858+ kfree(data->onmatch.match_event_system);
30859+ kfree(data->onmatch.synth_event_name);
30860+
30861+ for (i = 0; i < data->n_params; i++)
30862+ kfree(data->params[i]);
30863+
30864+ if (data->onmatch.synth_event)
30865+ data->onmatch.synth_event->ref--;
30866+
30867+ kfree(data);
30868+
30869+ mutex_unlock(&synth_event_mutex);
30870+}
30871+
30872+static void destroy_field_var(struct field_var *field_var)
30873+{
30874+ if (!field_var)
30875+ return;
30876+
30877+ destroy_hist_field(field_var->var, 0);
30878+ destroy_hist_field(field_var->val, 0);
30879+
30880+ kfree(field_var);
1a6e0f06
JK
30881+}
30882+
e4b2b4a8
JK
30883+static void destroy_field_vars(struct hist_trigger_data *hist_data)
30884+{
30885+ unsigned int i;
1a6e0f06 30886+
e4b2b4a8
JK
30887+ for (i = 0; i < hist_data->n_field_vars; i++)
30888+ destroy_field_var(hist_data->field_vars[i]);
30889+}
30890+
30891+static void save_field_var(struct hist_trigger_data *hist_data,
30892+ struct field_var *field_var)
1a6e0f06 30893+{
e4b2b4a8 30894+ hist_data->field_vars[hist_data->n_field_vars++] = field_var;
1a6e0f06 30895+
e4b2b4a8
JK
30896+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
30897+ hist_data->n_field_var_str++;
30898+}
1a6e0f06 30899+
e4b2b4a8
JK
30900+
30901+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
30902+{
30903+ unsigned int i;
30904+
30905+ for (i = 0; i < hist_data->n_synth_var_refs; i++)
30906+ destroy_hist_field(hist_data->synth_var_refs[i], 0);
1a6e0f06 30907+}
e4b2b4a8
JK
30908+
30909+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
30910+ struct hist_field *var_ref)
1a6e0f06 30911+{
e4b2b4a8
JK
30912+ hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
30913+
30914+ hist_data->var_refs[hist_data->n_var_refs] = var_ref;
30915+ var_ref->var_ref_idx = hist_data->n_var_refs++;
1a6e0f06 30916+}
1a6e0f06 30917+
e4b2b4a8
JK
30918+static int check_synth_field(struct synth_event *event,
30919+ struct hist_field *hist_field,
30920+ unsigned int field_pos)
1a6e0f06 30921+{
e4b2b4a8
JK
30922+ struct synth_field *field;
30923+
30924+ if (field_pos >= event->n_fields)
30925+ return -EINVAL;
30926+
30927+ field = event->fields[field_pos];
30928+
30929+ if (strcmp(field->type, hist_field->type) != 0)
30930+ return -EINVAL;
30931+
30932+ return 0;
1a6e0f06
JK
30933+}
30934+
e4b2b4a8
JK
30935+static struct hist_field *
30936+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
30937+ char *system, char *event, char *var)
30938+{
30939+ struct hist_field *hist_field;
30940+
30941+ var++; /* skip '$' */
30942+
30943+ hist_field = find_target_event_var(hist_data, system, event, var);
30944+ if (!hist_field) {
30945+ if (!system) {
30946+ system = data->onmatch.match_event_system;
30947+ event = data->onmatch.match_event;
30948+ }
30949+
30950+ hist_field = find_event_var(hist_data, system, event, var);
1a6e0f06
JK
30951+ }
30952+
e4b2b4a8
JK
30953+ if (!hist_field)
30954+ hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
30955+
30956+ return hist_field;
30957+}
30958+
30959+static struct hist_field *
30960+onmatch_create_field_var(struct hist_trigger_data *hist_data,
30961+ struct action_data *data, char *system,
30962+ char *event, char *var)
30963+{
30964+ struct hist_field *hist_field = NULL;
30965+ struct field_var *field_var;
30966+
30967+ /*
30968+ * First try to create a field var on the target event (the
30969+ * currently being defined). This will create a variable for
30970+ * unqualified fields on the target event, or if qualified,
30971+ * target fields that have qualified names matching the target.
30972+ */
30973+ field_var = create_target_field_var(hist_data, system, event, var);
30974+
30975+ if (field_var && !IS_ERR(field_var)) {
30976+ save_field_var(hist_data, field_var);
30977+ hist_field = field_var->var;
30978+ } else {
30979+ field_var = NULL;
30980+ /*
30981+ * If no explicit system.event is specified, default to

30982+ * looking for fields on the onmatch(system.event.xxx)
30983+ * event.
30984+ */
30985+ if (!system) {
30986+ system = data->onmatch.match_event_system;
30987+ event = data->onmatch.match_event;
30988+ }
30989+
30990+ /*
30991+ * At this point, we're looking at a field on another
30992+ * event. Because we can't modify a hist trigger on
30993+ * another event to add a variable for a field, we need
30994+ * to create a new trigger on that event and create the
30995+ * variable at the same time.
30996+ */
30997+ hist_field = create_field_var_hist(hist_data, system, event, var);
30998+ if (IS_ERR(hist_field))
30999+ goto free;
31000+ }
31001+ out:
31002+ return hist_field;
31003+ free:
31004+ destroy_field_var(field_var);
31005+ hist_field = NULL;
31006+ goto out;
31007+}
31008+
31009+static int onmatch_create(struct hist_trigger_data *hist_data,
31010+ struct trace_event_file *file,
31011+ struct action_data *data)
31012+{
31013+ char *event_name, *param, *system = NULL;
31014+ struct hist_field *hist_field, *var_ref;
31015+ unsigned int i, var_ref_idx;
31016+ unsigned int field_pos = 0;
31017+ struct synth_event *event;
31018+ int ret = 0;
31019+
31020+ mutex_lock(&synth_event_mutex);
31021+ event = find_synth_event(data->onmatch.synth_event_name);
31022+ if (!event) {
31023+ hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
31024+ mutex_unlock(&synth_event_mutex);
31025+ return -EINVAL;
31026+ }
31027+ event->ref++;
31028+ mutex_unlock(&synth_event_mutex);
31029+
31030+ var_ref_idx = hist_data->n_var_refs;
31031+
31032+ for (i = 0; i < data->n_params; i++) {
31033+ char *p;
31034+
31035+ p = param = kstrdup(data->params[i], GFP_KERNEL);
31036+ if (!param) {
31037+ ret = -ENOMEM;
31038+ goto err;
31039+ }
31040+
31041+ system = strsep(&param, ".");
31042+ if (!param) {
31043+ param = (char *)system;
31044+ system = event_name = NULL;
31045+ } else {
31046+ event_name = strsep(&param, ".");
31047+ if (!param) {
31048+ kfree(p);
31049+ ret = -EINVAL;
31050+ goto err;
31051+ }
31052+ }
31053+
31054+ if (param[0] == '$')
31055+ hist_field = onmatch_find_var(hist_data, data, system,
31056+ event_name, param);
31057+ else
31058+ hist_field = onmatch_create_field_var(hist_data, data,
31059+ system,
31060+ event_name,
31061+ param);
31062+
31063+ if (!hist_field) {
31064+ kfree(p);
31065+ ret = -EINVAL;
31066+ goto err;
31067+ }
31068+
31069+ if (check_synth_field(event, hist_field, field_pos) == 0) {
31070+ var_ref = create_var_ref(hist_field, system, event_name);
31071+ if (!var_ref) {
31072+ kfree(p);
31073+ ret = -ENOMEM;
31074+ goto err;
31075+ }
31076+
31077+ save_synth_var_ref(hist_data, var_ref);
31078+ field_pos++;
31079+ kfree(p);
31080+ continue;
31081+ }
31082+
31083+ hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
31084+ system, event_name, param);
31085+ kfree(p);
31086+ ret = -EINVAL;
31087+ goto err;
31088+ }
31089+
31090+ if (field_pos != event->n_fields) {
31091+ hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
31092+ ret = -EINVAL;
31093+ goto err;
31094+ }
31095+
31096+ data->fn = action_trace;
31097+ data->onmatch.synth_event = event;
31098+ data->onmatch.var_ref_idx = var_ref_idx;
31099+ out:
31100+ return ret;
31101+ err:
31102+ mutex_lock(&synth_event_mutex);
31103+ event->ref--;
31104+ mutex_unlock(&synth_event_mutex);
31105+
31106+ goto out;
31107+}
31108+
31109+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
31110+{
31111+ char *match_event, *match_event_system;
31112+ char *synth_event_name, *params;
31113+ struct action_data *data;
31114+ int ret = -EINVAL;
31115+
31116+ data = kzalloc(sizeof(*data), GFP_KERNEL);
31117+ if (!data)
31118+ return ERR_PTR(-ENOMEM);
31119+
31120+ match_event = strsep(&str, ")");
31121+ if (!match_event || !str) {
31122+ hist_err("onmatch: Missing closing paren: ", match_event);
31123+ goto free;
31124+ }
31125+
31126+ match_event_system = strsep(&match_event, ".");
31127+ if (!match_event) {
31128+ hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
31129+ goto free;
31130+ }
31131+
31132+ if (IS_ERR(event_file(tr, match_event_system, match_event))) {
31133+ hist_err_event("onmatch: Invalid subsystem or event name: ",
31134+ match_event_system, match_event, NULL);
31135+ goto free;
31136+ }
31137+
31138+ data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
31139+ if (!data->onmatch.match_event) {
31140+ ret = -ENOMEM;
31141+ goto free;
31142+ }
31143+
31144+ data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
31145+ if (!data->onmatch.match_event_system) {
31146+ ret = -ENOMEM;
31147+ goto free;
31148+ }
31149+
31150+ strsep(&str, ".");
31151+ if (!str) {
31152+ hist_err("onmatch: Missing . after onmatch(): ", str);
31153+ goto free;
31154+ }
31155+
31156+ synth_event_name = strsep(&str, "(");
31157+ if (!synth_event_name || !str) {
31158+ hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
31159+ goto free;
31160+ }
31161+
31162+ data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
31163+ if (!data->onmatch.synth_event_name) {
31164 ret = -ENOMEM;
31165+ goto free;
31166+ }
31167+
31168+ params = strsep(&str, ")");
31169+ if (!params || !str || (str && strlen(str))) {
31170+ hist_err("onmatch: Missing closing paramlist paren: ", params);
31171+ goto free;
31172+ }
31173+
31174+ ret = parse_action_params(params, data);
31175+ if (ret)
31176+ goto free;
31177+ out:
31178+ return data;
31179+ free:
31180+ onmatch_destroy(data);
31181+ data = ERR_PTR(ret);
31182+ goto out;
31183+}
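onmatch_parse() above cuts the text after "onmatch(" into the matched system.event, the synthetic event name and its parameter list using successive strsep() calls. A standalone sketch of that tokenization on an example action string (event and parameter names are illustrative only):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "sched.sched_switch).wakeup_latency($lat,pid)";
	char *str = buf;

	char *match_event = strsep(&str, ")");		/* "sched.sched_switch" */
	char *system = strsep(&match_event, ".");	/* "sched" */
	/* match_event is now "sched_switch" */

	strsep(&str, ".");				/* drop the "." */
	char *synth_event = strsep(&str, "(");		/* "wakeup_latency" */
	char *params = strsep(&str, ")");		/* "$lat,pid" */

	printf("system=%s event=%s synth=%s params=%s\n",
	       system, match_event, synth_event, params);
	return 0;
}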
31184+
31185+static int create_hitcount_val(struct hist_trigger_data *hist_data)
31186+{
31187+ hist_data->fields[HITCOUNT_IDX] =
31188+ create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
31189+ if (!hist_data->fields[HITCOUNT_IDX])
31190+ return -ENOMEM;
31191+
31192+ hist_data->n_vals++;
31193+ hist_data->n_fields++;
31194+
31195+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31196+ return -EINVAL;
31197+
31198+ return 0;
31199+}
31200+
31201+static int __create_val_field(struct hist_trigger_data *hist_data,
31202+ unsigned int val_idx,
31203+ struct trace_event_file *file,
31204+ char *var_name, char *field_str,
31205+ unsigned long flags)
31206+{
31207+ struct hist_field *hist_field;
31208+ int ret = 0;
31209+
31210+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
31211+ if (IS_ERR(hist_field)) {
31212+ ret = PTR_ERR(hist_field);
31213 goto out;
1a6e0f06
JK
31214 }
31215
e4b2b4a8
JK
31216+ hist_data->fields[val_idx] = hist_field;
31217+
31218 ++hist_data->n_vals;
31219+ ++hist_data->n_fields;
1a6e0f06 31220
e4b2b4a8
JK
31221- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31222+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31223 ret = -EINVAL;
31224 out:
1a6e0f06
JK
31225 return ret;
31226 }
1a6e0f06 31227
e4b2b4a8
JK
31228+static int create_val_field(struct hist_trigger_data *hist_data,
31229+ unsigned int val_idx,
31230+ struct trace_event_file *file,
31231+ char *field_str)
1a6e0f06 31232+{
e4b2b4a8
JK
31233+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
31234+ return -EINVAL;
1a6e0f06 31235+
e4b2b4a8 31236+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
1a6e0f06
JK
31237+}
31238+
e4b2b4a8
JK
31239+static int create_var_field(struct hist_trigger_data *hist_data,
31240+ unsigned int val_idx,
31241+ struct trace_event_file *file,
31242+ char *var_name, char *expr_str)
1a6e0f06 31243+{
e4b2b4a8 31244+ unsigned long flags = 0;
1a6e0f06 31245+
e4b2b4a8
JK
31246+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31247+ return -EINVAL;
1a6e0f06 31248+
e4b2b4a8
JK
31249+ if (find_var(hist_data, file, var_name) && !hist_data->remove) {
31250+ hist_err("Variable already defined: ", var_name);
31251+ return -EINVAL;
31252+ }
31253+
31254+ flags |= HIST_FIELD_FL_VAR;
31255+ hist_data->n_vars++;
31256+ if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
31257+ return -EINVAL;
31258+
31259+ return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
31260+}
31261+
31262 static int create_val_fields(struct hist_trigger_data *hist_data,
31263 struct trace_event_file *file)
31264 {
31265 char *fields_str, *field_str;
31266- unsigned int i, j;
31267+ unsigned int i, j = 1;
31268 int ret;
1a6e0f06 31269
e4b2b4a8
JK
31270 ret = create_hitcount_val(hist_data);
31271@@ -493,12 +3912,15 @@
31272 field_str = strsep(&fields_str, ",");
31273 if (!field_str)
31274 break;
31275+
31276 if (strcmp(field_str, "hitcount") == 0)
31277 continue;
31278+
31279 ret = create_val_field(hist_data, j++, file, field_str);
31280 if (ret)
31281 goto out;
1a6e0f06 31282 }
e4b2b4a8
JK
31283+
31284 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
31285 ret = -EINVAL;
31286 out:
31287@@ -511,12 +3933,13 @@
31288 struct trace_event_file *file,
31289 char *field_str)
31290 {
31291- struct ftrace_event_field *field = NULL;
31292+ struct hist_field *hist_field = NULL;
31293+
31294 unsigned long flags = 0;
31295 unsigned int key_size;
31296 int ret = 0;
1a6e0f06 31297
e4b2b4a8
JK
31298- if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
31299+ if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
31300 return -EINVAL;
1a6e0f06 31301
e4b2b4a8
JK
31302 flags |= HIST_FIELD_FL_KEY;
31303@@ -524,57 +3947,40 @@
31304 if (strcmp(field_str, "stacktrace") == 0) {
31305 flags |= HIST_FIELD_FL_STACKTRACE;
31306 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
31307+ hist_field = create_hist_field(hist_data, NULL, flags, NULL);
31308 } else {
31309- char *field_name = strsep(&field_str, ".");
31310-
31311- if (field_str) {
31312- if (strcmp(field_str, "hex") == 0)
31313- flags |= HIST_FIELD_FL_HEX;
31314- else if (strcmp(field_str, "sym") == 0)
31315- flags |= HIST_FIELD_FL_SYM;
31316- else if (strcmp(field_str, "sym-offset") == 0)
31317- flags |= HIST_FIELD_FL_SYM_OFFSET;
31318- else if ((strcmp(field_str, "execname") == 0) &&
31319- (strcmp(field_name, "common_pid") == 0))
31320- flags |= HIST_FIELD_FL_EXECNAME;
31321- else if (strcmp(field_str, "syscall") == 0)
31322- flags |= HIST_FIELD_FL_SYSCALL;
31323- else if (strcmp(field_str, "log2") == 0)
31324- flags |= HIST_FIELD_FL_LOG2;
31325- else {
31326- ret = -EINVAL;
31327- goto out;
31328- }
31329+ hist_field = parse_expr(hist_data, file, field_str, flags,
31330+ NULL, 0);
31331+ if (IS_ERR(hist_field)) {
31332+ ret = PTR_ERR(hist_field);
31333+ goto out;
31334 }
1a6e0f06 31335
e4b2b4a8
JK
31336- field = trace_find_event_field(file->event_call, field_name);
31337- if (!field || !field->size) {
31338+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
31339+ hist_err("Using variable references as keys not supported: ", field_str);
31340+ destroy_hist_field(hist_field, 0);
31341 ret = -EINVAL;
31342 goto out;
1a6e0f06 31343 }
e4b2b4a8
JK
31344
31345- if (is_string_field(field))
31346- key_size = MAX_FILTER_STR_VAL;
31347- else
31348- key_size = field->size;
31349+ key_size = hist_field->size;
1a6e0f06 31350 }
1a6e0f06 31351
e4b2b4a8
JK
31352- hist_data->fields[key_idx] = create_hist_field(field, flags);
31353- if (!hist_data->fields[key_idx]) {
31354- ret = -ENOMEM;
31355- goto out;
31356- }
31357+ hist_data->fields[key_idx] = hist_field;
31358
31359 key_size = ALIGN(key_size, sizeof(u64));
31360 hist_data->fields[key_idx]->size = key_size;
31361 hist_data->fields[key_idx]->offset = key_offset;
31362+
31363 hist_data->key_size += key_size;
31364+
31365 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
31366 ret = -EINVAL;
31367 goto out;
1a6e0f06 31368 }
1a6e0f06 31369
e4b2b4a8
JK
31370 hist_data->n_keys++;
31371+ hist_data->n_fields++;
1a6e0f06 31372
e4b2b4a8
JK
31373 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
31374 return -EINVAL;
31375@@ -618,21 +4024,113 @@
31376 return ret;
1a6e0f06
JK
31377 }
31378
e4b2b4a8
JK
31379+static int create_var_fields(struct hist_trigger_data *hist_data,
31380+ struct trace_event_file *file)
31381+{
31382+ unsigned int i, j = hist_data->n_vals;
31383+ int ret = 0;
31384+
31385+ unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
31386+
31387+ for (i = 0; i < n_vars; i++) {
31388+ char *var_name = hist_data->attrs->var_defs.name[i];
31389+ char *expr = hist_data->attrs->var_defs.expr[i];
31390+
31391+ ret = create_var_field(hist_data, j++, file, var_name, expr);
31392+ if (ret)
31393+ goto out;
31394+ }
31395+ out:
31396+ return ret;
31397+}
31398+
31399+static void free_var_defs(struct hist_trigger_data *hist_data)
31400+{
31401+ unsigned int i;
31402+
31403+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
31404+ kfree(hist_data->attrs->var_defs.name[i]);
31405+ kfree(hist_data->attrs->var_defs.expr[i]);
31406+ }
31407+
31408+ hist_data->attrs->var_defs.n_vars = 0;
31409+}
31410+
31411+static int parse_var_defs(struct hist_trigger_data *hist_data)
31412+{
31413+ char *s, *str, *var_name, *field_str;
31414+ unsigned int i, j, n_vars = 0;
31415+ int ret = 0;
31416+
31417+ for (i = 0; i < hist_data->attrs->n_assignments; i++) {
31418+ str = hist_data->attrs->assignment_str[i];
31419+ for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
31420+ field_str = strsep(&str, ",");
31421+ if (!field_str)
31422+ break;
31423+
31424+ var_name = strsep(&field_str, "=");
31425+ if (!var_name || !field_str) {
31426+ hist_err("Malformed assignment: ", var_name);
31427+ ret = -EINVAL;
31428+ goto free;
31429+ }
31430+
31431+ if (n_vars == TRACING_MAP_VARS_MAX) {
31432+ hist_err("Too many variables defined: ", var_name);
31433+ ret = -EINVAL;
31434+ goto free;
31435+ }
31436+
31437+ s = kstrdup(var_name, GFP_KERNEL);
31438+ if (!s) {
31439+ ret = -ENOMEM;
31440+ goto free;
31441+ }
31442+ hist_data->attrs->var_defs.name[n_vars] = s;
31443+
31444+ s = kstrdup(field_str, GFP_KERNEL);
31445+ if (!s) {
31446+ kfree(hist_data->attrs->var_defs.name[n_vars]);
31447+ ret = -ENOMEM;
31448+ goto free;
31449+ }
31450+ hist_data->attrs->var_defs.expr[n_vars++] = s;
31451+
31452+ hist_data->attrs->var_defs.n_vars = n_vars;
31453+ }
31454+ }
31455+
31456+ return ret;
31457+ free:
31458+ free_var_defs(hist_data);
31459+
31460+ return ret;
31461+}
31462+
31463 static int create_hist_fields(struct hist_trigger_data *hist_data,
31464 struct trace_event_file *file)
31465 {
31466 int ret;
1a6e0f06 31467
e4b2b4a8
JK
31468+ ret = parse_var_defs(hist_data);
31469+ if (ret)
31470+ goto out;
31471+
31472 ret = create_val_fields(hist_data, file);
31473 if (ret)
31474 goto out;
1a6e0f06 31475
e4b2b4a8
JK
31476- ret = create_key_fields(hist_data, file);
31477+ ret = create_var_fields(hist_data, file);
31478 if (ret)
31479 goto out;
1a6e0f06 31480
e4b2b4a8
JK
31481- hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
31482+ ret = create_key_fields(hist_data, file);
31483+ if (ret)
31484+ goto out;
31485 out:
31486+ free_var_defs(hist_data);
31487+
31488 return ret;
1a6e0f06
JK
31489 }
31490
e4b2b4a8
JK
31491@@ -653,10 +4151,9 @@
31492 static int create_sort_keys(struct hist_trigger_data *hist_data)
1a6e0f06 31493 {
e4b2b4a8
JK
31494 char *fields_str = hist_data->attrs->sort_key_str;
31495- struct ftrace_event_field *field = NULL;
31496 struct tracing_map_sort_key *sort_key;
31497 int descending, ret = 0;
31498- unsigned int i, j;
31499+ unsigned int i, j, k;
1a6e0f06 31500
e4b2b4a8 31501 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
1a6e0f06 31502
e4b2b4a8
JK
31503@@ -670,7 +4167,9 @@
31504 }
1a6e0f06 31505
e4b2b4a8
JK
31506 for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
31507+ struct hist_field *hist_field;
31508 char *field_str, *field_name;
31509+ const char *test_name;
1a6e0f06 31510
e4b2b4a8 31511 sort_key = &hist_data->sort_keys[i];
1a6e0f06 31512
e4b2b4a8
JK
31513@@ -702,10 +4201,19 @@
31514 continue;
31515 }
31516
31517- for (j = 1; j < hist_data->n_fields; j++) {
31518- field = hist_data->fields[j]->field;
31519- if (field && (strcmp(field_name, field->name) == 0)) {
31520- sort_key->field_idx = j;
31521+ for (j = 1, k = 1; j < hist_data->n_fields; j++) {
31522+ unsigned int idx;
31523+
31524+ hist_field = hist_data->fields[j];
31525+ if (hist_field->flags & HIST_FIELD_FL_VAR)
31526+ continue;
31527+
31528+			idx = k++;
31529+
31530+			test_name = hist_field_name(hist_field, 0);
31531+
31532+ if (strcmp(field_name, test_name) == 0) {
31533+ sort_key->field_idx = idx;
31534 descending = is_descending(field_str);
31535 if (descending < 0) {
31536 ret = descending;
31537@@ -720,16 +4228,230 @@
31538 break;
31539 }
31540 }
31541+
31542 hist_data->n_sort_keys = i;
31543 out:
31544 return ret;
31545 }
31546
31547+static void destroy_actions(struct hist_trigger_data *hist_data)
31548+{
31549+ unsigned int i;
31550+
31551+ for (i = 0; i < hist_data->n_actions; i++) {
31552+ struct action_data *data = hist_data->actions[i];
31553+
31554+ if (data->fn == action_trace)
31555+ onmatch_destroy(data);
31556+ else if (data->fn == onmax_save)
31557+ onmax_destroy(data);
31558+ else
31559+ kfree(data);
31560+ }
31561+}
31562+
31563+static int parse_actions(struct hist_trigger_data *hist_data)
31564+{
31565+ struct trace_array *tr = hist_data->event_file->tr;
31566+ struct action_data *data;
31567+ unsigned int i;
31568+ int ret = 0;
31569+ char *str;
31570+
31571+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
31572+ str = hist_data->attrs->action_str[i];
31573+
31574+ if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
31575+ char *action_str = str + strlen("onmatch(");
31576+
31577+ data = onmatch_parse(tr, action_str);
31578+ if (IS_ERR(data)) {
31579+ ret = PTR_ERR(data);
31580+ break;
31581+ }
31582+ data->fn = action_trace;
31583+ } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
31584+ char *action_str = str + strlen("onmax(");
31585+
31586+ data = onmax_parse(action_str);
31587+ if (IS_ERR(data)) {
31588+ ret = PTR_ERR(data);
31589+ break;
31590+ }
31591+ data->fn = onmax_save;
31592+ } else {
31593+ ret = -EINVAL;
31594+ break;
31595+ }
31596+
31597+ hist_data->actions[hist_data->n_actions++] = data;
31598+ }
31599+
31600+ return ret;
31601+}
31602+
31603+static int create_actions(struct hist_trigger_data *hist_data,
31604+ struct trace_event_file *file)
31605+{
31606+ struct action_data *data;
31607+ unsigned int i;
31608+ int ret = 0;
31609+
31610+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
31611+ data = hist_data->actions[i];
31612+
31613+ if (data->fn == action_trace) {
31614+ ret = onmatch_create(hist_data, file, data);
31615+ if (ret)
31616+ return ret;
31617+ } else if (data->fn == onmax_save) {
31618+ ret = onmax_create(hist_data, data);
31619+ if (ret)
31620+ return ret;
31621+ }
31622+ }
31623+
31624+ return ret;
31625+}
31626+
31627+static void print_actions(struct seq_file *m,
31628+ struct hist_trigger_data *hist_data,
31629+ struct tracing_map_elt *elt)
31630+{
31631+ unsigned int i;
31632+
31633+ for (i = 0; i < hist_data->n_actions; i++) {
31634+ struct action_data *data = hist_data->actions[i];
31635+
31636+ if (data->fn == onmax_save)
31637+ onmax_print(m, hist_data, elt, data);
31638+ }
31639+}
31640+
31641+static void print_onmax_spec(struct seq_file *m,
31642+ struct hist_trigger_data *hist_data,
31643+ struct action_data *data)
31644+{
31645+ unsigned int i;
31646+
31647+ seq_puts(m, ":onmax(");
31648+ seq_printf(m, "%s", data->onmax.var_str);
31649+ seq_printf(m, ").%s(", data->onmax.fn_name);
31650+
31651+ for (i = 0; i < hist_data->n_max_vars; i++) {
31652+ seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
31653+ if (i < hist_data->n_max_vars - 1)
31654+ seq_puts(m, ",");
31655+ }
31656+ seq_puts(m, ")");
31657+}
31658+
31659+static void print_onmatch_spec(struct seq_file *m,
31660+ struct hist_trigger_data *hist_data,
31661+ struct action_data *data)
31662+{
31663+ unsigned int i;
31664+
31665+ seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
31666+ data->onmatch.match_event);
31667+
31668+	seq_printf(m, "%s(", data->onmatch.synth_event->name);
31669+
31670+ for (i = 0; i < data->n_params; i++) {
31671+ if (i)
31672+ seq_puts(m, ",");
31673+ seq_printf(m, "%s", data->params[i]);
31674+ }
31675+
31676+ seq_puts(m, ")");
31677+}
31678+
31679+static bool actions_match(struct hist_trigger_data *hist_data,
31680+ struct hist_trigger_data *hist_data_test)
31681+{
31682+	unsigned int i, j;
31683+
31684+ if (hist_data->n_actions != hist_data_test->n_actions)
31685+ return false;
31686+
31687+ for (i = 0; i < hist_data->n_actions; i++) {
31688+ struct action_data *data = hist_data->actions[i];
31689+ struct action_data *data_test = hist_data_test->actions[i];
31690+
31691+ if (data->fn != data_test->fn)
31692+ return false;
31693+
31694+ if (data->n_params != data_test->n_params)
31695+ return false;
31696+
31697+ for (j = 0; j < data->n_params; j++) {
31698+ if (strcmp(data->params[j], data_test->params[j]) != 0)
31699+ return false;
31700+ }
31701+
31702+ if (data->fn == action_trace) {
31703+ if (strcmp(data->onmatch.synth_event_name,
31704+ data_test->onmatch.synth_event_name) != 0)
31705+ return false;
31706+ if (strcmp(data->onmatch.match_event_system,
31707+ data_test->onmatch.match_event_system) != 0)
31708+ return false;
31709+ if (strcmp(data->onmatch.match_event,
31710+ data_test->onmatch.match_event) != 0)
31711+ return false;
31712+ } else if (data->fn == onmax_save) {
31713+ if (strcmp(data->onmax.var_str,
31714+ data_test->onmax.var_str) != 0)
31715+ return false;
31716+ if (strcmp(data->onmax.fn_name,
31717+ data_test->onmax.fn_name) != 0)
31718+ return false;
31719+		}
31720+	}
31721+
31722+ return true;
31723+}
31724+
31725+
31726+static void print_actions_spec(struct seq_file *m,
31727+ struct hist_trigger_data *hist_data)
31728+{
31729+ unsigned int i;
31730+
31731+ for (i = 0; i < hist_data->n_actions; i++) {
31732+ struct action_data *data = hist_data->actions[i];
31733+
31734+ if (data->fn == action_trace)
31735+ print_onmatch_spec(m, hist_data, data);
31736+ else if (data->fn == onmax_save)
31737+ print_onmax_spec(m, hist_data, data);
31738+	}
31739+}
31740+
31741+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
31742+{
31743+	unsigned int i;
31744+
31745+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
31746+ kfree(hist_data->field_var_hists[i]->cmd);
31747+ kfree(hist_data->field_var_hists[i]);
31748+	}
31749+}
31750+
31751 static void destroy_hist_data(struct hist_trigger_data *hist_data)
31752 {
31753+ if (!hist_data)
31754+ return;
31755+
31756 destroy_hist_trigger_attrs(hist_data->attrs);
31757 destroy_hist_fields(hist_data);
31758 tracing_map_destroy(hist_data->map);
31759+
31760+ destroy_actions(hist_data);
31761+ destroy_field_vars(hist_data);
31762+ destroy_field_var_hists(hist_data);
31763+ destroy_synth_var_refs(hist_data);
31764+
31765 kfree(hist_data);
31766 }
31767
31768@@ -738,7 +4460,7 @@
31769 struct tracing_map *map = hist_data->map;
31770 struct ftrace_event_field *field;
31771 struct hist_field *hist_field;
31772- int i, idx;
31773+ int i, idx = 0;
31774
31775 for_each_hist_field(i, hist_data) {
31776 hist_field = hist_data->fields[i];
31777@@ -749,6 +4471,9 @@
31778
31779 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
31780 cmp_fn = tracing_map_cmp_none;
31781+ else if (!field)
31782+ cmp_fn = tracing_map_cmp_num(hist_field->size,
31783+ hist_field->is_signed);
31784 else if (is_string_field(field))
31785 cmp_fn = tracing_map_cmp_string;
31786 else
31787@@ -757,36 +4482,29 @@
31788 idx = tracing_map_add_key_field(map,
31789 hist_field->offset,
31790 cmp_fn);
31791-
31792- } else
31793+ } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
31794 idx = tracing_map_add_sum_field(map);
31795
31796 if (idx < 0)
31797 return idx;
31798- }
31799-
31800- return 0;
31801-}
31802-
31803-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
31804-{
31805- struct hist_field *key_field;
31806- unsigned int i;
31807-
31808- for_each_hist_key_field(i, hist_data) {
31809- key_field = hist_data->fields[i];
31810
31811- if (key_field->flags & HIST_FIELD_FL_EXECNAME)
31812- return true;
31813+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
31814+ idx = tracing_map_add_var(map);
31815+ if (idx < 0)
31816+ return idx;
31817+ hist_field->var.idx = idx;
31818+ hist_field->var.hist_data = hist_data;
31819+ }
31820 }
31821
31822- return false;
31823+	return 0;
31824 }
31825
31826 static struct hist_trigger_data *
31827 create_hist_data(unsigned int map_bits,
31828 struct hist_trigger_attrs *attrs,
31829- struct trace_event_file *file)
31830+ struct trace_event_file *file,
31831+ bool remove)
31832 {
31833 const struct tracing_map_ops *map_ops = NULL;
31834 struct hist_trigger_data *hist_data;
31835@@ -797,6 +4515,12 @@
31836 return ERR_PTR(-ENOMEM);
31837
31838 hist_data->attrs = attrs;
31839+ hist_data->remove = remove;
31840+ hist_data->event_file = file;
31841+
31842+ ret = parse_actions(hist_data);
31843+ if (ret)
31844+ goto free;
31845
31846 ret = create_hist_fields(hist_data, file);
31847 if (ret)
31848@@ -806,8 +4530,7 @@
31849 if (ret)
31850 goto free;
31851
31852- if (need_tracing_map_ops(hist_data))
31853- map_ops = &hist_trigger_elt_comm_ops;
31854+ map_ops = &hist_trigger_elt_data_ops;
31855
31856 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
31857 map_ops, hist_data);
31858@@ -820,12 +4543,6 @@
31859 ret = create_tracing_map_fields(hist_data);
31860 if (ret)
31861 goto free;
31862-
31863- ret = tracing_map_init(hist_data->map);
31864- if (ret)
31865- goto free;
31866-
31867- hist_data->event_file = file;
31868 out:
31869 return hist_data;
31870 free:
31871@@ -839,18 +4556,39 @@
31872 }
31873
31874 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
31875- struct tracing_map_elt *elt,
31876- void *rec)
31877+ struct tracing_map_elt *elt, void *rec,
31878+ struct ring_buffer_event *rbe,
31879+ u64 *var_ref_vals)
31880 {
31881+ struct hist_elt_data *elt_data;
31882 struct hist_field *hist_field;
31883- unsigned int i;
31884+ unsigned int i, var_idx;
31885 u64 hist_val;
31886
31887+ elt_data = elt->private_data;
31888+ elt_data->var_ref_vals = var_ref_vals;
31889+
31890 for_each_hist_val_field(i, hist_data) {
31891 hist_field = hist_data->fields[i];
31892- hist_val = hist_field->fn(hist_field, rec);
31893+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31894+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
31895+ var_idx = hist_field->var.idx;
31896+ tracing_map_set_var(elt, var_idx, hist_val);
31897+ continue;
31898+ }
31899 tracing_map_update_sum(elt, i, hist_val);
31900 }
31901+
31902+ for_each_hist_key_field(i, hist_data) {
31903+ hist_field = hist_data->fields[i];
31904+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
31905+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31906+ var_idx = hist_field->var.idx;
31907+ tracing_map_set_var(elt, var_idx, hist_val);
31908+ }
31909+ }
31910+
31911+ update_field_vars(hist_data, elt, rbe, rec);
31912 }
31913
31914 static inline void add_to_key(char *compound_key, void *key,
31915@@ -877,15 +4615,31 @@
31916 memcpy(compound_key + key_field->offset, key, size);
31917 }
31918
31919-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
31920+static void
31921+hist_trigger_actions(struct hist_trigger_data *hist_data,
31922+ struct tracing_map_elt *elt, void *rec,
31923+ struct ring_buffer_event *rbe, u64 *var_ref_vals)
31924+{
31925+ struct action_data *data;
31926+ unsigned int i;
31927+
31928+ for (i = 0; i < hist_data->n_actions; i++) {
31929+ data = hist_data->actions[i];
31930+ data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
31931+	}
31932+}
31933+
31934+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
31935+ struct ring_buffer_event *rbe)
31936 {
31937 struct hist_trigger_data *hist_data = data->private_data;
31938 bool use_compound_key = (hist_data->n_keys > 1);
31939 unsigned long entries[HIST_STACKTRACE_DEPTH];
31940+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
31941 char compound_key[HIST_KEY_SIZE_MAX];
31942+ struct tracing_map_elt *elt = NULL;
31943 struct stack_trace stacktrace;
31944 struct hist_field *key_field;
31945- struct tracing_map_elt *elt;
31946 u64 field_contents;
31947 void *key = NULL;
31948 unsigned int i;
31949@@ -906,7 +4660,7 @@
31950
31951 key = entries;
31952 } else {
31953- field_contents = key_field->fn(key_field, rec);
31954+ field_contents = key_field->fn(key_field, elt, rbe, rec);
31955 if (key_field->flags & HIST_FIELD_FL_STRING) {
31956 key = (void *)(unsigned long)field_contents;
31957 use_compound_key = true;
31958@@ -921,9 +4675,18 @@
31959 if (use_compound_key)
31960 key = compound_key;
31961
31962+ if (hist_data->n_var_refs &&
31963+ !resolve_var_refs(hist_data, key, var_ref_vals, false))
31964+ return;
31965+
31966 elt = tracing_map_insert(hist_data->map, key);
31967- if (elt)
31968- hist_trigger_elt_update(hist_data, elt, rec);
31969+ if (!elt)
31970+ return;
31971+
31972+ hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
31973+
31974+ if (resolve_var_refs(hist_data, key, var_ref_vals, true))
31975+ hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
31976 }
31977
31978 static void hist_trigger_stacktrace_print(struct seq_file *m,
31979@@ -952,6 +4715,7 @@
31980 struct hist_field *key_field;
31981 char str[KSYM_SYMBOL_LEN];
31982 bool multiline = false;
31983+ const char *field_name;
31984 unsigned int i;
31985 u64 uval;
31986
31987@@ -963,26 +4727,33 @@
31988 if (i > hist_data->n_vals)
31989 seq_puts(m, ", ");
31990
31991+ field_name = hist_field_name(key_field, 0);
31992+
31993 if (key_field->flags & HIST_FIELD_FL_HEX) {
31994 uval = *(u64 *)(key + key_field->offset);
31995- seq_printf(m, "%s: %llx",
31996- key_field->field->name, uval);
31997+ seq_printf(m, "%s: %llx", field_name, uval);
31998 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
31999 uval = *(u64 *)(key + key_field->offset);
32000 sprint_symbol_no_offset(str, uval);
32001- seq_printf(m, "%s: [%llx] %-45s",
32002- key_field->field->name, uval, str);
32003+ seq_printf(m, "%s: [%llx] %-45s", field_name,
32004+ uval, str);
32005 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
32006 uval = *(u64 *)(key + key_field->offset);
32007 sprint_symbol(str, uval);
32008- seq_printf(m, "%s: [%llx] %-55s",
32009- key_field->field->name, uval, str);
32010+ seq_printf(m, "%s: [%llx] %-55s", field_name,
32011+ uval, str);
32012 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
32013- char *comm = elt->private_data;
32014+ struct hist_elt_data *elt_data = elt->private_data;
32015+ char *comm;
32016+
32017+ if (WARN_ON_ONCE(!elt_data))
32018+ return;
32019+
32020+ comm = elt_data->comm;
32021
32022 uval = *(u64 *)(key + key_field->offset);
32023- seq_printf(m, "%s: %-16s[%10llu]",
32024- key_field->field->name, comm, uval);
32025+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
32026+ comm, uval);
32027 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
32028 const char *syscall_name;
32029
32030@@ -991,8 +4762,8 @@
32031 if (!syscall_name)
32032 syscall_name = "unknown_syscall";
32033
32034- seq_printf(m, "%s: %-30s[%3llu]",
32035- key_field->field->name, syscall_name, uval);
32036+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
32037+ syscall_name, uval);
32038 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
32039 seq_puts(m, "stacktrace:\n");
32040 hist_trigger_stacktrace_print(m,
32041@@ -1000,15 +4771,14 @@
32042 HIST_STACKTRACE_DEPTH);
32043 multiline = true;
32044 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
32045- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
32046+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
32047 *(u64 *)(key + key_field->offset));
32048 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
32049- seq_printf(m, "%s: %-50s", key_field->field->name,
32050+ seq_printf(m, "%s: %-50s", field_name,
32051 (char *)(key + key_field->offset));
32052 } else {
32053 uval = *(u64 *)(key + key_field->offset);
32054- seq_printf(m, "%s: %10llu", key_field->field->name,
32055- uval);
32056+ seq_printf(m, "%s: %10llu", field_name, uval);
32057 }
32058 }
32059
32060@@ -1021,17 +4791,23 @@
32061 tracing_map_read_sum(elt, HITCOUNT_IDX));
32062
32063 for (i = 1; i < hist_data->n_vals; i++) {
32064+ field_name = hist_field_name(hist_data->fields[i], 0);
32065+
32066+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
32067+ hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
32068+ continue;
32069+
32070 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
32071- seq_printf(m, " %s: %10llx",
32072- hist_data->fields[i]->field->name,
32073+ seq_printf(m, " %s: %10llx", field_name,
32074 tracing_map_read_sum(elt, i));
32075 } else {
32076- seq_printf(m, " %s: %10llu",
32077- hist_data->fields[i]->field->name,
32078+ seq_printf(m, " %s: %10llu", field_name,
32079 tracing_map_read_sum(elt, i));
32080 }
32081 }
32082
32083+ print_actions(m, hist_data, elt);
32084+
32085 seq_puts(m, "\n");
32086 }
32087
32088@@ -1102,6 +4878,11 @@
32089 hist_trigger_show(m, data, n++);
32090 }
32091
32092+ if (have_hist_err()) {
32093+ seq_printf(m, "\nERROR: %s\n", hist_err_str);
32094+ seq_printf(m, " Last command: %s\n", last_hist_cmd);
32095+ }
32096+
32097 out_unlock:
32098 mutex_unlock(&event_mutex);
32099
32100@@ -1120,34 +4901,31 @@
32101 .release = single_release,
32102 };
32103
32104-static const char *get_hist_field_flags(struct hist_field *hist_field)
32105+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32106 {
32107- const char *flags_str = NULL;
32108+ const char *field_name = hist_field_name(hist_field, 0);
32109
32110- if (hist_field->flags & HIST_FIELD_FL_HEX)
32111- flags_str = "hex";
32112- else if (hist_field->flags & HIST_FIELD_FL_SYM)
32113- flags_str = "sym";
32114- else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
32115- flags_str = "sym-offset";
32116- else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
32117- flags_str = "execname";
32118- else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
32119- flags_str = "syscall";
32120- else if (hist_field->flags & HIST_FIELD_FL_LOG2)
32121- flags_str = "log2";
32122+ if (hist_field->var.name)
32123+ seq_printf(m, "%s=", hist_field->var.name);
32124
32125- return flags_str;
32126-}
32127+ if (hist_field->flags & HIST_FIELD_FL_CPU)
32128+ seq_puts(m, "cpu");
32129+ else if (field_name) {
32130+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
32131+ hist_field->flags & HIST_FIELD_FL_ALIAS)
32132+ seq_putc(m, '$');
32133+ seq_printf(m, "%s", field_name);
32134+ } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
32135+ seq_puts(m, "common_timestamp");
32136
32137-static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32138-{
32139- seq_printf(m, "%s", hist_field->field->name);
32140 if (hist_field->flags) {
32141- const char *flags_str = get_hist_field_flags(hist_field);
32142+ if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
32143+ !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
32144+ const char *flags = get_hist_field_flags(hist_field);
32145
32146- if (flags_str)
32147- seq_printf(m, ".%s", flags_str);
32148+ if (flags)
32149+ seq_printf(m, ".%s", flags);
32150+ }
32151 }
32152 }
32153
32154@@ -1156,7 +4934,8 @@
32155 struct event_trigger_data *data)
32156 {
32157 struct hist_trigger_data *hist_data = data->private_data;
32158- struct hist_field *key_field;
32159+ struct hist_field *field;
32160+ bool have_var = false;
32161 unsigned int i;
32162
32163 seq_puts(m, "hist:");
32164@@ -1167,25 +4946,47 @@
32165 seq_puts(m, "keys=");
32166
32167 for_each_hist_key_field(i, hist_data) {
32168- key_field = hist_data->fields[i];
32169+ field = hist_data->fields[i];
32170
32171 if (i > hist_data->n_vals)
32172 seq_puts(m, ",");
32173
32174- if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
32175+ if (field->flags & HIST_FIELD_FL_STACKTRACE)
32176 seq_puts(m, "stacktrace");
32177 else
32178- hist_field_print(m, key_field);
32179+ hist_field_print(m, field);
32180 }
32181
32182 seq_puts(m, ":vals=");
32183
32184 for_each_hist_val_field(i, hist_data) {
32185+ field = hist_data->fields[i];
32186+ if (field->flags & HIST_FIELD_FL_VAR) {
32187+ have_var = true;
32188+ continue;
32189+ }
32190+
32191 if (i == HITCOUNT_IDX)
32192 seq_puts(m, "hitcount");
32193 else {
32194 seq_puts(m, ",");
32195- hist_field_print(m, hist_data->fields[i]);
32196+ hist_field_print(m, field);
32197+ }
32198+ }
32199+
32200+ if (have_var) {
32201+ unsigned int n = 0;
32202+
32203+ seq_puts(m, ":");
32204+
32205+ for_each_hist_val_field(i, hist_data) {
32206+ field = hist_data->fields[i];
32207+
32208+ if (field->flags & HIST_FIELD_FL_VAR) {
32209+ if (n++)
32210+ seq_puts(m, ",");
32211+ hist_field_print(m, field);
32212+ }
32213 }
32214 }
32215
32216@@ -1193,28 +4994,36 @@
32217
32218 for (i = 0; i < hist_data->n_sort_keys; i++) {
32219 struct tracing_map_sort_key *sort_key;
32220+ unsigned int idx, first_key_idx;
32221+
32222+ /* skip VAR vals */
32223+ first_key_idx = hist_data->n_vals - hist_data->n_vars;
32224
32225 sort_key = &hist_data->sort_keys[i];
32226+ idx = sort_key->field_idx;
32227+
32228+ if (WARN_ON(idx >= HIST_FIELDS_MAX))
32229+ return -EINVAL;
32230
32231 if (i > 0)
32232 seq_puts(m, ",");
32233
32234- if (sort_key->field_idx == HITCOUNT_IDX)
32235+ if (idx == HITCOUNT_IDX)
32236 seq_puts(m, "hitcount");
32237 else {
32238- unsigned int idx = sort_key->field_idx;
32239-
32240- if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
32241- return -EINVAL;
32242-
32243+ if (idx >= first_key_idx)
32244+ idx += hist_data->n_vars;
32245 hist_field_print(m, hist_data->fields[idx]);
32246 }
32247
32248 if (sort_key->descending)
32249 seq_puts(m, ".descending");
32250 }
32251-
32252 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
32253+ if (hist_data->enable_timestamps)
32254+ seq_printf(m, ":clock=%s", hist_data->attrs->clock);
32255+
32256+ print_actions_spec(m, hist_data);
32257
32258 if (data->filter_str)
32259 seq_printf(m, " if %s", data->filter_str);
32260@@ -1242,6 +5051,21 @@
32261 return 0;
32262 }
32263
32264+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
32265+{
32266+ struct trace_event_file *file;
32267+ unsigned int i;
32268+ char *cmd;
32269+ int ret;
32270+
32271+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
32272+ file = hist_data->field_var_hists[i]->hist_data->event_file;
32273+ cmd = hist_data->field_var_hists[i]->cmd;
32274+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
32275+ "!hist", "hist", cmd);
32276+ }
32277+}
32278+
32279 static void event_hist_trigger_free(struct event_trigger_ops *ops,
32280 struct event_trigger_data *data)
32281 {
32282@@ -1254,7 +5078,13 @@
32283 if (!data->ref) {
32284 if (data->name)
32285 del_named_trigger(data);
32286+
32287 		trigger_data_free(data);
32288+
32289+		remove_hist_vars(hist_data);
32290+
32291+		unregister_field_var_hists(hist_data);
32292+
32293 destroy_hist_data(hist_data);
32294 }
32295 }
32296@@ -1381,6 +5211,15 @@
32297 return false;
32298 if (key_field->offset != key_field_test->offset)
32299 return false;
32300+ if (key_field->size != key_field_test->size)
32301+ return false;
32302+ if (key_field->is_signed != key_field_test->is_signed)
32303+ return false;
32304+ if (!!key_field->var.name != !!key_field_test->var.name)
32305+ return false;
32306+ if (key_field->var.name &&
32307+ strcmp(key_field->var.name, key_field_test->var.name) != 0)
32308+ return false;
32309 }
32310
32311 for (i = 0; i < hist_data->n_sort_keys; i++) {
32312@@ -1396,6 +5235,9 @@
32313 (strcmp(data->filter_str, data_test->filter_str) != 0))
32314 return false;
32315
32316+ if (!actions_match(hist_data, hist_data_test))
32317+ return false;
32318+
32319 return true;
32320 }
32321
32322@@ -1412,6 +5254,7 @@
32323 if (named_data) {
32324 if (!hist_trigger_match(data, named_data, named_data,
32325 true)) {
32326+ hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
32327 ret = -EINVAL;
32328 goto out;
32329 }
32330@@ -1431,13 +5274,16 @@
32331 test->paused = false;
32332 else if (hist_data->attrs->clear)
32333 hist_clear(test);
32334- else
32335+ else {
32336+ hist_err("Hist trigger already exists", NULL);
32337 ret = -EEXIST;
32338+ }
32339 goto out;
32340 }
32341 }
32342 new:
32343 if (hist_data->attrs->cont || hist_data->attrs->clear) {
32344+ hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
32345 ret = -ENOENT;
32346 goto out;
32347 }
32348@@ -1446,7 +5292,6 @@
32349 data->paused = true;
32350
32351 if (named_data) {
32352- destroy_hist_data(data->private_data);
32353 data->private_data = named_data->private_data;
32354 set_named_trigger_data(data, named_data);
32355 data->ops = &event_hist_trigger_named_ops;
32356@@ -1458,8 +5303,32 @@
32357 goto out;
32358 }
32359
32360- list_add_rcu(&data->list, &file->triggers);
32361+ if (hist_data->enable_timestamps) {
32362+ char *clock = hist_data->attrs->clock;
32363+
32364+ ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
32365+ if (ret) {
32366+ hist_err("Couldn't set trace_clock: ", clock);
32367+ goto out;
32368+ }
32369+
32370+		tracing_set_time_stamp_abs(file->tr, true);
32371+ }
32372+
32373+ if (named_data)
32374+ destroy_hist_data(hist_data);
32375+
32376 ret++;
32377+ out:
32378+ return ret;
32379+}
32380+
32381+static int hist_trigger_enable(struct event_trigger_data *data,
32382+ struct trace_event_file *file)
32383+{
32384+	int ret = 0;
32385+
32386+ list_add_tail_rcu(&data->list, &file->triggers);
32387
32388 update_cond_flag(file);
32389
32390@@ -1468,10 +5337,55 @@
32391 update_cond_flag(file);
32392 ret--;
32393 }
32394- out:
32395+
32396 return ret;
32397 }
32398
32399+static bool have_hist_trigger_match(struct event_trigger_data *data,
32400+ struct trace_event_file *file)
32401+{
32402+ struct hist_trigger_data *hist_data = data->private_data;
32403+ struct event_trigger_data *test, *named_data = NULL;
32404+ bool match = false;
32405+
32406+ if (hist_data->attrs->name)
32407+ named_data = find_named_trigger(hist_data->attrs->name);
32408+
32409+ list_for_each_entry_rcu(test, &file->triggers, list) {
32410+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32411+ if (hist_trigger_match(data, test, named_data, false)) {
32412+ match = true;
32413+ break;
32414+			}
32415+ }
32416+ }
32417+
32418+ return match;
32419+}
32420+
32421+static bool hist_trigger_check_refs(struct event_trigger_data *data,
32422+ struct trace_event_file *file)
32423+{
32424+ struct hist_trigger_data *hist_data = data->private_data;
32425+ struct event_trigger_data *test, *named_data = NULL;
32426+
32427+ if (hist_data->attrs->name)
32428+ named_data = find_named_trigger(hist_data->attrs->name);
32429+
32430+ list_for_each_entry_rcu(test, &file->triggers, list) {
32431+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32432+ if (!hist_trigger_match(data, test, named_data, false))
32433+ continue;
32434+ hist_data = test->private_data;
32435+ if (check_var_refs(hist_data))
32436+ return true;
32437+ break;
32438+		}
32439+	}
32440+
32441+ return false;
32442+}
32443+
32444 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
32445 struct event_trigger_data *data,
32446 struct trace_event_file *file)
32447@@ -1497,17 +5411,55 @@
32448
32449 if (unregistered && test->ops->free)
32450 test->ops->free(test->ops, test);
32451+
32452+ if (hist_data->enable_timestamps) {
32453+ if (!hist_data->remove || unregistered)
32454+ tracing_set_time_stamp_abs(file->tr, false);
32455+ }
32456+}
32457+
32458+static bool hist_file_check_refs(struct trace_event_file *file)
32459+{
32460+ struct hist_trigger_data *hist_data;
32461+ struct event_trigger_data *test;
32462+
32463+ list_for_each_entry_rcu(test, &file->triggers, list) {
32464+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32465+ hist_data = test->private_data;
32466+ if (check_var_refs(hist_data))
32467+ return true;
32468+		}
32469+	}
32470+
32471+ return false;
32472 }
32473
32474 static void hist_unreg_all(struct trace_event_file *file)
32475 {
32476 struct event_trigger_data *test, *n;
32477+ struct hist_trigger_data *hist_data;
32478+ struct synth_event *se;
32479+ const char *se_name;
32480+
32481+ if (hist_file_check_refs(file))
32482+ return;
32483
32484 list_for_each_entry_safe(test, n, &file->triggers, list) {
32485 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32486+ hist_data = test->private_data;
32487 list_del_rcu(&test->list);
32488 trace_event_trigger_enable_disable(file, 0);
32489+
32490+ mutex_lock(&synth_event_mutex);
32491+ se_name = trace_event_name(file->event_call);
32492+ se = find_synth_event(se_name);
32493+ if (se)
32494+ se->ref--;
32495+ mutex_unlock(&synth_event_mutex);
32496+
32497 update_cond_flag(file);
32498+ if (hist_data->enable_timestamps)
32499+ tracing_set_time_stamp_abs(file->tr, false);
32500 if (test->ops->free)
32501 test->ops->free(test->ops, test);
32502 }
32503@@ -1523,16 +5475,54 @@
32504 struct hist_trigger_attrs *attrs;
32505 struct event_trigger_ops *trigger_ops;
32506 struct hist_trigger_data *hist_data;
32507- char *trigger;
32508+ struct synth_event *se;
32509+ const char *se_name;
32510+ bool remove = false;
32511+ char *trigger, *p;
32512 int ret = 0;
32513
32514+ if (glob && strlen(glob)) {
32515+ last_cmd_set(param);
32516+ hist_err_clear();
32517+ }
32518+
32519 if (!param)
32520 return -EINVAL;
32521
32522- /* separate the trigger from the filter (k:v [if filter]) */
32523- trigger = strsep(&param, " \t");
32524- if (!trigger)
32525- return -EINVAL;
32526+ if (glob[0] == '!')
32527+ remove = true;
32528+
32529+ /*
32530+ * separate the trigger from the filter (k:v [if filter])
32531+ * allowing for whitespace in the trigger
32532+ */
32533+ p = trigger = param;
32534+ do {
32535+ p = strstr(p, "if");
32536+ if (!p)
32537+ break;
32538+ if (p == param)
32539+ return -EINVAL;
32540+ if (*(p - 1) != ' ' && *(p - 1) != '\t') {
32541+ p++;
32542+ continue;
32543+		}
32544+ if (p >= param + strlen(param) - strlen("if") - 1)
32545+ return -EINVAL;
32546+ if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
32547+ p++;
32548+ continue;
32549+ }
32550+ break;
32551+ } while (p);
32552+
32553+ if (!p)
32554+ param = NULL;
32555+ else {
32556+ *(p - 1) = '\0';
32557+ param = strstrip(p);
32558+ trigger = strstrip(trigger);
32559+	}
32560
32561 attrs = parse_hist_trigger_attrs(trigger);
32562 if (IS_ERR(attrs))
32563@@ -1541,7 +5531,7 @@
32564 if (attrs->map_bits)
32565 hist_trigger_bits = attrs->map_bits;
32566
32567- hist_data = create_hist_data(hist_trigger_bits, attrs, file);
32568+ hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
32569 if (IS_ERR(hist_data)) {
32570 destroy_hist_trigger_attrs(attrs);
32571 return PTR_ERR(hist_data);
32572@@ -1549,10 +5539,11 @@
32573
32574 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
32575
32576- ret = -ENOMEM;
32577 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
32578- if (!trigger_data)
32579+ if (!trigger_data) {
32580+ ret = -ENOMEM;
32581 goto out_free;
32582+ }
32583
32584 trigger_data->count = -1;
32585 trigger_data->ops = trigger_ops;
32586@@ -1570,8 +5561,24 @@
32587 goto out_free;
32588 }
32589
32590- if (glob[0] == '!') {
32591+ if (remove) {
32592+ if (!have_hist_trigger_match(trigger_data, file))
32593+ goto out_free;
32594+
32595+ if (hist_trigger_check_refs(trigger_data, file)) {
32596+ ret = -EBUSY;
32597+ goto out_free;
32598+ }
32599+
32600 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32601+
32602+ mutex_lock(&synth_event_mutex);
32603+ se_name = trace_event_name(file->event_call);
32604+ se = find_synth_event(se_name);
32605+ if (se)
32606+ se->ref--;
32607+ mutex_unlock(&synth_event_mutex);
32608+
32609 ret = 0;
32610 goto out_free;
32611 }
32612@@ -1588,14 +5595,47 @@
32613 goto out_free;
32614 } else if (ret < 0)
32615 goto out_free;
32616+
32617+ if (get_named_trigger_data(trigger_data))
32618+ goto enable;
32619+
32620+ if (has_hist_vars(hist_data))
32621+ save_hist_vars(hist_data);
32622+
32623+ ret = create_actions(hist_data, file);
32624+ if (ret)
32625+ goto out_unreg;
32626+
32627+ ret = tracing_map_init(hist_data->map);
32628+ if (ret)
32629+ goto out_unreg;
32630+enable:
32631+ ret = hist_trigger_enable(trigger_data, file);
32632+ if (ret)
32633+ goto out_unreg;
32634+
32635+ mutex_lock(&synth_event_mutex);
32636+ se_name = trace_event_name(file->event_call);
32637+ se = find_synth_event(se_name);
32638+ if (se)
32639+ se->ref++;
32640+ mutex_unlock(&synth_event_mutex);
32641+
32642 /* Just return zero, not the number of registered triggers */
32643 ret = 0;
32644 out:
32645+ if (ret == 0)
32646+ hist_err_clear();
32647+
32648 return ret;
32649+ out_unreg:
32650+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32651 out_free:
32652 if (cmd_ops->set_filter)
32653 cmd_ops->set_filter(NULL, trigger_data, NULL);
32654
32655+ remove_hist_vars(hist_data);
32656+
32657 kfree(trigger_data);
32658
32659 destroy_hist_data(hist_data);
32660@@ -1625,7 +5665,8 @@
32661 }
32662
32663 static void
32664-hist_enable_trigger(struct event_trigger_data *data, void *rec)
32665+hist_enable_trigger(struct event_trigger_data *data, void *rec,
32666+ struct ring_buffer_event *event)
32667 {
32668 struct enable_trigger_data *enable_data = data->private_data;
32669 struct event_trigger_data *test;
32670@@ -1641,7 +5682,8 @@
32671 }
32672
32673 static void
32674-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
32675+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
32676+ struct ring_buffer_event *event)
32677 {
32678 if (!data->count)
32679 return;
32680@@ -1649,7 +5691,7 @@
32681 if (data->count != -1)
32682 (data->count)--;
32683
32684- hist_enable_trigger(data, rec);
32685+ hist_enable_trigger(data, rec, event);
32686 }
32687
32688 static struct event_trigger_ops hist_enable_trigger_ops = {
32689@@ -1754,3 +5796,31 @@
32690
32691 return ret;
32692 }
32693+
32694+static __init int trace_events_hist_init(void)
32695+{
32696+ struct dentry *entry = NULL;
32697+ struct dentry *d_tracer;
32698+ int err = 0;
32699+
32700+ d_tracer = tracing_init_dentry();
32701+ if (IS_ERR(d_tracer)) {
32702+ err = PTR_ERR(d_tracer);
32703+ goto err;
32704+ }
32705+
32706+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
32707+ NULL, &synth_events_fops);
32708+ if (!entry) {
32709+ err = -ENODEV;
32710+ goto err;
32711+ }
32712+
32713+ return err;
32714+ err:
32715+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
32716+
32717+ return err;
32718+}
32719+
32720+fs_initcall(trace_events_hist_init);
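As a quick illustration of the strsep()-based tokenizing that the parse_var_defs() hunk earlier in this file's diff applies to each "name=expr[,name=expr...]" assignment, here is a minimal userspace sketch; the sample assignment string and the VARS_MAX limit are illustrative stand-ins, not values taken from the patch.

/*
 * Userspace sketch of the splitting done by parse_var_defs(): break the
 * assignment string on ',' to get each "name=expr" chunk, then on '=' to
 * separate the variable name from its expression.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define VARS_MAX 16	/* stand-in for TRACING_MAP_VARS_MAX */

int main(void)
{
	/* illustrative assignment string, not taken from the patch */
	char buf[] = "wakeup_lat=common_timestamp.usecs-$ts0,cpu=next_cpu";
	char *str = buf, *field_str, *var_name;
	unsigned int n_vars = 0;

	while (n_vars < VARS_MAX) {
		field_str = strsep(&str, ",");	/* next "name=expr" chunk */
		if (!field_str)
			break;

		var_name = strsep(&field_str, "=");	/* name / expr split */
		if (!var_name || !field_str) {
			fprintf(stderr, "malformed assignment\n");
			return EXIT_FAILURE;
		}

		printf("var %u: name=%s expr=%s\n", n_vars++, var_name, field_str);
	}
	return EXIT_SUCCESS;
}

The kernel code does the same splitting per assignment_str[] entry, kstrdup()s each name and expression into attrs->var_defs, and reports malformed or excess input through hist_err() before bailing out.
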
32721diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_trigger.c linux-4.14/kernel/trace/trace_events_trigger.c
32722--- linux-4.14.orig/kernel/trace/trace_events_trigger.c 2018-09-05 11:03:22.000000000 +0200
32723+++ linux-4.14/kernel/trace/trace_events_trigger.c 2018-09-05 11:05:07.000000000 +0200
32724@@ -63,7 +63,8 @@
32725 * any trigger that should be deferred, ETT_NONE if nothing to defer.
32726 */
32727 enum event_trigger_type
32728-event_triggers_call(struct trace_event_file *file, void *rec)
32729+event_triggers_call(struct trace_event_file *file, void *rec,
32730+ struct ring_buffer_event *event)
32731 {
32732 struct event_trigger_data *data;
32733 enum event_trigger_type tt = ETT_NONE;
32734@@ -76,7 +77,7 @@
32735 if (data->paused)
32736 continue;
32737 if (!rec) {
32738- data->ops->func(data, rec);
32739+ data->ops->func(data, rec, event);
32740 continue;
32741 }
32742 filter = rcu_dereference_sched(data->filter);
32743@@ -86,7 +87,7 @@
32744 tt |= data->cmd_ops->trigger_type;
32745 continue;
32746 }
32747- data->ops->func(data, rec);
32748+ data->ops->func(data, rec, event);
32749 }
32750 return tt;
32751 }
32752@@ -108,7 +109,7 @@
32753 void
32754 event_triggers_post_call(struct trace_event_file *file,
32755 enum event_trigger_type tt,
32756- void *rec)
32757+ void *rec, struct ring_buffer_event *event)
32758 {
32759 struct event_trigger_data *data;
32760
32761@@ -116,7 +117,7 @@
32762 if (data->paused)
32763 continue;
32764 if (data->cmd_ops->trigger_type & tt)
32765- data->ops->func(data, rec);
32766+ data->ops->func(data, rec, event);
32767 }
32768 }
32769 EXPORT_SYMBOL_GPL(event_triggers_post_call);
32770@@ -914,8 +915,15 @@
32771 data->named_data = named_data;
32772 }
32773
32774+struct event_trigger_data *
32775+get_named_trigger_data(struct event_trigger_data *data)
32776+{
32777+	return data->named_data;
32778+}
32779+
32780 static void
32781-traceon_trigger(struct event_trigger_data *data, void *rec)
32782+traceon_trigger(struct event_trigger_data *data, void *rec,
32783+ struct ring_buffer_event *event)
32784 {
32785 if (tracing_is_on())
32786 return;
32787@@ -924,7 +932,8 @@
32788 }
32789
32790 static void
32791-traceon_count_trigger(struct event_trigger_data *data, void *rec)
32792+traceon_count_trigger(struct event_trigger_data *data, void *rec,
32793+ struct ring_buffer_event *event)
32794 {
32795 if (tracing_is_on())
32796 return;
32797@@ -939,7 +948,8 @@
32798 }
32799
32800 static void
32801-traceoff_trigger(struct event_trigger_data *data, void *rec)
32802+traceoff_trigger(struct event_trigger_data *data, void *rec,
32803+ struct ring_buffer_event *event)
32804 {
32805 if (!tracing_is_on())
32806 return;
32807@@ -948,7 +958,8 @@
32808 }
32809
32810 static void
32811-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
32812+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
32813+ struct ring_buffer_event *event)
32814 {
32815 if (!tracing_is_on())
32816 return;
32817@@ -1045,7 +1056,8 @@
32818
32819 #ifdef CONFIG_TRACER_SNAPSHOT
32820 static void
32821-snapshot_trigger(struct event_trigger_data *data, void *rec)
32822+snapshot_trigger(struct event_trigger_data *data, void *rec,
32823+ struct ring_buffer_event *event)
32824 {
32825 struct trace_event_file *file = data->private_data;
32826
32827@@ -1056,7 +1068,8 @@
32828 }
32829
32830 static void
32831-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
32832+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
32833+ struct ring_buffer_event *event)
32834 {
32835 if (!data->count)
32836 return;
32837@@ -1064,7 +1077,7 @@
32838 if (data->count != -1)
32839 (data->count)--;
32840
32841- snapshot_trigger(data, rec);
32842+ snapshot_trigger(data, rec, event);
32843 }
32844
32845 static int
32846@@ -1143,13 +1156,15 @@
32847 #define STACK_SKIP 3
32848
32849 static void
32850-stacktrace_trigger(struct event_trigger_data *data, void *rec)
32851+stacktrace_trigger(struct event_trigger_data *data, void *rec,
32852+ struct ring_buffer_event *event)
32853 {
32854 	trace_dump_stack(STACK_SKIP);
32855 }
32856
32857 static void
32858-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
32859+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
32860+ struct ring_buffer_event *event)
32861 {
32862 if (!data->count)
32863 return;
32864@@ -1157,7 +1172,7 @@
32865 if (data->count != -1)
32866 (data->count)--;
32867
32868- stacktrace_trigger(data, rec);
32869+ stacktrace_trigger(data, rec, event);
32870 }
32871
32872 static int
32873@@ -1219,7 +1234,8 @@
32874 }
32875
32876 static void
32877-event_enable_trigger(struct event_trigger_data *data, void *rec)
32878+event_enable_trigger(struct event_trigger_data *data, void *rec,
32879+ struct ring_buffer_event *event)
32880 {
32881 struct enable_trigger_data *enable_data = data->private_data;
32882
32883@@ -1230,7 +1246,8 @@
32884 }
32885
32886 static void
32887-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
32888+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
32889+ struct ring_buffer_event *event)
32890 {
32891 struct enable_trigger_data *enable_data = data->private_data;
32892
32893@@ -1244,7 +1261,7 @@
32894 if (data->count != -1)
32895 (data->count)--;
32896
32897- event_enable_trigger(data, rec);
32898+ event_enable_trigger(data, rec, event);
32899 }
32900
32901 int event_enable_trigger_print(struct seq_file *m,
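The trace_events_trigger.c hunks above widen every trigger callback from func(data, rec) to func(data, rec, event) so that event_triggers_call() and event_triggers_post_call() can hand the ring-buffer event through to the hist trigger. The standalone sketch below models only that calling convention; the struct definitions are simplified stand-ins, not the kernel types.

/*
 * Stand-alone model of the widened trigger callback signature: every
 * trigger op now receives the ring-buffer event alongside the record.
 * All types here are simplified stand-ins for the kernel ones.
 */
#include <stdio.h>
#include <stddef.h>

struct ring_buffer_event { unsigned long ts; };	/* stand-in */
struct trigger_data;

typedef void (*trigger_fn)(struct trigger_data *data, void *rec,
			   struct ring_buffer_event *event);

struct trigger_data {
	const char *name;
	trigger_fn func;
};

static void traceon_trigger(struct trigger_data *data, void *rec,
			    struct ring_buffer_event *event)
{
	printf("%s: rec=%p event ts=%lu\n", data->name, rec, event->ts);
}

/* loops over registered triggers, like event_triggers_call() */
static void triggers_call(struct trigger_data *triggers, size_t n,
			  void *rec, struct ring_buffer_event *event)
{
	for (size_t i = 0; i < n; i++)
		triggers[i].func(&triggers[i], rec, event);
}

int main(void)
{
	struct trigger_data triggers[] = {
		{ "traceon", traceon_trigger },
	};
	struct ring_buffer_event ev = { .ts = 42 };
	int record = 0;

	triggers_call(triggers, 1, &record, &ev);
	return 0;
}
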
32902diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.h linux-4.14/kernel/trace/trace.h
32903--- linux-4.14.orig/kernel/trace/trace.h 2018-09-05 11:03:22.000000000 +0200
32904+++ linux-4.14/kernel/trace/trace.h 2018-09-05 11:05:07.000000000 +0200
32905@@ -127,6 +127,7 @@
32906 * NEED_RESCHED - reschedule is requested
32907 * HARDIRQ - inside an interrupt handler
32908 * SOFTIRQ - inside a softirq handler
32909+ * NEED_RESCHED_LAZY - lazy reschedule is requested
32910 */
32911 enum trace_flag_type {
32912 TRACE_FLAG_IRQS_OFF = 0x01,
32913@@ -136,6 +137,7 @@
32914 TRACE_FLAG_SOFTIRQ = 0x10,
32915 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
32916 TRACE_FLAG_NMI = 0x40,
32917+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
32918 };
32919
32920 #define TRACE_BUF_SIZE 1024
32921@@ -273,6 +275,8 @@
32922 /* function tracing enabled */
32923 int function_enabled;
32924 #endif
32925+ int time_stamp_abs_ref;
32926+ struct list_head hist_vars;
32927 };
32928
32929 enum {
32930@@ -286,6 +290,11 @@
32931 extern int trace_array_get(struct trace_array *tr);
32932 extern void trace_array_put(struct trace_array *tr);
32933
32934+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
32935+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
32936+
32937+extern bool trace_clock_in_ns(struct trace_array *tr);
32938+
32939 /*
32940 * The global tracer (top) should be the first trace array added,
32941 * but we check the flag anyway.
32942@@ -1293,7 +1302,7 @@
32943 unsigned long eflags = file->flags;
32944
32945 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
32946- *tt = event_triggers_call(file, entry);
32947+ *tt = event_triggers_call(file, entry, event);
32948
32949 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
32950 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
32951@@ -1330,7 +1339,7 @@
32952 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
32953
32954 if (tt)
32955- event_triggers_post_call(file, tt, entry);
32956+ event_triggers_post_call(file, tt, entry, event);
32957 }
32958
32959 /**
32960@@ -1363,7 +1372,7 @@
32961 irq_flags, pc, regs);
32962
32963 if (tt)
32964- event_triggers_post_call(file, tt, entry);
32965+ event_triggers_post_call(file, tt, entry, event);
32966 }
32967
32968 #define FILTER_PRED_INVALID ((unsigned short)-1)
32969@@ -1545,6 +1554,8 @@
32970 extern void unpause_named_trigger(struct event_trigger_data *data);
32971 extern void set_named_trigger_data(struct event_trigger_data *data,
32972 struct event_trigger_data *named_data);
32973+extern struct event_trigger_data *
32974+get_named_trigger_data(struct event_trigger_data *data);
32975 extern int register_event_command(struct event_command *cmd);
32976 extern int unregister_event_command(struct event_command *cmd);
32977 extern int register_trigger_hist_enable_disable_cmds(void);
32978@@ -1588,7 +1599,8 @@
32979 */
32980 struct event_trigger_ops {
32981 void (*func)(struct event_trigger_data *data,
32982- void *rec);
32983+ void *rec,
32984+ struct ring_buffer_event *rbe);
32985 int (*init)(struct event_trigger_ops *ops,
32986 struct event_trigger_data *data);
32987 void (*free)(struct event_trigger_ops *ops,
32988@@ -1755,6 +1767,13 @@
32989 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
32990 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
32991
32992+#define MAX_EVENT_NAME_LEN 64
32993+
32994+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
32995+extern ssize_t trace_parse_run_command(struct file *file,
32996+ const char __user *buffer, size_t count, loff_t *ppos,
32997+ int (*createfn)(int, char**));
32998+
32999 /*
33000 * Normal trace_printk() and friends allocates special buffers
33001 * to do the manipulation, as well as saves the print formats
33002diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_hwlat.c linux-4.14/kernel/trace/trace_hwlat.c
33003--- linux-4.14.orig/kernel/trace/trace_hwlat.c 2017-11-12 19:46:13.000000000 +0100
33004+++ linux-4.14/kernel/trace/trace_hwlat.c 2018-09-05 11:05:07.000000000 +0200
33005@@ -279,7 +279,7 @@
33006 * of this thread, than stop migrating for the duration
33007 * of the current test.
33008 */
33009- if (!cpumask_equal(current_mask, &current->cpus_allowed))
33010+ if (!cpumask_equal(current_mask, current->cpus_ptr))
33011 goto disable;
33012
33013 get_online_cpus();
33014diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_kprobe.c linux-4.14/kernel/trace/trace_kprobe.c
33015--- linux-4.14.orig/kernel/trace/trace_kprobe.c 2018-09-05 11:03:22.000000000 +0200
33016+++ linux-4.14/kernel/trace/trace_kprobe.c 2018-09-05 11:05:07.000000000 +0200
33017@@ -918,8 +918,8 @@
33018 static ssize_t probes_write(struct file *file, const char __user *buffer,
33019 size_t count, loff_t *ppos)
33020 {
33021- return traceprobe_probes_write(file, buffer, count, ppos,
33022- create_trace_kprobe);
33023+ return trace_parse_run_command(file, buffer, count, ppos,
33024+ create_trace_kprobe);
33025 }
33026
33027 static const struct file_operations kprobe_events_ops = {
33028@@ -1444,9 +1444,9 @@
33029
33030 pr_info("Testing kprobe tracing: ");
33031
33032- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33033- "$stack $stack0 +0($stack)",
33034- create_trace_kprobe);
33035+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33036+ "$stack $stack0 +0($stack)",
33037+ create_trace_kprobe);
33038 if (WARN_ON_ONCE(ret)) {
33039 pr_warn("error on probing function entry.\n");
33040 warn++;
33041@@ -1466,8 +1466,8 @@
33042 }
33043 }
33044
33045- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33046- "$retval", create_trace_kprobe);
33047+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33048+ "$retval", create_trace_kprobe);
33049 if (WARN_ON_ONCE(ret)) {
33050 pr_warn("error on probing function return.\n");
33051 warn++;
33052@@ -1537,13 +1537,13 @@
33053 disable_trace_kprobe(tk, file);
33054 }
33055
33056- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
33057+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
33058 if (WARN_ON_ONCE(ret)) {
33059 pr_warn("error on deleting a probe.\n");
33060 warn++;
33061 }
33062
33063- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
33064+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
33065 if (WARN_ON_ONCE(ret)) {
33066 pr_warn("error on deleting a probe.\n");
33067 warn++;
33068diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_output.c linux-4.14/kernel/trace/trace_output.c
33069--- linux-4.14.orig/kernel/trace/trace_output.c 2018-09-05 11:03:22.000000000 +0200
33070+++ linux-4.14/kernel/trace/trace_output.c 2018-09-05 11:05:07.000000000 +0200
33071@@ -447,6 +447,7 @@
33072 {
33073 char hardsoft_irq;
33074 char need_resched;
33075+ char need_resched_lazy;
33076 char irqs_off;
33077 int hardirq;
33078 int softirq;
33079@@ -477,6 +478,9 @@
33080 break;
33081 }
33082
33083+ need_resched_lazy =
33084+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
33085+
33086 hardsoft_irq =
33087 (nmi && hardirq) ? 'Z' :
33088 nmi ? 'z' :
33089@@ -485,14 +489,25 @@
33090 softirq ? 's' :
33091 '.' ;
33092
33093- trace_seq_printf(s, "%c%c%c",
33094- irqs_off, need_resched, hardsoft_irq);
33095+ trace_seq_printf(s, "%c%c%c%c",
33096+ irqs_off, need_resched, need_resched_lazy,
33097+ hardsoft_irq);
33098
33099 if (entry->preempt_count)
33100 trace_seq_printf(s, "%x", entry->preempt_count);
33101 else
33102 trace_seq_putc(s, '.');
33103
33104+ if (entry->preempt_lazy_count)
33105+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
33106+ else
33107+ trace_seq_putc(s, '.');
33108+
33109+ if (entry->migrate_disable)
33110+ trace_seq_printf(s, "%x", entry->migrate_disable);
33111+ else
33112+ trace_seq_putc(s, '.');
33113+
33114 return !trace_seq_has_overflowed(s);
33115 }
33116
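After the trace_output.c hunk above, the per-event latency field carries four flag characters (irqs-off, need-resched, lazy need-resched, hard/soft-irq) followed by the preempt count, the lazy preempt count and the migrate-disable count, each printed as a hex digit or '.' when zero. A small userspace rendering of that layout follows; the sample flag values are arbitrary, not taken from a real trace.

/*
 * Userspace rendering of the extended latency-format field: four flag
 * characters, then three counters printed as hex or '.' when zero.
 */
#include <stdio.h>

static void print_count(unsigned int c)
{
	if (c)
		printf("%x", c);
	else
		putchar('.');
}

int main(void)
{
	char irqs_off = 'd';		/* arbitrary sample values */
	char need_resched = 'N';
	char need_resched_lazy = 'L';	/* new column from this patch */
	char hardsoft_irq = 'h';
	unsigned int preempt_count = 1;
	unsigned int preempt_lazy_count = 0;
	unsigned int migrate_disable = 2;

	printf("%c%c%c%c", irqs_off, need_resched, need_resched_lazy,
	       hardsoft_irq);
	print_count(preempt_count);
	print_count(preempt_lazy_count);
	print_count(migrate_disable);
	putchar('\n');
	return 0;
}
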
33117diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.c linux-4.14/kernel/trace/trace_probe.c
33118--- linux-4.14.orig/kernel/trace/trace_probe.c 2018-09-05 11:03:22.000000000 +0200
33119+++ linux-4.14/kernel/trace/trace_probe.c 2018-09-05 11:05:07.000000000 +0200
33120@@ -621,92 +621,6 @@
33121 kfree(arg->comm);
33122 }
33123
33124-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
33125-{
33126- char **argv;
33127- int argc, ret;
33128-
33129- argc = 0;
33130- ret = 0;
33131- argv = argv_split(GFP_KERNEL, buf, &argc);
33132- if (!argv)
33133- return -ENOMEM;
33134-
33135- if (argc)
33136- ret = createfn(argc, argv);
33137-
33138- argv_free(argv);
33139-
33140- return ret;
33141-}
33142-
33143-#define WRITE_BUFSIZE 4096
33144-
33145-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
33146- size_t count, loff_t *ppos,
33147- int (*createfn)(int, char **))
33148-{
33149- char *kbuf, *buf, *tmp;
33150- int ret = 0;
33151- size_t done = 0;
33152- size_t size;
33153-
33154- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
33155- if (!kbuf)
33156- return -ENOMEM;
33157-
33158- while (done < count) {
33159- size = count - done;
33160-
33161- if (size >= WRITE_BUFSIZE)
33162- size = WRITE_BUFSIZE - 1;
33163-
33164- if (copy_from_user(kbuf, buffer + done, size)) {
33165- ret = -EFAULT;
33166- goto out;
33167- }
33168- kbuf[size] = '\0';
33169- buf = kbuf;
33170- do {
33171- tmp = strchr(buf, '\n');
33172- if (tmp) {
33173- *tmp = '\0';
33174- size = tmp - buf + 1;
33175- } else {
33176- size = strlen(buf);
33177- if (done + size < count) {
33178- if (buf != kbuf)
33179- break;
33180- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
33181- pr_warn("Line length is too long: Should be less than %d\n",
33182- WRITE_BUFSIZE - 2);
33183- ret = -EINVAL;
33184- goto out;
33185- }
33186- }
33187- done += size;
33188-
33189- /* Remove comments */
33190- tmp = strchr(buf, '#');
33191-
33192- if (tmp)
33193- *tmp = '\0';
33194-
33195- ret = traceprobe_command(buf, createfn);
33196- if (ret)
33197- goto out;
33198- buf += size;
33199-
33200- } while (done < count);
33201- }
33202- ret = done;
33203-
33204-out:
33205- kfree(kbuf);
33206-
33207- return ret;
33208-}
33209-
33210 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
33211 bool is_return)
33212 {
33213diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.h linux-4.14/kernel/trace/trace_probe.h
33214--- linux-4.14.orig/kernel/trace/trace_probe.h 2018-09-05 11:03:22.000000000 +0200
33215+++ linux-4.14/kernel/trace/trace_probe.h 2018-09-05 11:05:07.000000000 +0200
33216@@ -42,7 +42,6 @@
33217
33218 #define MAX_TRACE_ARGS 128
33219 #define MAX_ARGSTR_LEN 63
33220-#define MAX_EVENT_NAME_LEN 64
33221 #define MAX_STRING_SIZE PATH_MAX
33222
33223 /* Reserved field names */
33224@@ -356,12 +355,6 @@
33225
33226 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
33227
33228-extern ssize_t traceprobe_probes_write(struct file *file,
33229- const char __user *buffer, size_t count, loff_t *ppos,
33230- int (*createfn)(int, char**));
33231-
33232-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
33233-
33234 /* Sum up total data length for dynamic arraies (strings) */
33235 static nokprobe_inline int
33236 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
33237diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_uprobe.c linux-4.14/kernel/trace/trace_uprobe.c
33238--- linux-4.14.orig/kernel/trace/trace_uprobe.c 2018-09-05 11:03:22.000000000 +0200
33239+++ linux-4.14/kernel/trace/trace_uprobe.c 2018-09-05 11:05:07.000000000 +0200
33240@@ -647,7 +647,7 @@
33241 static ssize_t probes_write(struct file *file, const char __user *buffer,
33242 size_t count, loff_t *ppos)
33243 {
33244- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
33245+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
33246 }
33247
33248 static const struct file_operations uprobe_events_ops = {
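The tracing_map.c hunks that follow introduce per-element variables with set, test, read and read-once semantics, documented in the tracing_map_set_var()/tracing_map_read_var_once() kernel-doc below. A minimal userspace model of those semantics is sketched here, using a plain struct in place of tracing_map_elt and ordinary fields instead of atomic64_t.

/*
 * Userspace model of the per-element variable API added below: a
 * variable slot can be set, tested, read, or read-and-cleared
 * ("read once").  The struct is a stand-in, not the kernel type.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VARS_MAX 16

struct map_elt {
	uint64_t vars[VARS_MAX];
	bool var_set[VARS_MAX];
};

static void map_set_var(struct map_elt *elt, unsigned int i, uint64_t n)
{
	elt->vars[i] = n;
	elt->var_set[i] = true;
}

static bool map_var_set(struct map_elt *elt, unsigned int i)
{
	return elt->var_set[i];
}

/* read and mark 'not set', mirroring tracing_map_read_var_once() */
static uint64_t map_read_var_once(struct map_elt *elt, unsigned int i)
{
	elt->var_set[i] = false;
	return elt->vars[i];
}

int main(void)
{
	struct map_elt elt = { 0 };

	map_set_var(&elt, 0, 1234);
	if (map_var_set(&elt, 0))
		printf("var0 = %llu\n",
		       (unsigned long long)map_read_var_once(&elt, 0));
	printf("var0 set after read-once: %d\n", map_var_set(&elt, 0));
	return 0;
}
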
33249diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.c linux-4.14/kernel/trace/tracing_map.c
33250--- linux-4.14.orig/kernel/trace/tracing_map.c 2017-11-12 19:46:13.000000000 +0100
33251+++ linux-4.14/kernel/trace/tracing_map.c 2018-09-05 11:05:07.000000000 +0200
33252@@ -66,6 +66,73 @@
33253 return (u64)atomic64_read(&elt->fields[i].sum);
33254 }
33255
33256+/**
33257+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
33258+ * @elt: The tracing_map_elt
33259+ * @i: The index of the given variable associated with the tracing_map_elt
33260+ * @n: The value to assign
33261+ *
33262+ * Assign n to variable i associated with the specified tracing_map_elt
33263+ * instance. The index i is the index returned by the call to
33264+ * tracing_map_add_var() when the tracing map was set up.
33265+ */
33266+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
33267+{
33268+ atomic64_set(&elt->vars[i], n);
33269+ elt->var_set[i] = true;
33270+}
33271+
33272+/**
33273+ * tracing_map_var_set - Return whether or not a variable has been set
33274+ * @elt: The tracing_map_elt
33275+ * @i: The index of the given variable associated with the tracing_map_elt
33276+ *
33277+ * Return true if the variable has been set, false otherwise. The
33278+ * index i is the index returned by the call to tracing_map_add_var()
33279+ * when the tracing map was set up.
33280+ */
33281+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
33282+{
33283+ return elt->var_set[i];
33284+}
33285+
33286+/**
33287+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
33288+ * @elt: The tracing_map_elt
33289+ * @i: The index of the given variable associated with the tracing_map_elt
33290+ *
33291+ * Retrieve the value of the variable i associated with the specified
33292+ * tracing_map_elt instance. The index i is the index returned by the
33293+ * call to tracing_map_add_var() when the tracing map was set
33294+ * up.
33295+ *
33296+ * Return: The variable value associated with field i for elt.
33297+ */
33298+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
33299+{
33300+ return (u64)atomic64_read(&elt->vars[i]);
33301+}
33302+
33303+/**
33304+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
33305+ * @elt: The tracing_map_elt
33306+ * @i: The index of the given variable associated with the tracing_map_elt
33307+ *
33308+ * Retrieve the value of the variable i associated with the specified
33309+ * tracing_map_elt instance, and reset the variable to the 'not set'
33310+ * state. The index i is the index returned by the call to
33311+ * tracing_map_add_var() when the tracing map was set up. The reset
33312+ * essentially makes the variable a read-once variable if it's only
33313+ * accessed using this function.
33314+ *
33315+ * Return: The variable value associated with field i for elt.
33316+ */
33317+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
33318+{
33319+ elt->var_set[i] = false;
33320+ return (u64)atomic64_read(&elt->vars[i]);
33321+}
33322+
33323 int tracing_map_cmp_string(void *val_a, void *val_b)
33324 {
33325 char *a = val_a;
33326@@ -171,6 +238,28 @@
33327 }
33328
33329 /**
33330+ * tracing_map_add_var - Add a field describing a tracing_map var
33331+ * @map: The tracing_map
33332+ *
33333+ * Add a var to the map and return the index identifying it in the map
33334+ * and associated tracing_map_elts. This is the index used for
33335+ * instance to update a var for a particular tracing_map_elt using
33336+ * tracing_map_update_var() or reading it via tracing_map_read_var().
33337+ *
33338+ * Return: The index identifying the var in the map and associated
33339+ * tracing_map_elts, or -EINVAL on error.
33340+ */
33341+int tracing_map_add_var(struct tracing_map *map)
33342+{
33343+ int ret = -EINVAL;
33344+
33345+ if (map->n_vars < TRACING_MAP_VARS_MAX)
33346+ ret = map->n_vars++;
33347+
33348+ return ret;
33349+}
33350+
33351+/**
33352 * tracing_map_add_key_field - Add a field describing a tracing_map key
33353 * @map: The tracing_map
33354 * @offset: The offset within the key
33355@@ -280,6 +369,11 @@
33356 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
33357 atomic64_set(&elt->fields[i].sum, 0);
33358
33359+ for (i = 0; i < elt->map->n_vars; i++) {
33360+ atomic64_set(&elt->vars[i], 0);
33361+ elt->var_set[i] = false;
33362+ }
33363+
33364 if (elt->map->ops && elt->map->ops->elt_clear)
33365 elt->map->ops->elt_clear(elt);
33366 }
33367@@ -306,6 +400,8 @@
33368 if (elt->map->ops && elt->map->ops->elt_free)
33369 elt->map->ops->elt_free(elt);
33370 kfree(elt->fields);
33371+ kfree(elt->vars);
33372+ kfree(elt->var_set);
33373 kfree(elt->key);
33374 kfree(elt);
33375 }
33376@@ -333,6 +429,18 @@
33377 goto free;
33378 }
33379
33380+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
33381+ if (!elt->vars) {
33382+ err = -ENOMEM;
33383+ goto free;
33384+ }
33385+
33386+ elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
33387+ if (!elt->var_set) {
33388+ err = -ENOMEM;
33389+ goto free;
33390+ }
33391+
33392 tracing_map_elt_init_fields(elt);
33393
33394 if (map->ops && map->ops->elt_alloc) {
33395@@ -414,7 +522,9 @@
33396 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
33397 {
33398 u32 idx, key_hash, test_key;
33399+ int dup_try = 0;
33400 struct tracing_map_entry *entry;
33401+ struct tracing_map_elt *val;
33402
33403 key_hash = jhash(key, map->key_size, 0);
33404 if (key_hash == 0)
33405@@ -426,10 +536,33 @@
33406 entry = TRACING_MAP_ENTRY(map->map, idx);
33407 test_key = entry->key;
33408
33409- if (test_key && test_key == key_hash && entry->val &&
33410- keys_match(key, entry->val->key, map->key_size)) {
33411- atomic64_inc(&map->hits);
33412- return entry->val;
33413+ if (test_key && test_key == key_hash) {
33414+ val = READ_ONCE(entry->val);
33415+ if (val &&
33416+ keys_match(key, val->key, map->key_size)) {
33417+ if (!lookup_only)
33418+ atomic64_inc(&map->hits);
33419+ return val;
33420+ } else if (unlikely(!val)) {
33421+ /*
33422+ * The key is present. But, val (pointer to elt
33423+ * struct) is still NULL. which means some other
33424+ * thread is in the process of inserting an
33425+ * element.
33426+ *
33427+ * On top of that, it's key_hash is same as the
33428+ * one being inserted right now. So, it's
33429+ * possible that the element has the same
33430+ * key as well.
33431+ */
33432+
33433+ dup_try++;
33434+ if (dup_try > map->map_size) {
33435+ atomic64_inc(&map->drops);
33436+ break;
33437+ }
33438+ continue;
33439+ }
33440 }
33441
33442 if (!test_key) {
33443@@ -451,6 +584,13 @@
33444 atomic64_inc(&map->hits);
33445
33446 return entry->val;
33447+ } else {
33448+ /*
33449+ * cmpxchg() failed. Loop around once
33450+ * more to check what key was inserted.
33451+ */
33452+ dup_try++;
33453+ continue;
33454 }
33455 }
33456
33457@@ -815,67 +955,15 @@
33458 return sort_entry;
33459 }
33460
33461-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
33462-{
33463- struct tracing_map_elt *dup_elt;
33464- unsigned int i;
33465-
33466- dup_elt = tracing_map_elt_alloc(elt->map);
33467- if (IS_ERR(dup_elt))
33468- return NULL;
33469-
33470- if (elt->map->ops && elt->map->ops->elt_copy)
33471- elt->map->ops->elt_copy(dup_elt, elt);
33472-
33473- dup_elt->private_data = elt->private_data;
33474- memcpy(dup_elt->key, elt->key, elt->map->key_size);
33475-
33476- for (i = 0; i < elt->map->n_fields; i++) {
33477- atomic64_set(&dup_elt->fields[i].sum,
33478- atomic64_read(&elt->fields[i].sum));
33479- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
33480- }
33481-
33482- return dup_elt;
33483-}
33484-
33485-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
33486- unsigned int target, unsigned int dup)
33487-{
33488- struct tracing_map_elt *target_elt, *elt;
33489- bool first_dup = (target - dup) == 1;
33490- int i;
33491-
33492- if (first_dup) {
33493- elt = sort_entries[target]->elt;
33494- target_elt = copy_elt(elt);
33495- if (!target_elt)
33496- return -ENOMEM;
33497- sort_entries[target]->elt = target_elt;
33498- sort_entries[target]->elt_copied = true;
33499- } else
33500- target_elt = sort_entries[target]->elt;
33501-
33502- elt = sort_entries[dup]->elt;
33503-
33504- for (i = 0; i < elt->map->n_fields; i++)
33505- atomic64_add(atomic64_read(&elt->fields[i].sum),
33506- &target_elt->fields[i].sum);
33507-
33508- sort_entries[dup]->dup = true;
33509-
33510- return 0;
33511-}
33512-
33513-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
33514+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
33515 int n_entries, unsigned int key_size)
33516 {
33517 unsigned int dups = 0, total_dups = 0;
33518- int err, i, j;
33519+ int i;
33520 void *key;
33521
33522 if (n_entries < 2)
33523- return total_dups;
33524+ return;
33525
33526 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
33527 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
33528@@ -884,30 +972,14 @@
33529 for (i = 1; i < n_entries; i++) {
33530 if (!memcmp(sort_entries[i]->key, key, key_size)) {
33531 dups++; total_dups++;
33532- err = merge_dup(sort_entries, i - dups, i);
33533- if (err)
33534- return err;
33535 continue;
33536 }
33537 key = sort_entries[i]->key;
33538 dups = 0;
33539 }
33540
33541- if (!total_dups)
33542- return total_dups;
33543-
33544- for (i = 0, j = 0; i < n_entries; i++) {
33545- if (!sort_entries[i]->dup) {
33546- sort_entries[j] = sort_entries[i];
33547- if (j++ != i)
33548- sort_entries[i] = NULL;
33549- } else {
33550- destroy_sort_entry(sort_entries[i]);
33551- sort_entries[i] = NULL;
33552- }
33553- }
33554-
33555- return total_dups;
33556+ WARN_ONCE(total_dups > 0,
33557+ "Duplicates detected: %d\n", total_dups);
33558 }
33559
33560 static bool is_key(struct tracing_map *map, unsigned int field_idx)
33561@@ -1033,10 +1105,7 @@
33562 return 1;
33563 }
33564
33565- ret = merge_dups(entries, n_entries, map->key_size);
33566- if (ret < 0)
33567- goto free;
33568- n_entries -= ret;
33569+ detect_dups(entries, n_entries, map->key_size);
33570
33571 if (is_key(map, sort_keys[0].field_idx))
33572 cmp_entries_fn = cmp_entries_key;
33573diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.h linux-4.14/kernel/trace/tracing_map.h
33574--- linux-4.14.orig/kernel/trace/tracing_map.h 2017-11-12 19:46:13.000000000 +0100
33575+++ linux-4.14/kernel/trace/tracing_map.h 2018-09-05 11:05:07.000000000 +0200
33576@@ -6,10 +6,11 @@
33577 #define TRACING_MAP_BITS_MAX 17
33578 #define TRACING_MAP_BITS_MIN 7
33579
33580-#define TRACING_MAP_KEYS_MAX 2
33581+#define TRACING_MAP_KEYS_MAX 3
33582 #define TRACING_MAP_VALS_MAX 3
33583 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
33584 TRACING_MAP_VALS_MAX)
33585+#define TRACING_MAP_VARS_MAX 16
33586 #define TRACING_MAP_SORT_KEYS_MAX 2
33587
33588 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
33589@@ -137,6 +138,8 @@
33590 struct tracing_map_elt {
33591 struct tracing_map *map;
33592 struct tracing_map_field *fields;
33593+ atomic64_t *vars;
33594+ bool *var_set;
33595 void *key;
33596 void *private_data;
33597 };
33598@@ -192,6 +195,7 @@
33599 int key_idx[TRACING_MAP_KEYS_MAX];
33600 unsigned int n_keys;
33601 struct tracing_map_sort_key sort_key;
33602+ unsigned int n_vars;
33603 atomic64_t hits;
33604 atomic64_t drops;
33605 };
33606@@ -215,11 +219,6 @@
33607 * Element allocation occurs before tracing begins, when the
33608 * tracing_map_init() call is made by client code.
33609 *
33610- * @elt_copy: At certain points in the lifetime of an element, it may
33611- * need to be copied. The copy should include a copy of the
33612- * client-allocated data, which can be copied into the 'to'
33613- * element from the 'from' element.
33614- *
33615 * @elt_free: When a tracing_map_elt is freed, this function is called
33616 * and allows client-allocated per-element data to be freed.
33617 *
33618@@ -233,8 +232,6 @@
33619 */
33620 struct tracing_map_ops {
33621 int (*elt_alloc)(struct tracing_map_elt *elt);
33622- void (*elt_copy)(struct tracing_map_elt *to,
33623- struct tracing_map_elt *from);
33624 void (*elt_free)(struct tracing_map_elt *elt);
33625 void (*elt_clear)(struct tracing_map_elt *elt);
33626 void (*elt_init)(struct tracing_map_elt *elt);
33627@@ -248,6 +245,7 @@
33628 extern int tracing_map_init(struct tracing_map *map);
33629
33630 extern int tracing_map_add_sum_field(struct tracing_map *map);
33631+extern int tracing_map_add_var(struct tracing_map *map);
33632 extern int tracing_map_add_key_field(struct tracing_map *map,
33633 unsigned int offset,
33634 tracing_map_cmp_fn_t cmp_fn);
33635@@ -267,7 +265,13 @@
33636
33637 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
33638 unsigned int i, u64 n);
33639+extern void tracing_map_set_var(struct tracing_map_elt *elt,
33640+ unsigned int i, u64 n);
33641+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
33642 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
33643+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
33644+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
33645+
33646 extern void tracing_map_set_field_descr(struct tracing_map *map,
33647 unsigned int i,
33648 unsigned int key_offset,
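The tracing_map hunks above add a per-element variable API (tracing_map_add_var(), tracing_map_set_var(), tracing_map_var_set(), tracing_map_read_var(), tracing_map_read_var_once()) next to the existing sum fields. A minimal usage sketch, assuming a hypothetical client; the my_client_* wrappers are illustrative only, while the tracing_map_* prototypes are the ones added above:

	#include "tracing_map.h"	/* kernel/trace/tracing_map.h */

	/* Reserve one variable slot at map-setup time. */
	static int my_client_setup(struct tracing_map *map, unsigned int *var_idx)
	{
		int idx = tracing_map_add_var(map);

		if (idx < 0)
			return idx;	/* -EINVAL once TRACING_MAP_VARS_MAX is used up */
		*var_idx = idx;
		return 0;
	}

	/* Store a value for one element; marks the variable as set. */
	static void my_client_record(struct tracing_map_elt *elt,
				     unsigned int var_idx, u64 val)
	{
		tracing_map_set_var(elt, var_idx, val);
	}

	/* Consume the value: read-once semantics clear the 'set' flag again. */
	static u64 my_client_consume(struct tracing_map_elt *elt, unsigned int var_idx)
	{
		if (!tracing_map_var_set(elt, var_idx))
			return 0;
		return tracing_map_read_var_once(elt, var_idx);
	}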
33649diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/user.c linux-4.14/kernel/user.c
33650--- linux-4.14.orig/kernel/user.c 2017-11-12 19:46:13.000000000 +0100
33651+++ linux-4.14/kernel/user.c 2018-09-05 11:05:07.000000000 +0200
33652@@ -162,11 +162,11 @@
33653 if (!up)
33654 return;
33655
33656- local_irq_save(flags);
33657+ local_irq_save_nort(flags);
33658 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
33659 free_user(up, flags);
33660 else
33661- local_irq_restore(flags);
33662+ local_irq_restore_nort(flags);
33663 }
33664
33665 struct user_struct *alloc_uid(kuid_t uid)
33666diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog.c linux-4.14/kernel/watchdog.c
33667--- linux-4.14.orig/kernel/watchdog.c 2017-11-12 19:46:13.000000000 +0100
33668+++ linux-4.14/kernel/watchdog.c 2018-09-05 11:05:07.000000000 +0200
33669@@ -462,7 +462,7 @@
33670 * Start the timer first to prevent the NMI watchdog triggering
33671 * before the timer has a chance to fire.
33672 */
33673- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
33674+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
33675 hrtimer->function = watchdog_timer_fn;
33676 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
33677 HRTIMER_MODE_REL_PINNED);
33678diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog_hld.c linux-4.14/kernel/watchdog_hld.c
33679--- linux-4.14.orig/kernel/watchdog_hld.c 2017-11-12 19:46:13.000000000 +0100
33680+++ linux-4.14/kernel/watchdog_hld.c 2018-09-05 11:05:07.000000000 +0200
33681@@ -24,6 +24,8 @@
33682 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
33683 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
33684 static DEFINE_PER_CPU(struct perf_event *, dead_event);
33685+static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
33686+
33687 static struct cpumask dead_events_mask;
33688
33689 static unsigned long hardlockup_allcpu_dumped;
33690@@ -134,6 +136,13 @@
33691 /* only print hardlockups once */
33692 if (__this_cpu_read(hard_watchdog_warn) == true)
33693 return;
33694+ /*
33695+ * If early-printk is enabled then make sure we do not
33696+ * lock up in printk() and kill console logging:
33697+ */
33698+ printk_kill();
33699+
33700+ raw_spin_lock(&watchdog_output_lock);
33701
33702 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
33703 print_modules();
33704@@ -151,6 +160,7 @@
33705 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
33706 trigger_allbutself_cpu_backtrace();
33707
33708+ raw_spin_unlock(&watchdog_output_lock);
33709 if (hardlockup_panic)
33710 nmi_panic(regs, "Hard LOCKUP");
33711
33712diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue.c linux-4.14/kernel/workqueue.c
33713--- linux-4.14.orig/kernel/workqueue.c 2018-09-05 11:03:22.000000000 +0200
33714+++ linux-4.14/kernel/workqueue.c 2018-09-05 11:05:07.000000000 +0200
33715@@ -49,6 +49,8 @@
33716 #include <linux/moduleparam.h>
33717 #include <linux/uaccess.h>
33718 #include <linux/nmi.h>
33719+#include <linux/locallock.h>
33720+#include <linux/delay.h>
33721
33722 #include "workqueue_internal.h"
33723
33724@@ -123,11 +125,16 @@
33725 * cpu or grabbing pool->lock is enough for read access. If
33726 * POOL_DISASSOCIATED is set, it's identical to L.
33727 *
33728+ * On RT we need the extra protection via rt_lock_idle_list() for
33729+ * the list manipulations against read access from
33730+ * wq_worker_sleeping(). All other places are nicely serialized via
33731+ * pool->lock.
33732+ *
33733 * A: pool->attach_mutex protected.
33734 *
33735 * PL: wq_pool_mutex protected.
33736 *
33737- * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
33738+ * PR: wq_pool_mutex protected for writes. RCU protected for reads.
33739 *
33740 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
33741 *
33742@@ -136,7 +143,7 @@
33743 *
33744 * WQ: wq->mutex protected.
33745 *
33746- * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
33747+ * WR: wq->mutex protected for writes. RCU protected for reads.
33748 *
33749 * MD: wq_mayday_lock protected.
33750 */
33751@@ -186,7 +193,7 @@
33752 atomic_t nr_running ____cacheline_aligned_in_smp;
33753
33754 /*
33755- * Destruction of pool is sched-RCU protected to allow dereferences
33756+ * Destruction of pool is RCU protected to allow dereferences
33757 * from get_work_pool().
33758 */
33759 struct rcu_head rcu;
33760@@ -215,7 +222,7 @@
33761 /*
33762 * Release of unbound pwq is punted to system_wq. See put_pwq()
33763 * and pwq_unbound_release_workfn() for details. pool_workqueue
33764- * itself is also sched-RCU protected so that the first pwq can be
33765+ * itself is also RCU protected so that the first pwq can be
33766 * determined without grabbing wq->mutex.
33767 */
33768 struct work_struct unbound_release_work;
33769@@ -352,6 +359,8 @@
33770 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
33771 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
33772
33773+static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
33774+
33775 static int worker_thread(void *__worker);
33776 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
33777
33778@@ -359,20 +368,20 @@
33779 #include <trace/events/workqueue.h>
33780
33781 #define assert_rcu_or_pool_mutex() \
33782- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33783+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33784 !lockdep_is_held(&wq_pool_mutex), \
33785- "sched RCU or wq_pool_mutex should be held")
33786+ "RCU or wq_pool_mutex should be held")
33787
33788 #define assert_rcu_or_wq_mutex(wq) \
33789- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33790+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33791 !lockdep_is_held(&wq->mutex), \
33792- "sched RCU or wq->mutex should be held")
33793+ "RCU or wq->mutex should be held")
33794
33795 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
33796- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
33797+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
33798 !lockdep_is_held(&wq->mutex) && \
33799 !lockdep_is_held(&wq_pool_mutex), \
33800- "sched RCU, wq->mutex or wq_pool_mutex should be held")
33801+ "RCU, wq->mutex or wq_pool_mutex should be held")
33802
33803 #define for_each_cpu_worker_pool(pool, cpu) \
33804 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
33805@@ -384,7 +393,7 @@
33806 * @pool: iteration cursor
33807 * @pi: integer used for iteration
33808 *
33809- * This must be called either with wq_pool_mutex held or sched RCU read
33810+ * This must be called either with wq_pool_mutex held or RCU read
33811 * locked. If the pool needs to be used beyond the locking in effect, the
33812 * caller is responsible for guaranteeing that the pool stays online.
33813 *
33814@@ -416,7 +425,7 @@
33815 * @pwq: iteration cursor
33816 * @wq: the target workqueue
33817 *
33818- * This must be called either with wq->mutex held or sched RCU read locked.
33819+ * This must be called either with wq->mutex held or RCU read locked.
33820 * If the pwq needs to be used beyond the locking in effect, the caller is
33821 * responsible for guaranteeing that the pwq stays online.
33822 *
33823@@ -428,6 +437,31 @@
33824 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
33825 else
33826
33827+#ifdef CONFIG_PREEMPT_RT_BASE
33828+static inline void rt_lock_idle_list(struct worker_pool *pool)
33829+{
33830+ preempt_disable();
33831+}
33832+static inline void rt_unlock_idle_list(struct worker_pool *pool)
33833+{
33834+ preempt_enable();
33835+}
33836+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
33837+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
33838+#else
33839+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
33840+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
33841+static inline void sched_lock_idle_list(struct worker_pool *pool)
33842+{
33843+ spin_lock_irq(&pool->lock);
33844+}
33845+static inline void sched_unlock_idle_list(struct worker_pool *pool)
33846+{
33847+ spin_unlock_irq(&pool->lock);
33848+}
33849+#endif
33850+
33851+
33852 #ifdef CONFIG_DEBUG_OBJECTS_WORK
33853
33854 static struct debug_obj_descr work_debug_descr;
33855@@ -552,7 +586,7 @@
33856 * @wq: the target workqueue
33857 * @node: the node ID
33858 *
33859- * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
33860+ * This must be called with any of wq_pool_mutex, wq->mutex or RCU
33861 * read locked.
33862 * If the pwq needs to be used beyond the locking in effect, the caller is
33863 * responsible for guaranteeing that the pwq stays online.
33864@@ -696,8 +730,8 @@
33865 * @work: the work item of interest
33866 *
33867 * Pools are created and destroyed under wq_pool_mutex, and allows read
33868- * access under sched-RCU read lock. As such, this function should be
33869- * called under wq_pool_mutex or with preemption disabled.
33870+ * access under RCU read lock. As such, this function should be
33871+ * called under wq_pool_mutex or inside of a rcu_read_lock() region.
33872 *
33873 * All fields of the returned pool are accessible as long as the above
33874 * mentioned locking is in effect. If the returned pool needs to be used
33875@@ -834,50 +868,45 @@
33876 */
33877 static void wake_up_worker(struct worker_pool *pool)
33878 {
33879- struct worker *worker = first_idle_worker(pool);
33880+ struct worker *worker;
33881+
33882+ rt_lock_idle_list(pool);
33883+
33884+ worker = first_idle_worker(pool);
33885
33886 if (likely(worker))
33887 wake_up_process(worker->task);
33888+
33889+ rt_unlock_idle_list(pool);
33890 }
33891
33892 /**
33893- * wq_worker_waking_up - a worker is waking up
33894+ * wq_worker_running - a worker is running again
33895 * @task: task waking up
33896- * @cpu: CPU @task is waking up to
33897- *
33898- * This function is called during try_to_wake_up() when a worker is
33899- * being awoken.
33900 *
33901- * CONTEXT:
33902- * spin_lock_irq(rq->lock)
33903+ * This function is called when a worker returns from schedule()
33904 */
33905-void wq_worker_waking_up(struct task_struct *task, int cpu)
33906+void wq_worker_running(struct task_struct *task)
33907 {
33908 struct worker *worker = kthread_data(task);
33909
33910- if (!(worker->flags & WORKER_NOT_RUNNING)) {
33911- WARN_ON_ONCE(worker->pool->cpu != cpu);
33912+ if (!worker->sleeping)
33913+ return;
33914+ if (!(worker->flags & WORKER_NOT_RUNNING))
33915 atomic_inc(&worker->pool->nr_running);
33916- }
33917+ worker->sleeping = 0;
33918 }
33919
33920 /**
33921 * wq_worker_sleeping - a worker is going to sleep
33922 * @task: task going to sleep
33923 *
33924- * This function is called during schedule() when a busy worker is
33925- * going to sleep. Worker on the same cpu can be woken up by
33926- * returning pointer to its task.
33927- *
33928- * CONTEXT:
33929- * spin_lock_irq(rq->lock)
33930- *
33931- * Return:
33932- * Worker task on @cpu to wake up, %NULL if none.
33933+ * This function is called from schedule() when a busy worker is
33934+ * going to sleep.
33935 */
33936-struct task_struct *wq_worker_sleeping(struct task_struct *task)
33937+void wq_worker_sleeping(struct task_struct *task)
33938 {
33939- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
33940+ struct worker *worker = kthread_data(task);
33941 struct worker_pool *pool;
33942
33943 /*
33944@@ -886,29 +915,26 @@
33945 * checking NOT_RUNNING.
33946 */
33947 if (worker->flags & WORKER_NOT_RUNNING)
33948- return NULL;
33949+ return;
33950
33951 pool = worker->pool;
33952
33953- /* this can only happen on the local cpu */
33954- if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
33955- return NULL;
33956+ if (WARN_ON_ONCE(worker->sleeping))
33957+ return;
33958+
33959+ worker->sleeping = 1;
33960
33961 /*
33962 * The counterpart of the following dec_and_test, implied mb,
33963 * worklist not empty test sequence is in insert_work().
33964 * Please read comment there.
33965- *
33966- * NOT_RUNNING is clear. This means that we're bound to and
33967- * running on the local cpu w/ rq lock held and preemption
33968- * disabled, which in turn means that none else could be
33969- * manipulating idle_list, so dereferencing idle_list without pool
33970- * lock is safe.
33971 */
33972 if (atomic_dec_and_test(&pool->nr_running) &&
33973- !list_empty(&pool->worklist))
33974- to_wakeup = first_idle_worker(pool);
33975- return to_wakeup ? to_wakeup->task : NULL;
33976+ !list_empty(&pool->worklist)) {
33977+ sched_lock_idle_list(pool);
33978+ wake_up_worker(pool);
33979+ sched_unlock_idle_list(pool);
33980+ }
33981 }
33982
33983 /**
33984@@ -1102,12 +1128,14 @@
33985 {
33986 if (pwq) {
33987 /*
33988- * As both pwqs and pools are sched-RCU protected, the
33989+ * As both pwqs and pools are RCU protected, the
33990 * following lock operations are safe.
33991 */
33992- spin_lock_irq(&pwq->pool->lock);
33993+ rcu_read_lock();
33994+ local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
33995 put_pwq(pwq);
33996- spin_unlock_irq(&pwq->pool->lock);
33997+ local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
33998+ rcu_read_unlock();
33999 }
34000 }
34001
34002@@ -1211,7 +1239,7 @@
34003 struct worker_pool *pool;
34004 struct pool_workqueue *pwq;
34005
34006- local_irq_save(*flags);
34007+ local_lock_irqsave(pendingb_lock, *flags);
34008
34009 /* try to steal the timer if it exists */
34010 if (is_dwork) {
34011@@ -1230,6 +1258,7 @@
34012 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34013 return 0;
34014
34015+ rcu_read_lock();
34016 /*
34017 * The queueing is in progress, or it is already queued. Try to
34018 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34019@@ -1268,14 +1297,16 @@
34020 set_work_pool_and_keep_pending(work, pool->id);
34021
34022 spin_unlock(&pool->lock);
34023+ rcu_read_unlock();
34024 return 1;
34025 }
34026 spin_unlock(&pool->lock);
34027 fail:
34028- local_irq_restore(*flags);
34029+ rcu_read_unlock();
34030+ local_unlock_irqrestore(pendingb_lock, *flags);
34031 if (work_is_canceling(work))
34032 return -ENOENT;
34033- cpu_relax();
34034+ cpu_chill();
34035 return -EAGAIN;
34036 }
34037
34038@@ -1377,7 +1408,7 @@
34039 * queued or lose PENDING. Grabbing PENDING and queueing should
34040 * happen with IRQ disabled.
34041 */
34042- WARN_ON_ONCE(!irqs_disabled());
34043+ WARN_ON_ONCE_NONRT(!irqs_disabled());
34044
34045 debug_work_activate(work);
34046
34047@@ -1385,6 +1416,7 @@
34048 if (unlikely(wq->flags & __WQ_DRAINING) &&
34049 WARN_ON_ONCE(!is_chained_work(wq)))
34050 return;
34051+ rcu_read_lock();
34052 retry:
34053 if (req_cpu == WORK_CPU_UNBOUND)
34054 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
34055@@ -1441,10 +1473,8 @@
34056 /* pwq determined, queue */
34057 trace_workqueue_queue_work(req_cpu, pwq, work);
34058
34059- if (WARN_ON(!list_empty(&work->entry))) {
34060- spin_unlock(&pwq->pool->lock);
34061- return;
34062- }
34063+ if (WARN_ON(!list_empty(&work->entry)))
34064+ goto out;
34065
34066 pwq->nr_in_flight[pwq->work_color]++;
34067 work_flags = work_color_to_flags(pwq->work_color);
34068@@ -1462,7 +1492,9 @@
34069
34070 insert_work(pwq, work, worklist, work_flags);
34071
34072+out:
34073 spin_unlock(&pwq->pool->lock);
34074+ rcu_read_unlock();
34075 }
34076
34077 /**
34078@@ -1482,14 +1514,14 @@
34079 bool ret = false;
34080 unsigned long flags;
34081
34082- local_irq_save(flags);
34083+ local_lock_irqsave(pendingb_lock,flags);
34084
34085 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34086 __queue_work(cpu, wq, work);
34087 ret = true;
34088 }
34089
34090- local_irq_restore(flags);
34091+ local_unlock_irqrestore(pendingb_lock, flags);
34092 return ret;
34093 }
34094 EXPORT_SYMBOL(queue_work_on);
34095@@ -1498,8 +1530,11 @@
34096 {
34097 struct delayed_work *dwork = (struct delayed_work *)__data;
34098
34099+ /* XXX */
34100+ /* local_lock(pendingb_lock); */
34101 /* should have been called from irqsafe timer with irq already off */
34102 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34103+ /* local_unlock(pendingb_lock); */
34104 }
34105 EXPORT_SYMBOL(delayed_work_timer_fn);
34106
34107@@ -1555,14 +1590,14 @@
34108 unsigned long flags;
34109
34110 /* read the comment in __queue_work() */
34111- local_irq_save(flags);
34112+ local_lock_irqsave(pendingb_lock, flags);
34113
34114 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34115 __queue_delayed_work(cpu, wq, dwork, delay);
34116 ret = true;
34117 }
34118
34119- local_irq_restore(flags);
34120+ local_unlock_irqrestore(pendingb_lock, flags);
34121 return ret;
34122 }
34123 EXPORT_SYMBOL(queue_delayed_work_on);
34124@@ -1597,7 +1632,7 @@
34125
34126 if (likely(ret >= 0)) {
34127 __queue_delayed_work(cpu, wq, dwork, delay);
34128- local_irq_restore(flags);
34129+ local_unlock_irqrestore(pendingb_lock, flags);
34130 }
34131
34132 /* -ENOENT from try_to_grab_pending() becomes %true */
34133@@ -1630,7 +1665,9 @@
34134 worker->last_active = jiffies;
34135
34136 /* idle_list is LIFO */
34137+ rt_lock_idle_list(pool);
34138 list_add(&worker->entry, &pool->idle_list);
34139+ rt_unlock_idle_list(pool);
34140
34141 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
34142 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
34143@@ -1663,7 +1700,9 @@
34144 return;
34145 worker_clr_flags(worker, WORKER_IDLE);
34146 pool->nr_idle--;
34147+ rt_lock_idle_list(pool);
34148 list_del_init(&worker->entry);
34149+ rt_unlock_idle_list(pool);
34150 }
34151
34152 static struct worker *alloc_worker(int node)
34153@@ -1829,7 +1868,9 @@
34154 pool->nr_workers--;
34155 pool->nr_idle--;
34156
34157+ rt_lock_idle_list(pool);
34158 list_del_init(&worker->entry);
34159+ rt_unlock_idle_list(pool);
34160 worker->flags |= WORKER_DIE;
34161 wake_up_process(worker->task);
34162 }
34163@@ -2815,14 +2856,14 @@
34164
34165 might_sleep();
34166
34167- local_irq_disable();
34168+ rcu_read_lock();
34169 pool = get_work_pool(work);
34170 if (!pool) {
34171- local_irq_enable();
34172+ rcu_read_unlock();
34173 return false;
34174 }
34175
34176- spin_lock(&pool->lock);
34177+ spin_lock_irq(&pool->lock);
34178 /* see the comment in try_to_grab_pending() with the same code */
34179 pwq = get_work_pwq(work);
34180 if (pwq) {
34181@@ -2853,10 +2894,11 @@
34182 lock_map_acquire(&pwq->wq->lockdep_map);
34183 lock_map_release(&pwq->wq->lockdep_map);
34184 }
34185-
34186+ rcu_read_unlock();
34187 return true;
34188 already_gone:
34189 spin_unlock_irq(&pool->lock);
34190+ rcu_read_unlock();
34191 return false;
34192 }
34193
34194@@ -2946,7 +2988,7 @@
34195
34196 /* tell other tasks trying to grab @work to back off */
34197 mark_work_canceling(work);
34198- local_irq_restore(flags);
34199+ local_unlock_irqrestore(pendingb_lock, flags);
34200
34201 /*
34202 * This allows canceling during early boot. We know that @work
34203@@ -3007,10 +3049,10 @@
34204 */
34205 bool flush_delayed_work(struct delayed_work *dwork)
34206 {
34207- local_irq_disable();
34208+ local_lock_irq(pendingb_lock);
34209 if (del_timer_sync(&dwork->timer))
34210 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34211- local_irq_enable();
34212+ local_unlock_irq(pendingb_lock);
34213 return flush_work(&dwork->work);
34214 }
34215 EXPORT_SYMBOL(flush_delayed_work);
34216@@ -3028,7 +3070,7 @@
34217 return false;
34218
34219 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
34220- local_irq_restore(flags);
34221+ local_unlock_irqrestore(pendingb_lock, flags);
34222 return ret;
34223 }
34224
34225@@ -3284,7 +3326,7 @@
34226 * put_unbound_pool - put a worker_pool
34227 * @pool: worker_pool to put
34228 *
34229- * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
34230+ * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
34231 * safe manner. get_unbound_pool() calls this function on its failure path
34232 * and this function should be able to release pools which went through,
34233 * successfully or not, init_worker_pool().
34234@@ -3338,8 +3380,8 @@
34235 del_timer_sync(&pool->idle_timer);
34236 del_timer_sync(&pool->mayday_timer);
34237
34238- /* sched-RCU protected to allow dereferences from get_work_pool() */
34239- call_rcu_sched(&pool->rcu, rcu_free_pool);
34240+ /* RCU protected to allow dereferences from get_work_pool() */
34241+ call_rcu(&pool->rcu, rcu_free_pool);
34242 }
34243
34244 /**
34245@@ -3446,14 +3488,14 @@
34246 put_unbound_pool(pool);
34247 mutex_unlock(&wq_pool_mutex);
34248
34249- call_rcu_sched(&pwq->rcu, rcu_free_pwq);
34250+ call_rcu(&pwq->rcu, rcu_free_pwq);
34251
34252 /*
34253 * If we're the last pwq going away, @wq is already dead and no one
34254 * is gonna access it anymore. Schedule RCU free.
34255 */
34256 if (is_last)
34257- call_rcu_sched(&wq->rcu, rcu_free_wq);
34258+ call_rcu(&wq->rcu, rcu_free_wq);
34259 }
34260
34261 /**
34262@@ -4128,7 +4170,7 @@
34263 * The base ref is never dropped on per-cpu pwqs. Directly
34264 * schedule RCU free.
34265 */
34266- call_rcu_sched(&wq->rcu, rcu_free_wq);
34267+ call_rcu(&wq->rcu, rcu_free_wq);
34268 } else {
34269 /*
34270 * We're the sole accessor of @wq at this point. Directly
34271@@ -4238,7 +4280,8 @@
34272 struct pool_workqueue *pwq;
34273 bool ret;
34274
34275- rcu_read_lock_sched();
34276+ rcu_read_lock();
34277+ preempt_disable();
34278
34279 if (cpu == WORK_CPU_UNBOUND)
34280 cpu = smp_processor_id();
34281@@ -4249,7 +4292,8 @@
34282 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
34283
34284 ret = !list_empty(&pwq->delayed_works);
34285- rcu_read_unlock_sched();
34286+ preempt_enable();
34287+ rcu_read_unlock();
34288
34289 return ret;
34290 }
34291@@ -4275,15 +4319,15 @@
34292 if (work_pending(work))
34293 ret |= WORK_BUSY_PENDING;
34294
34295- local_irq_save(flags);
34296+ rcu_read_lock();
34297 pool = get_work_pool(work);
34298 if (pool) {
34299- spin_lock(&pool->lock);
34300+ spin_lock_irqsave(&pool->lock, flags);
34301 if (find_worker_executing_work(pool, work))
34302 ret |= WORK_BUSY_RUNNING;
34303- spin_unlock(&pool->lock);
34304+ spin_unlock_irqrestore(&pool->lock, flags);
34305 }
34306- local_irq_restore(flags);
34307+ rcu_read_unlock();
34308
34309 return ret;
34310 }
34311@@ -4472,7 +4516,7 @@
34312 unsigned long flags;
34313 int pi;
34314
34315- rcu_read_lock_sched();
34316+ rcu_read_lock();
34317
34318 pr_info("Showing busy workqueues and worker pools:\n");
34319
34320@@ -4537,7 +4581,7 @@
34321 touch_nmi_watchdog();
34322 }
34323
34324- rcu_read_unlock_sched();
34325+ rcu_read_unlock();
34326 }
34327
34328 /*
e4b2b4a8 34329@@ -4898,16 +4942,16 @@
1a6e0f06
JK
34330 * nr_active is monotonically decreasing. It's safe
34331 * to peek without lock.
34332 */
34333- rcu_read_lock_sched();
34334+ rcu_read_lock();
34335 for_each_pwq(pwq, wq) {
34336 WARN_ON_ONCE(pwq->nr_active < 0);
34337 if (pwq->nr_active) {
34338 busy = true;
34339- rcu_read_unlock_sched();
34340+ rcu_read_unlock();
34341 goto out_unlock;
34342 }
34343 }
34344- rcu_read_unlock_sched();
34345+ rcu_read_unlock();
34346 }
34347 out_unlock:
34348 mutex_unlock(&wq_pool_mutex);
34349@@ -5097,7 +5141,8 @@
34350 const char *delim = "";
34351 int node, written = 0;
34352
34353- rcu_read_lock_sched();
34354+ get_online_cpus();
34355+ rcu_read_lock();
34356 for_each_node(node) {
34357 written += scnprintf(buf + written, PAGE_SIZE - written,
34358 "%s%d:%d", delim, node,
34359@@ -5105,7 +5150,8 @@
34360 delim = " ";
34361 }
34362 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
34363- rcu_read_unlock_sched();
34364+ rcu_read_unlock();
34365+ put_online_cpus();
34366
34367 return written;
34368 }
e4b2b4a8
JK
34369diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue_internal.h linux-4.14/kernel/workqueue_internal.h
34370--- linux-4.14.orig/kernel/workqueue_internal.h 2017-11-12 19:46:13.000000000 +0100
34371+++ linux-4.14/kernel/workqueue_internal.h 2018-09-05 11:05:07.000000000 +0200
34372@@ -45,6 +45,7 @@
34373 unsigned long last_active; /* L: last active timestamp */
34374 unsigned int flags; /* X: flags */
34375 int id; /* I: worker id */
34376+ int sleeping; /* None */
34377
34378 /*
34379 * Opaque string set with work_set_desc(). Printed out with task
34380@@ -70,7 +71,7 @@
34381 * Scheduler hooks for concurrency managed workqueue. Only to be used from
34382 * sched/core.c and workqueue.c.
34383 */
34384-void wq_worker_waking_up(struct task_struct *task, int cpu);
34385-struct task_struct *wq_worker_sleeping(struct task_struct *task);
34386+void wq_worker_running(struct task_struct *task);
34387+void wq_worker_sleeping(struct task_struct *task);
34388
34389 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
34390diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/debugobjects.c linux-4.14/lib/debugobjects.c
34391--- linux-4.14.orig/lib/debugobjects.c 2017-11-12 19:46:13.000000000 +0100
34392+++ linux-4.14/lib/debugobjects.c 2018-09-05 11:05:07.000000000 +0200
34393@@ -336,7 +336,10 @@
34394 struct debug_obj *obj;
34395 unsigned long flags;
34396
34397- fill_pool();
34398+#ifdef CONFIG_PREEMPT_RT_FULL
34399+ if (preempt_count() == 0 && !irqs_disabled())
34400+#endif
34401+ fill_pool();
34402
34403 db = get_bucket((unsigned long) addr);
34404
34405diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/irq_poll.c linux-4.14/lib/irq_poll.c
34406--- linux-4.14.orig/lib/irq_poll.c 2017-11-12 19:46:13.000000000 +0100
34407+++ linux-4.14/lib/irq_poll.c 2018-09-05 11:05:07.000000000 +0200
34408@@ -37,6 +37,7 @@
34409 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
34410 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34411 local_irq_restore(flags);
34412+ preempt_check_resched_rt();
34413 }
34414 EXPORT_SYMBOL(irq_poll_sched);
34415
34416@@ -72,6 +73,7 @@
34417 local_irq_save(flags);
34418 __irq_poll_complete(iop);
34419 local_irq_restore(flags);
34420+ preempt_check_resched_rt();
34421 }
34422 EXPORT_SYMBOL(irq_poll_complete);
34423
34424@@ -96,6 +98,7 @@
34425 }
34426
34427 local_irq_enable();
34428+ preempt_check_resched_rt();
34429
34430 /* Even though interrupts have been re-enabled, this
34431 * access is safe because interrupts can only add new
34432@@ -133,6 +136,7 @@
34433 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34434
34435 local_irq_enable();
34436+ preempt_check_resched_rt();
34437 }
34438
34439 /**
34440@@ -196,6 +200,7 @@
34441 this_cpu_ptr(&blk_cpu_iopoll));
34442 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34443 local_irq_enable();
34444+ preempt_check_resched_rt();
34445
34446 return 0;
34447 }
34448diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig linux-4.14/lib/Kconfig
34449--- linux-4.14.orig/lib/Kconfig 2017-11-12 19:46:13.000000000 +0100
34450+++ linux-4.14/lib/Kconfig 2018-09-05 11:05:07.000000000 +0200
34451@@ -428,6 +428,7 @@
34452
34453 config CPUMASK_OFFSTACK
34454 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
34455+ depends on !PREEMPT_RT_FULL
34456 help
34457 Use dynamic allocation for cpumask_var_t, instead of putting
34458 them on the stack. This is a bit more expensive, but avoids
34459diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig.debug linux-4.14/lib/Kconfig.debug
34460--- linux-4.14.orig/lib/Kconfig.debug 2018-09-05 11:03:22.000000000 +0200
34461+++ linux-4.14/lib/Kconfig.debug 2018-09-05 11:05:07.000000000 +0200
34462@@ -1197,7 +1197,7 @@
34463
34464 config DEBUG_LOCKING_API_SELFTESTS
34465 bool "Locking API boot-time self-tests"
34466- depends on DEBUG_KERNEL
34467+ depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
34468 help
34469 Say Y here if you want the kernel to run a short self-test during
34470 bootup. The self-test checks whether common types of locking bugs
34471diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/locking-selftest.c linux-4.14/lib/locking-selftest.c
34472--- linux-4.14.orig/lib/locking-selftest.c 2017-11-12 19:46:13.000000000 +0100
34473+++ linux-4.14/lib/locking-selftest.c 2018-09-05 11:05:07.000000000 +0200
34474@@ -742,6 +742,8 @@
34475 #include "locking-selftest-spin-hardirq.h"
34476 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
34477
34478+#ifndef CONFIG_PREEMPT_RT_FULL
34479+
34480 #include "locking-selftest-rlock-hardirq.h"
34481 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
34482
34483@@ -757,9 +759,12 @@
34484 #include "locking-selftest-wlock-softirq.h"
34485 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
34486
34487+#endif
34488+
34489 #undef E1
34490 #undef E2
34491
34492+#ifndef CONFIG_PREEMPT_RT_FULL
34493 /*
34494 * Enabling hardirqs with a softirq-safe lock held:
34495 */
34496@@ -792,6 +797,8 @@
34497 #undef E1
34498 #undef E2
34499
34500+#endif
34501+
34502 /*
34503 * Enabling irqs with an irq-safe lock held:
34504 */
34505@@ -815,6 +822,8 @@
34506 #include "locking-selftest-spin-hardirq.h"
34507 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
34508
34509+#ifndef CONFIG_PREEMPT_RT_FULL
34510+
34511 #include "locking-selftest-rlock-hardirq.h"
34512 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
34513
34514@@ -830,6 +839,8 @@
34515 #include "locking-selftest-wlock-softirq.h"
34516 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
34517
34518+#endif
34519+
34520 #undef E1
34521 #undef E2
34522
34523@@ -861,6 +872,8 @@
34524 #include "locking-selftest-spin-hardirq.h"
34525 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
34526
34527+#ifndef CONFIG_PREEMPT_RT_FULL
34528+
34529 #include "locking-selftest-rlock-hardirq.h"
34530 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
34531
34532@@ -876,6 +889,8 @@
34533 #include "locking-selftest-wlock-softirq.h"
34534 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
34535
34536+#endif
34537+
34538 #undef E1
34539 #undef E2
34540 #undef E3
34541@@ -909,6 +924,8 @@
34542 #include "locking-selftest-spin-hardirq.h"
34543 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
34544
34545+#ifndef CONFIG_PREEMPT_RT_FULL
34546+
34547 #include "locking-selftest-rlock-hardirq.h"
34548 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
34549
34550@@ -924,10 +941,14 @@
34551 #include "locking-selftest-wlock-softirq.h"
34552 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
34553
34554+#endif
34555+
34556 #undef E1
34557 #undef E2
34558 #undef E3
34559
34560+#ifndef CONFIG_PREEMPT_RT_FULL
34561+
34562 /*
34563 * read-lock / write-lock irq inversion.
34564 *
34565@@ -990,6 +1011,10 @@
34566 #undef E2
34567 #undef E3
34568
34569+#endif
34570+
34571+#ifndef CONFIG_PREEMPT_RT_FULL
34572+
34573 /*
34574 * read-lock / write-lock recursion that is actually safe.
34575 */
34576@@ -1028,6 +1053,8 @@
34577 #undef E2
34578 #undef E3
34579
34580+#endif
34581+
34582 /*
34583 * read-lock / write-lock recursion that is unsafe.
34584 */
34585@@ -2057,6 +2084,7 @@
34586
34587 printk(" --------------------------------------------------------------------------\n");
34588
34589+#ifndef CONFIG_PREEMPT_RT_FULL
34590 /*
34591 * irq-context testcases:
34592 */
34593@@ -2069,6 +2097,28 @@
34594
34595 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
34596 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
34597+#else
34598+ /* On -rt, we only do hardirq context test for raw spinlock */
34599+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
34600+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
34601+
34602+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
34603+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
34604+
34605+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
34606+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
34607+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
34608+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
34609+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
34610+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
34611+
34612+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
34613+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
34614+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
34615+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
34616+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
34617+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
34618+#endif
34619
34620 ww_tests();
34621
34622diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/percpu_ida.c linux-4.14/lib/percpu_ida.c
34623--- linux-4.14.orig/lib/percpu_ida.c 2017-11-12 19:46:13.000000000 +0100
34624+++ linux-4.14/lib/percpu_ida.c 2018-09-05 11:05:07.000000000 +0200
34625@@ -27,6 +27,9 @@
34626 #include <linux/string.h>
34627 #include <linux/spinlock.h>
34628 #include <linux/percpu_ida.h>
34629+#include <linux/locallock.h>
34630+
34631+static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
34632
34633 struct percpu_ida_cpu {
34634 /*
34635@@ -149,13 +152,13 @@
34636 unsigned long flags;
34637 int tag;
34638
34639- local_irq_save(flags);
34640+ local_lock_irqsave(irq_off_lock, flags);
34641 tags = this_cpu_ptr(pool->tag_cpu);
34642
34643 /* Fastpath */
34644 tag = alloc_local_tag(tags);
34645 if (likely(tag >= 0)) {
34646- local_irq_restore(flags);
34647+ local_unlock_irqrestore(irq_off_lock, flags);
34648 return tag;
34649 }
34650
34651@@ -174,6 +177,7 @@
34652
34653 if (!tags->nr_free)
34654 alloc_global_tags(pool, tags);
34655+
34656 if (!tags->nr_free)
34657 steal_tags(pool, tags);
34658
34659@@ -185,7 +189,7 @@
34660 }
34661
34662 spin_unlock(&pool->lock);
34663- local_irq_restore(flags);
34664+ local_unlock_irqrestore(irq_off_lock, flags);
34665
34666 if (tag >= 0 || state == TASK_RUNNING)
34667 break;
34668@@ -197,7 +201,7 @@
34669
34670 schedule();
34671
34672- local_irq_save(flags);
34673+ local_lock_irqsave(irq_off_lock, flags);
34674 tags = this_cpu_ptr(pool->tag_cpu);
34675 }
34676 if (state != TASK_RUNNING)
34677@@ -222,7 +226,7 @@
34678
34679 BUG_ON(tag >= pool->nr_tags);
34680
34681- local_irq_save(flags);
34682+ local_lock_irqsave(irq_off_lock, flags);
34683 tags = this_cpu_ptr(pool->tag_cpu);
34684
34685 spin_lock(&tags->lock);
34686@@ -254,7 +258,7 @@
34687 spin_unlock(&pool->lock);
34688 }
34689
34690- local_irq_restore(flags);
34691+ local_unlock_irqrestore(irq_off_lock, flags);
34692 }
34693 EXPORT_SYMBOL_GPL(percpu_ida_free);
34694
34695@@ -346,7 +350,7 @@
34696 struct percpu_ida_cpu *remote;
34697 unsigned cpu, i, err = 0;
34698
34699- local_irq_save(flags);
34700+ local_lock_irqsave(irq_off_lock, flags);
34701 for_each_possible_cpu(cpu) {
34702 remote = per_cpu_ptr(pool->tag_cpu, cpu);
34703 spin_lock(&remote->lock);
34704@@ -368,7 +372,7 @@
34705 }
34706 spin_unlock(&pool->lock);
34707 out:
34708- local_irq_restore(flags);
34709+ local_unlock_irqrestore(irq_off_lock, flags);
34710 return err;
34711 }
34712 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
34713diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/radix-tree.c linux-4.14/lib/radix-tree.c
34714--- linux-4.14.orig/lib/radix-tree.c 2018-09-05 11:03:25.000000000 +0200
34715+++ linux-4.14/lib/radix-tree.c 2018-09-05 11:05:07.000000000 +0200
34716@@ -37,7 +37,7 @@
34717 #include <linux/rcupdate.h>
34718 #include <linux/slab.h>
34719 #include <linux/string.h>
34720-
34721+#include <linux/locallock.h>
34722
34723 /* Number of nodes in fully populated tree of given height */
34724 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
e4b2b4a8 34725@@ -86,6 +86,7 @@
1f39f580
JK
34726 struct radix_tree_node *nodes;
34727 };
34728 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
34729+static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
34730
e4b2b4a8 34731 static inline struct radix_tree_node *entry_to_node(void *ptr)
1f39f580 34732 {
e4b2b4a8 34733@@ -404,12 +405,13 @@
1a6e0f06
JK
34734 * succeed in getting a node here (and never reach
34735 * kmem_cache_alloc)
34736 */
34737- rtp = this_cpu_ptr(&radix_tree_preloads);
34738+ rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34739 if (rtp->nr) {
34740 ret = rtp->nodes;
34741 rtp->nodes = ret->parent;
34742 rtp->nr--;
34743 }
34744+ put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34745 /*
34746 * Update the allocation stack trace as this is more useful
34747 * for debugging.
34748@@ -475,14 +477,14 @@
34749 */
34750 gfp_mask &= ~__GFP_ACCOUNT;
34751
34752- preempt_disable();
34753+ local_lock(radix_tree_preloads_lock);
34754 rtp = this_cpu_ptr(&radix_tree_preloads);
34755 while (rtp->nr < nr) {
34756- preempt_enable();
34757+ local_unlock(radix_tree_preloads_lock);
34758 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
34759 if (node == NULL)
34760 goto out;
34761- preempt_disable();
34762+ local_lock(radix_tree_preloads_lock);
34763 rtp = this_cpu_ptr(&radix_tree_preloads);
34764 if (rtp->nr < nr) {
34765 node->parent = rtp->nodes;
34766@@ -524,7 +526,7 @@
34767 if (gfpflags_allow_blocking(gfp_mask))
34768 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
34769 /* Preloading doesn't help anything with this gfp mask, skip it */
34770- preempt_disable();
34771+ local_lock(radix_tree_preloads_lock);
34772 return 0;
34773 }
34774 EXPORT_SYMBOL(radix_tree_maybe_preload);
34775@@ -562,7 +564,7 @@
34776
34777 /* Preloading doesn't help anything with this gfp mask, skip it */
34778 if (!gfpflags_allow_blocking(gfp_mask)) {
34779- preempt_disable();
34780+ local_lock(radix_tree_preloads_lock);
34781 return 0;
34782 }
34783
34784@@ -596,6 +598,12 @@
34785 return __radix_tree_preload(gfp_mask, nr_nodes);
34786 }
34787
34788+void radix_tree_preload_end(void)
34789+{
34790+ local_unlock(radix_tree_preloads_lock);
34791+}
34792+EXPORT_SYMBOL(radix_tree_preload_end);
34793+
34794 static unsigned radix_tree_load_root(const struct radix_tree_root *root,
34795 struct radix_tree_node **nodep, unsigned long *maxindex)
34796 {
34797@@ -2105,10 +2113,16 @@
34798 void idr_preload(gfp_t gfp_mask)
34799 {
34800 if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
34801- preempt_disable();
34802+ local_lock(radix_tree_preloads_lock);
34803 }
34804 EXPORT_SYMBOL(idr_preload);
34805
34806+void idr_preload_end(void)
34807+{
34808+ local_unlock(radix_tree_preloads_lock);
34809+}
34810+EXPORT_SYMBOL(idr_preload_end);
34811+
34812 /**
34813 * ida_pre_get - reserve resources for ida allocation
34814 * @ida: ida handle
34815@@ -2125,7 +2139,7 @@
34816 * to return to the ida_pre_get() step.
34817 */
34818 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
34819- preempt_enable();
34820+ local_unlock(radix_tree_preloads_lock);
34821
34822 if (!this_cpu_read(ida_bitmap)) {
34823 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
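With the radix-tree change above, __radix_tree_preload() and radix_tree_preload() take radix_tree_preloads_lock instead of disabling preemption, and radix_tree_preload_end() / idr_preload_end() become exported out-of-line functions that drop that lock. The caller pattern is unchanged; a minimal sketch with a hypothetical caller (tree locking omitted for brevity):

	#include <linux/radix-tree.h>
	#include <linux/gfp.h>

	static int example_insert(struct radix_tree_root *root,
				  unsigned long index, void *item)
	{
		int err;

		err = radix_tree_preload(GFP_KERNEL);	/* now local_lock(radix_tree_preloads_lock) */
		if (err)
			return err;

		err = radix_tree_insert(root, index, item);

		radix_tree_preload_end();		/* now out of line: local_unlock() */
		return err;
	}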
34824diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/scatterlist.c linux-4.14/lib/scatterlist.c
34825--- linux-4.14.orig/lib/scatterlist.c 2017-11-12 19:46:13.000000000 +0100
34826+++ linux-4.14/lib/scatterlist.c 2018-09-05 11:05:07.000000000 +0200
34827@@ -620,7 +620,7 @@
34828 flush_kernel_dcache_page(miter->page);
34829
34830 if (miter->__flags & SG_MITER_ATOMIC) {
34831- WARN_ON_ONCE(preemptible());
34832+ WARN_ON_ONCE(!pagefault_disabled());
34833 kunmap_atomic(miter->addr);
34834 } else
34835 kunmap(miter->page);
34836diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/smp_processor_id.c linux-4.14/lib/smp_processor_id.c
34837--- linux-4.14.orig/lib/smp_processor_id.c 2017-11-12 19:46:13.000000000 +0100
34838+++ linux-4.14/lib/smp_processor_id.c 2018-09-05 11:05:07.000000000 +0200
34839@@ -23,7 +23,7 @@
34840 * Kernel threads bound to a single CPU can safely use
34841 * smp_processor_id():
34842 */
34843- if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
34844+ if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
34845 goto out;
34846
34847 /*
34848diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/timerqueue.c linux-4.14/lib/timerqueue.c
34849--- linux-4.14.orig/lib/timerqueue.c 2017-11-12 19:46:13.000000000 +0100
34850+++ linux-4.14/lib/timerqueue.c 2018-09-05 11:05:07.000000000 +0200
34851@@ -33,8 +33,9 @@
34852 * @head: head of timerqueue
34853 * @node: timer node to be added
34854 *
34855- * Adds the timer node to the timerqueue, sorted by the
34856- * node's expires value.
34857+ * Adds the timer node to the timerqueue, sorted by the node's expires
34858+ * value. Returns true if the newly added timer is the first expiring timer in
34859+ * the queue.
34860 */
34861 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
34862 {
34863@@ -70,7 +71,8 @@
34864 * @head: head of timerqueue
34865 * @node: timer node to be removed
34866 *
34867- * Removes the timer node from the timerqueue.
34868+ * Removes the timer node from the timerqueue. Returns true if the queue is
34869+ * not empty after the remove.
34870 */
34871 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
34872 {
34873diff -durN -x '*~' -x '*.orig' linux-4.14.orig/localversion-rt linux-4.14/localversion-rt
34874--- linux-4.14.orig/localversion-rt 1970-01-01 01:00:00.000000000 +0100
34875+++ linux-4.14/localversion-rt 2018-09-05 11:05:07.000000000 +0200
34876@@ -0,0 +1 @@
34877+-rt40
34878diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/backing-dev.c linux-4.14/mm/backing-dev.c
34879--- linux-4.14.orig/mm/backing-dev.c 2018-09-05 11:03:25.000000000 +0200
34880+++ linux-4.14/mm/backing-dev.c 2018-09-05 11:05:07.000000000 +0200
34881@@ -470,9 +470,9 @@
34882 {
34883 unsigned long flags;
34884
34885- local_irq_save(flags);
34886+ local_irq_save_nort(flags);
34887 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
34888- local_irq_restore(flags);
34889+ local_irq_restore_nort(flags);
34890 return;
34891 }
34892
34893diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/compaction.c linux-4.14/mm/compaction.c
34894--- linux-4.14.orig/mm/compaction.c 2017-11-12 19:46:13.000000000 +0100
34895+++ linux-4.14/mm/compaction.c 2018-09-05 11:05:07.000000000 +0200
34896@@ -1634,10 +1634,12 @@
34897 block_start_pfn(cc->migrate_pfn, cc->order);
34898
34899 if (cc->last_migrated_pfn < current_block_start) {
34900- cpu = get_cpu();
34901+ cpu = get_cpu_light();
34902+ local_lock_irq(swapvec_lock);
34903 lru_add_drain_cpu(cpu);
34904+ local_unlock_irq(swapvec_lock);
34905 drain_local_pages(zone);
34906- put_cpu();
34907+ put_cpu_light();
34908 /* No more flushing until we migrate again */
34909 cc->last_migrated_pfn = 0;
34910 }
34911diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/filemap.c linux-4.14/mm/filemap.c
34912--- linux-4.14.orig/mm/filemap.c 2018-09-05 11:03:28.000000000 +0200
34913+++ linux-4.14/mm/filemap.c 2018-09-05 11:05:07.000000000 +0200
34914@@ -110,6 +110,7 @@
34915 * ->i_mmap_rwsem
34916 * ->tasklist_lock (memory_failure, collect_procs_ao)
34917 */
34918+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
34919
34920 static int page_cache_tree_insert(struct address_space *mapping,
34921 struct page *page, void **shadowp)
34922@@ -133,8 +134,10 @@
34923 if (shadowp)
34924 *shadowp = p;
34925 }
34926+ local_lock(shadow_nodes_lock);
34927 __radix_tree_replace(&mapping->page_tree, node, slot, page,
34928- workingset_update_node, mapping);
34929+ __workingset_update_node, mapping);
34930+ local_unlock(shadow_nodes_lock);
34931 mapping->nrpages++;
34932 return 0;
34933 }
34934@@ -151,6 +154,7 @@
34935 VM_BUG_ON_PAGE(PageTail(page), page);
34936 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
34937
34938+ local_lock(shadow_nodes_lock);
34939 for (i = 0; i < nr; i++) {
34940 struct radix_tree_node *node;
34941 void **slot;
34942@@ -162,8 +166,9 @@
34943
34944 radix_tree_clear_tags(&mapping->page_tree, node, slot);
34945 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
34946- workingset_update_node, mapping);
34947+ __workingset_update_node, mapping);
34948 }
34949+ local_unlock(shadow_nodes_lock);
34950
34951 if (shadow) {
34952 mapping->nrexceptional += nr;
34953diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/highmem.c linux-4.14/mm/highmem.c
34954--- linux-4.14.orig/mm/highmem.c 2017-11-12 19:46:13.000000000 +0100
34955+++ linux-4.14/mm/highmem.c 2018-09-05 11:05:07.000000000 +0200
34956@@ -30,10 +30,11 @@
34957 #include <linux/kgdb.h>
34958 #include <asm/tlbflush.h>
34959
34960-
34961+#ifndef CONFIG_PREEMPT_RT_FULL
34962 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34963 DEFINE_PER_CPU(int, __kmap_atomic_idx);
34964 #endif
34965+#endif
34966
34967 /*
34968 * Virtual_count is not a pure "count".
34969@@ -108,8 +109,9 @@
34970 unsigned long totalhigh_pages __read_mostly;
34971 EXPORT_SYMBOL(totalhigh_pages);
34972
34973-
34974+#ifndef CONFIG_PREEMPT_RT_FULL
34975 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
34976+#endif
34977
34978 unsigned int nr_free_highpages (void)
34979 {
34980diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/Kconfig linux-4.14/mm/Kconfig
34981--- linux-4.14.orig/mm/Kconfig 2018-09-05 11:03:25.000000000 +0200
34982+++ linux-4.14/mm/Kconfig 2018-09-05 11:05:07.000000000 +0200
34983@@ -385,7 +385,7 @@
34984
34985 config TRANSPARENT_HUGEPAGE
34986 bool "Transparent Hugepage Support"
34987- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
34988+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
34989 select COMPACTION
34990 select RADIX_TREE_MULTIORDER
34991 help
34992diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/memcontrol.c linux-4.14/mm/memcontrol.c
34993--- linux-4.14.orig/mm/memcontrol.c 2018-09-05 11:03:25.000000000 +0200
34994+++ linux-4.14/mm/memcontrol.c 2018-09-05 11:05:07.000000000 +0200
34995@@ -69,6 +69,7 @@
34996 #include <net/sock.h>
34997 #include <net/ip.h>
34998 #include "slab.h"
34999+#include <linux/locallock.h>
35000
35001 #include <linux/uaccess.h>
35002
35003@@ -94,6 +95,8 @@
35004 #define do_swap_account 0
35005 #endif
35006
35007+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35008+
35009 /* Whether legacy memory+swap accounting is active */
35010 static bool do_memsw_account(void)
35011 {
35012@@ -1831,7 +1834,7 @@
35013 * as well as workers from this path always operate on the local
35014 * per-cpu data. CPU up doesn't touch memcg_stock at all.
35015 */
35016- curcpu = get_cpu();
35017+ curcpu = get_cpu_light();
35018 for_each_online_cpu(cpu) {
35019 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35020 struct mem_cgroup *memcg;
35021@@ -1851,7 +1854,7 @@
35022 }
35023 css_put(&memcg->css);
35024 }
35025- put_cpu();
35026+ put_cpu_light();
35027 mutex_unlock(&percpu_charge_mutex);
35028 }
35029
35030@@ -4624,12 +4627,12 @@
35031
35032 ret = 0;
35033
35034- local_irq_disable();
35035+ local_lock_irq(event_lock);
35036 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
35037 memcg_check_events(to, page);
35038 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
35039 memcg_check_events(from, page);
35040- local_irq_enable();
35041+ local_unlock_irq(event_lock);
35042 out_unlock:
35043 unlock_page(page);
35044 out:
35045@@ -5572,10 +5575,10 @@
35046
35047 commit_charge(page, memcg, lrucare);
35048
35049- local_irq_disable();
35050+ local_lock_irq(event_lock);
35051 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
35052 memcg_check_events(memcg, page);
35053- local_irq_enable();
35054+ local_unlock_irq(event_lock);
35055
35056 if (do_memsw_account() && PageSwapCache(page)) {
35057 swp_entry_t entry = { .val = page_private(page) };
35058@@ -5644,7 +5647,7 @@
35059 memcg_oom_recover(ug->memcg);
35060 }
35061
35062- local_irq_save(flags);
35063+ local_lock_irqsave(event_lock, flags);
35064 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
35065 __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
35066 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
35067@@ -5652,7 +5655,7 @@
35068 __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
35069 __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
35070 memcg_check_events(ug->memcg, ug->dummy_page);
35071- local_irq_restore(flags);
35072+ local_unlock_irqrestore(event_lock, flags);
35073
35074 if (!mem_cgroup_is_root(ug->memcg))
35075 css_put_many(&ug->memcg->css, nr_pages);
35076@@ -5815,10 +5818,10 @@
35077
35078 commit_charge(newpage, memcg, false);
35079
35080- local_irq_save(flags);
35081+ local_lock_irqsave(event_lock, flags);
35082 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
35083 memcg_check_events(memcg, newpage);
35084- local_irq_restore(flags);
35085+ local_unlock_irqrestore(event_lock, flags);
35086 }
35087
35088 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
35089@@ -6010,6 +6013,7 @@
35090 struct mem_cgroup *memcg, *swap_memcg;
35091 unsigned int nr_entries;
35092 unsigned short oldid;
35093+ unsigned long flags;
35094
35095 VM_BUG_ON_PAGE(PageLRU(page), page);
35096 VM_BUG_ON_PAGE(page_count(page), page);
35097@@ -6055,13 +6059,17 @@
35098 * important here to have the interrupts disabled because it is the
35099 * only synchronisation we have for udpating the per-CPU variables.
35100 */
35101+ local_lock_irqsave(event_lock, flags);
35102+#ifndef CONFIG_PREEMPT_RT_BASE
35103 VM_BUG_ON(!irqs_disabled());
35104+#endif
35105 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
35106 -nr_entries);
35107 memcg_check_events(memcg, page);
35108
35109 if (!mem_cgroup_is_root(memcg))
35110 css_put_many(&memcg->css, nr_entries);
35111+ local_unlock_irqrestore(event_lock, flags);
35112 }
35113
35114 /**
35115diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/mmu_context.c linux-4.14/mm/mmu_context.c
35116--- linux-4.14.orig/mm/mmu_context.c 2017-11-12 19:46:13.000000000 +0100
35117+++ linux-4.14/mm/mmu_context.c 2018-09-05 11:05:07.000000000 +0200
35118@@ -25,6 +25,7 @@
35119 struct task_struct *tsk = current;
35120
35121 task_lock(tsk);
35122+ preempt_disable_rt();
35123 active_mm = tsk->active_mm;
35124 if (active_mm != mm) {
35125 mmgrab(mm);
35126@@ -32,6 +33,7 @@
35127 }
35128 tsk->mm = mm;
35129 switch_mm(active_mm, mm, tsk);
35130+ preempt_enable_rt();
35131 task_unlock(tsk);
35132 #ifdef finish_arch_post_lock_switch
35133 finish_arch_post_lock_switch();
35134diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/page_alloc.c linux-4.14/mm/page_alloc.c
35135--- linux-4.14.orig/mm/page_alloc.c 2018-09-05 11:03:25.000000000 +0200
35136+++ linux-4.14/mm/page_alloc.c 2018-09-05 11:05:07.000000000 +0200
35137@@ -61,6 +61,7 @@
35138 #include <linux/hugetlb.h>
35139 #include <linux/sched/rt.h>
35140 #include <linux/sched/mm.h>
35141+#include <linux/locallock.h>
35142 #include <linux/page_owner.h>
35143 #include <linux/kthread.h>
35144 #include <linux/memcontrol.h>
35145@@ -286,6 +287,18 @@
35146 EXPORT_SYMBOL(nr_online_nodes);
35147 #endif
35148
35149+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
35150+
35151+#ifdef CONFIG_PREEMPT_RT_BASE
35152+# define cpu_lock_irqsave(cpu, flags) \
35153+ local_lock_irqsave_on(pa_lock, flags, cpu)
35154+# define cpu_unlock_irqrestore(cpu, flags) \
35155+ local_unlock_irqrestore_on(pa_lock, flags, cpu)
35156+#else
35157+# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
35158+# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
35159+#endif
35160+
35161 int page_group_by_mobility_disabled __read_mostly;
35162
35163 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
35164@@ -1094,7 +1107,7 @@
35165 #endif /* CONFIG_DEBUG_VM */
35166
35167 /*
35168- * Frees a number of pages from the PCP lists
35169+ * Frees a number of pages which have been collected from the pcp lists.
35170 * Assumes all pages on list are in same zone, and of same order.
35171 * count is the number of pages to free.
35172 *
35173@@ -1105,15 +1118,53 @@
35174 * pinned" detection logic.
35175 */
35176 static void free_pcppages_bulk(struct zone *zone, int count,
35177- struct per_cpu_pages *pcp)
35178+ struct list_head *list)
35179 {
35180- int migratetype = 0;
35181- int batch_free = 0;
35182 bool isolated_pageblocks;
35183+ unsigned long flags;
35184
35185- spin_lock(&zone->lock);
35186+ spin_lock_irqsave(&zone->lock, flags);
35187 isolated_pageblocks = has_isolate_pageblock(zone);
35188
35189+ while (!list_empty(list)) {
35190+ struct page *page;
35191+ int mt; /* migratetype of the to-be-freed page */
35192+
35193+ page = list_first_entry(list, struct page, lru);
35194+ /* must delete as __free_one_page list manipulates */
35195+ list_del(&page->lru);
35196+
35197+ mt = get_pcppage_migratetype(page);
35198+ /* MIGRATE_ISOLATE page should not go to pcplists */
35199+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35200+ /* Pageblock could have been isolated meanwhile */
35201+ if (unlikely(isolated_pageblocks))
35202+ mt = get_pageblock_migratetype(page);
35203+
35204+ if (bulkfree_pcp_prepare(page))
35205+ continue;
35206+
35207+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35208+ trace_mm_page_pcpu_drain(page, 0, mt);
35209+ count--;
35210+ }
35211+ WARN_ON(count != 0);
35212+ spin_unlock_irqrestore(&zone->lock, flags);
35213+}
35214+
35215+/*
35216+ * Moves a number of pages from the PCP lists to free list which
35217+ * is freed outside of the locked region.
35218+ *
35219+ * Assumes all pages on list are in same zone, and of same order.
35220+ * count is the number of pages to free.
35221+ */
35222+static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
35223+ struct list_head *dst)
35224+{
35225+ int migratetype = 0;
35226+ int batch_free = 0;
35227+
35228 while (count) {
35229 struct page *page;
35230 struct list_head *list;
35231@@ -1129,7 +1180,7 @@
35232 batch_free++;
35233 if (++migratetype == MIGRATE_PCPTYPES)
35234 migratetype = 0;
35235- list = &pcp->lists[migratetype];
35236+ list = &src->lists[migratetype];
35237 } while (list_empty(list));
35238
35239 /* This is the only non-empty list. Free them all. */
35240@@ -1137,27 +1188,12 @@
35241 batch_free = count;
35242
35243 do {
35244- int mt; /* migratetype of the to-be-freed page */
35245-
35246 page = list_last_entry(list, struct page, lru);
35247- /* must delete as __free_one_page list manipulates */
35248 list_del(&page->lru);
35249
35250- mt = get_pcppage_migratetype(page);
35251- /* MIGRATE_ISOLATE page should not go to pcplists */
35252- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35253- /* Pageblock could have been isolated meanwhile */
35254- if (unlikely(isolated_pageblocks))
35255- mt = get_pageblock_migratetype(page);
35256-
35257- if (bulkfree_pcp_prepare(page))
35258- continue;
35259-
35260- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35261- trace_mm_page_pcpu_drain(page, 0, mt);
35262+ list_add(&page->lru, dst);
35263 } while (--count && --batch_free && !list_empty(list));
35264 }
35265- spin_unlock(&zone->lock);
35266 }
35267
35268 static void free_one_page(struct zone *zone,
35269@@ -1165,13 +1201,15 @@
35270 unsigned int order,
35271 int migratetype)
35272 {
35273- spin_lock(&zone->lock);
35274+ unsigned long flags;
35275+
35276+ spin_lock_irqsave(&zone->lock, flags);
35277 if (unlikely(has_isolate_pageblock(zone) ||
35278 is_migrate_isolate(migratetype))) {
35279 migratetype = get_pfnblock_migratetype(page, pfn);
35280 }
35281 __free_one_page(page, pfn, zone, order, migratetype);
35282- spin_unlock(&zone->lock);
35283+ spin_unlock_irqrestore(&zone->lock, flags);
35284 }
35285
35286 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
35287@@ -1257,10 +1295,10 @@
35288 return;
35289
35290 migratetype = get_pfnblock_migratetype(page, pfn);
35291- local_irq_save(flags);
35292+ local_lock_irqsave(pa_lock, flags);
35293 __count_vm_events(PGFREE, 1 << order);
35294 free_one_page(page_zone(page), page, pfn, order, migratetype);
35295- local_irq_restore(flags);
35296+ local_unlock_irqrestore(pa_lock, flags);
35297 }
35298
35299 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
35300@@ -2378,16 +2416,18 @@
35301 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
35302 {
35303 unsigned long flags;
35304+ LIST_HEAD(dst);
35305 int to_drain, batch;
35306
35307- local_irq_save(flags);
35308+ local_lock_irqsave(pa_lock, flags);
35309 batch = READ_ONCE(pcp->batch);
35310 to_drain = min(pcp->count, batch);
35311 if (to_drain > 0) {
35312- free_pcppages_bulk(zone, to_drain, pcp);
35313+ isolate_pcp_pages(to_drain, pcp, &dst);
35314 pcp->count -= to_drain;
35315 }
35316- local_irq_restore(flags);
35317+ local_unlock_irqrestore(pa_lock, flags);
35318+ free_pcppages_bulk(zone, to_drain, &dst);
35319 }
35320 #endif
35321
35322@@ -2403,16 +2443,21 @@
35323 unsigned long flags;
35324 struct per_cpu_pageset *pset;
35325 struct per_cpu_pages *pcp;
35326+ LIST_HEAD(dst);
35327+ int count;
35328
35329- local_irq_save(flags);
35330+ cpu_lock_irqsave(cpu, flags);
35331 pset = per_cpu_ptr(zone->pageset, cpu);
35332
35333 pcp = &pset->pcp;
35334- if (pcp->count) {
35335- free_pcppages_bulk(zone, pcp->count, pcp);
35336+ count = pcp->count;
35337+ if (count) {
35338+ isolate_pcp_pages(count, pcp, &dst);
35339 pcp->count = 0;
35340 }
35341- local_irq_restore(flags);
35342+ cpu_unlock_irqrestore(cpu, flags);
35343+ if (count)
35344+ free_pcppages_bulk(zone, count, &dst);
35345 }
35346
35347 /*
35348@@ -2447,6 +2492,7 @@
35349 drain_pages(cpu);
35350 }
35351
35352+#ifndef CONFIG_PREEMPT_RT_BASE
35353 static void drain_local_pages_wq(struct work_struct *work)
35354 {
35355 /*
35356@@ -2460,6 +2506,7 @@
35357 drain_local_pages(NULL);
35358 preempt_enable();
35359 }
35360+#endif
35361
35362 /*
35363 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
35364@@ -2526,7 +2573,14 @@
35365 else
35366 cpumask_clear_cpu(cpu, &cpus_with_pcps);
35367 }
35368-
35369+#ifdef CONFIG_PREEMPT_RT_BASE
35370+ for_each_cpu(cpu, &cpus_with_pcps) {
35371+ if (zone)
35372+ drain_pages_zone(cpu, zone);
35373+ else
35374+ drain_pages(cpu);
35375+ }
35376+#else
35377 for_each_cpu(cpu, &cpus_with_pcps) {
35378 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
35379 INIT_WORK(work, drain_local_pages_wq);
35380@@ -2534,6 +2588,7 @@
35381 }
35382 for_each_cpu(cpu, &cpus_with_pcps)
35383 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
35384+#endif
35385
35386 mutex_unlock(&pcpu_drain_mutex);
35387 }
35388@@ -2610,7 +2665,7 @@
35389
35390 migratetype = get_pfnblock_migratetype(page, pfn);
35391 set_pcppage_migratetype(page, migratetype);
35392- local_irq_save(flags);
35393+ local_lock_irqsave(pa_lock, flags);
35394 __count_vm_event(PGFREE);
35395
35396 /*
35397@@ -2636,12 +2691,17 @@
35398 pcp->count++;
35399 if (pcp->count >= pcp->high) {
35400 unsigned long batch = READ_ONCE(pcp->batch);
35401- free_pcppages_bulk(zone, batch, pcp);
35402+ LIST_HEAD(dst);
35403+
35404+ isolate_pcp_pages(batch, pcp, &dst);
35405 pcp->count -= batch;
35406+ local_unlock_irqrestore(pa_lock, flags);
35407+ free_pcppages_bulk(zone, batch, &dst);
35408+ return;
35409 }
35410
35411 out:
35412- local_irq_restore(flags);
35413+ local_unlock_irqrestore(pa_lock, flags);
35414 }
35415
35416 /*
35417@@ -2789,7 +2849,7 @@
35418 struct page *page;
35419 unsigned long flags;
35420
35421- local_irq_save(flags);
35422+ local_lock_irqsave(pa_lock, flags);
35423 pcp = &this_cpu_ptr(zone->pageset)->pcp;
35424 list = &pcp->lists[migratetype];
35425 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
35426@@ -2797,7 +2857,7 @@
35427 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35428 zone_statistics(preferred_zone, zone);
35429 }
35430- local_irq_restore(flags);
35431+ local_unlock_irqrestore(pa_lock, flags);
35432 return page;
35433 }
35434
35435@@ -2824,7 +2884,7 @@
35436 * allocate greater than order-1 page units with __GFP_NOFAIL.
35437 */
35438 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
35439- spin_lock_irqsave(&zone->lock, flags);
35440+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
35441
35442 do {
35443 page = NULL;
35444@@ -2844,14 +2904,14 @@
35445
35446 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35447 zone_statistics(preferred_zone, zone);
35448- local_irq_restore(flags);
35449+ local_unlock_irqrestore(pa_lock, flags);
35450
35451 out:
35452 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
35453 return page;
35454
35455 failed:
35456- local_irq_restore(flags);
35457+ local_unlock_irqrestore(pa_lock, flags);
35458 return NULL;
35459 }
35460
35461@@ -6778,8 +6838,9 @@
35462
35463 static int page_alloc_cpu_dead(unsigned int cpu)
35464 {
35465-
35466+ local_lock_irq_on(swapvec_lock, cpu);
35467 lru_add_drain_cpu(cpu);
35468+ local_unlock_irq_on(swapvec_lock, cpu);
35469 drain_pages(cpu);
35470
35471 /*
35472@@ -7683,7 +7744,7 @@
35473 struct per_cpu_pageset *pset;
35474
35475 /* avoid races with drain_pages() */
35476- local_irq_save(flags);
35477+ local_lock_irqsave(pa_lock, flags);
35478 if (zone->pageset != &boot_pageset) {
35479 for_each_online_cpu(cpu) {
35480 pset = per_cpu_ptr(zone->pageset, cpu);
35481@@ -7692,7 +7753,7 @@
35482 free_percpu(zone->pageset);
35483 zone->pageset = &boot_pageset;
35484 }
35485- local_irq_restore(flags);
35486+ local_unlock_irqrestore(pa_lock, flags);
35487 }
35488
35489 #ifdef CONFIG_MEMORY_HOTREMOVE
35490diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slab.h linux-4.14/mm/slab.h
35491--- linux-4.14.orig/mm/slab.h 2018-09-05 11:03:25.000000000 +0200
35492+++ linux-4.14/mm/slab.h 2018-09-05 11:05:07.000000000 +0200
35493@@ -451,7 +451,11 @@
35494 * The slab lists for all objects.
35495 */
35496 struct kmem_cache_node {
35497+#ifdef CONFIG_SLUB
35498+ raw_spinlock_t list_lock;
35499+#else
35500 spinlock_t list_lock;
35501+#endif
35502
35503 #ifdef CONFIG_SLAB
35504 struct list_head slabs_partial; /* partial list first, better asm code */
35505diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slub.c linux-4.14/mm/slub.c
35506--- linux-4.14.orig/mm/slub.c 2018-09-05 11:03:25.000000000 +0200
35507+++ linux-4.14/mm/slub.c 2018-09-05 11:05:07.000000000 +0200
35508@@ -1179,7 +1179,7 @@
35509 unsigned long uninitialized_var(flags);
35510 int ret = 0;
35511
35512- spin_lock_irqsave(&n->list_lock, flags);
35513+ raw_spin_lock_irqsave(&n->list_lock, flags);
35514 slab_lock(page);
35515
35516 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
35517@@ -1214,7 +1214,7 @@
35518 bulk_cnt, cnt);
35519
35520 slab_unlock(page);
35521- spin_unlock_irqrestore(&n->list_lock, flags);
35522+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35523 if (!ret)
35524 slab_fix(s, "Object at 0x%p not freed", object);
35525 return ret;
35526@@ -1342,6 +1342,12 @@
35527
35528 #endif /* CONFIG_SLUB_DEBUG */
35529
35530+struct slub_free_list {
35531+ raw_spinlock_t lock;
35532+ struct list_head list;
35533+};
35534+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
35535+
35536 /*
35537 * Hooks for other subsystems that check memory allocations. In a typical
35538 * production configuration these hooks all should produce no code at all.
35539@@ -1561,10 +1567,17 @@
35540 void *start, *p;
35541 int idx, order;
35542 bool shuffle;
35543+ bool enableirqs = false;
35544
35545 flags &= gfp_allowed_mask;
35546
35547 if (gfpflags_allow_blocking(flags))
35548+ enableirqs = true;
35549+#ifdef CONFIG_PREEMPT_RT_FULL
35550+ if (system_state > SYSTEM_BOOTING)
35551+ enableirqs = true;
35552+#endif
35553+ if (enableirqs)
35554 local_irq_enable();
35555
35556 flags |= s->allocflags;
35557@@ -1623,7 +1636,7 @@
35558 page->frozen = 1;
35559
35560 out:
35561- if (gfpflags_allow_blocking(flags))
35562+ if (enableirqs)
35563 local_irq_disable();
35564 if (!page)
35565 return NULL;
35566@@ -1681,6 +1694,16 @@
35567 __free_pages(page, order);
35568 }
35569
35570+static void free_delayed(struct list_head *h)
35571+{
35572+ while(!list_empty(h)) {
35573+ struct page *page = list_first_entry(h, struct page, lru);
35574+
35575+ list_del(&page->lru);
35576+ __free_slab(page->slab_cache, page);
35577+ }
35578+}
35579+
35580 #define need_reserve_slab_rcu \
35581 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
35582
35583@@ -1712,6 +1735,12 @@
35584 }
35585
35586 call_rcu(head, rcu_free_slab);
35587+ } else if (irqs_disabled()) {
35588+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
35589+
35590+ raw_spin_lock(&f->lock);
35591+ list_add(&page->lru, &f->list);
35592+ raw_spin_unlock(&f->lock);
35593 } else
35594 __free_slab(s, page);
35595 }
35596@@ -1819,7 +1848,7 @@
35597 if (!n || !n->nr_partial)
35598 return NULL;
35599
35600- spin_lock(&n->list_lock);
35601+ raw_spin_lock(&n->list_lock);
35602 list_for_each_entry_safe(page, page2, &n->partial, lru) {
35603 void *t;
35604
35605@@ -1844,7 +1873,7 @@
35606 break;
35607
35608 }
35609- spin_unlock(&n->list_lock);
35610+ raw_spin_unlock(&n->list_lock);
35611 return object;
35612 }
35613
35614@@ -2090,7 +2119,7 @@
35615 * that acquire_slab() will see a slab page that
35616 * is frozen
35617 */
35618- spin_lock(&n->list_lock);
35619+ raw_spin_lock(&n->list_lock);
35620 }
35621 } else {
35622 m = M_FULL;
35623@@ -2101,7 +2130,7 @@
35624 * slabs from diagnostic functions will not see
35625 * any frozen slabs.
35626 */
35627- spin_lock(&n->list_lock);
35628+ raw_spin_lock(&n->list_lock);
35629 }
35630 }
35631
35632@@ -2136,7 +2165,7 @@
35633 goto redo;
35634
35635 if (lock)
35636- spin_unlock(&n->list_lock);
35637+ raw_spin_unlock(&n->list_lock);
35638
35639 if (m == M_FREE) {
35640 stat(s, DEACTIVATE_EMPTY);
35641@@ -2171,10 +2200,10 @@
35642 n2 = get_node(s, page_to_nid(page));
35643 if (n != n2) {
35644 if (n)
35645- spin_unlock(&n->list_lock);
35646+ raw_spin_unlock(&n->list_lock);
35647
35648 n = n2;
35649- spin_lock(&n->list_lock);
35650+ raw_spin_lock(&n->list_lock);
35651 }
35652
35653 do {
35654@@ -2203,7 +2232,7 @@
35655 }
35656
35657 if (n)
35658- spin_unlock(&n->list_lock);
35659+ raw_spin_unlock(&n->list_lock);
35660
35661 while (discard_page) {
35662 page = discard_page;
35663@@ -2242,14 +2271,21 @@
35664 pobjects = oldpage->pobjects;
35665 pages = oldpage->pages;
35666 if (drain && pobjects > s->cpu_partial) {
35667+ struct slub_free_list *f;
35668 unsigned long flags;
35669+ LIST_HEAD(tofree);
35670 /*
35671 * partial array is full. Move the existing
35672 * set to the per node partial list.
35673 */
35674 local_irq_save(flags);
35675 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
35676+ f = this_cpu_ptr(&slub_free_list);
35677+ raw_spin_lock(&f->lock);
35678+ list_splice_init(&f->list, &tofree);
35679+ raw_spin_unlock(&f->lock);
35680 local_irq_restore(flags);
35681+ free_delayed(&tofree);
35682 oldpage = NULL;
35683 pobjects = 0;
35684 pages = 0;
35685@@ -2319,7 +2355,22 @@
35686
35687 static void flush_all(struct kmem_cache *s)
35688 {
35689+ LIST_HEAD(tofree);
35690+ int cpu;
35691+
35692 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
35693+ for_each_online_cpu(cpu) {
35694+ struct slub_free_list *f;
35695+
35696+ if (!has_cpu_slab(cpu, s))
35697+ continue;
35698+
35699+ f = &per_cpu(slub_free_list, cpu);
35700+ raw_spin_lock_irq(&f->lock);
35701+ list_splice_init(&f->list, &tofree);
35702+ raw_spin_unlock_irq(&f->lock);
35703+ free_delayed(&tofree);
35704+ }
35705 }
35706
35707 /*
35708@@ -2374,10 +2425,10 @@
35709 unsigned long x = 0;
35710 struct page *page;
35711
35712- spin_lock_irqsave(&n->list_lock, flags);
35713+ raw_spin_lock_irqsave(&n->list_lock, flags);
35714 list_for_each_entry(page, &n->partial, lru)
35715 x += get_count(page);
35716- spin_unlock_irqrestore(&n->list_lock, flags);
35717+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35718 return x;
35719 }
35720 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
35721@@ -2515,8 +2566,10 @@
35722 * already disabled (which is the case for bulk allocation).
35723 */
35724 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
35725- unsigned long addr, struct kmem_cache_cpu *c)
35726+ unsigned long addr, struct kmem_cache_cpu *c,
35727+ struct list_head *to_free)
35728 {
35729+ struct slub_free_list *f;
35730 void *freelist;
35731 struct page *page;
35732
35733@@ -2572,6 +2625,13 @@
35734 VM_BUG_ON(!c->page->frozen);
35735 c->freelist = get_freepointer(s, freelist);
35736 c->tid = next_tid(c->tid);
35737+
35738+out:
35739+ f = this_cpu_ptr(&slub_free_list);
35740+ raw_spin_lock(&f->lock);
35741+ list_splice_init(&f->list, to_free);
35742+ raw_spin_unlock(&f->lock);
35743+
35744 return freelist;
35745
35746 new_slab:
35747@@ -2587,7 +2647,7 @@
35748
35749 if (unlikely(!freelist)) {
35750 slab_out_of_memory(s, gfpflags, node);
35751- return NULL;
35752+ goto out;
35753 }
35754
35755 page = c->page;
35756@@ -2600,7 +2660,7 @@
35757 goto new_slab; /* Slab failed checks. Next slab needed */
35758
35759 deactivate_slab(s, page, get_freepointer(s, freelist), c);
35760- return freelist;
35761+ goto out;
35762 }
35763
35764 /*
35765@@ -2612,6 +2672,7 @@
35766 {
35767 void *p;
35768 unsigned long flags;
35769+ LIST_HEAD(tofree);
35770
35771 local_irq_save(flags);
35772 #ifdef CONFIG_PREEMPT
35773@@ -2623,8 +2684,9 @@
35774 c = this_cpu_ptr(s->cpu_slab);
35775 #endif
35776
35777- p = ___slab_alloc(s, gfpflags, node, addr, c);
35778+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
35779 local_irq_restore(flags);
35780+ free_delayed(&tofree);
35781 return p;
35782 }
35783
35784@@ -2810,7 +2872,7 @@
35785
35786 do {
35787 if (unlikely(n)) {
35788- spin_unlock_irqrestore(&n->list_lock, flags);
35789+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35790 n = NULL;
35791 }
35792 prior = page->freelist;
35793@@ -2842,7 +2904,7 @@
35794 * Otherwise the list_lock will synchronize with
35795 * other processors updating the list of slabs.
35796 */
35797- spin_lock_irqsave(&n->list_lock, flags);
35798+ raw_spin_lock_irqsave(&n->list_lock, flags);
35799
35800 }
35801 }
35802@@ -2884,7 +2946,7 @@
35803 add_partial(n, page, DEACTIVATE_TO_TAIL);
35804 stat(s, FREE_ADD_PARTIAL);
35805 }
35806- spin_unlock_irqrestore(&n->list_lock, flags);
35807+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35808 return;
35809
35810 slab_empty:
35811@@ -2899,7 +2961,7 @@
35812 remove_full(s, n, page);
35813 }
35814
35815- spin_unlock_irqrestore(&n->list_lock, flags);
35816+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35817 stat(s, FREE_SLAB);
35818 discard_slab(s, page);
35819 }
35820@@ -3104,6 +3166,7 @@
35821 void **p)
35822 {
35823 struct kmem_cache_cpu *c;
35824+ LIST_HEAD(to_free);
35825 int i;
35826
35827 /* memcg and kmem_cache debug support */
35828@@ -3127,7 +3190,7 @@
35829 * of re-populating per CPU c->freelist
35830 */
35831 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
35832- _RET_IP_, c);
35833+ _RET_IP_, c, &to_free);
35834 if (unlikely(!p[i]))
35835 goto error;
35836
35837@@ -3139,6 +3202,7 @@
35838 }
35839 c->tid = next_tid(c->tid);
35840 local_irq_enable();
35841+ free_delayed(&to_free);
35842
35843 /* Clear memory outside IRQ disabled fastpath loop */
35844 if (unlikely(flags & __GFP_ZERO)) {
35845@@ -3153,6 +3217,7 @@
35846 return i;
35847 error:
35848 local_irq_enable();
35849+ free_delayed(&to_free);
35850 slab_post_alloc_hook(s, flags, i, p);
35851 __kmem_cache_free_bulk(s, i, p);
35852 return 0;
35853@@ -3286,7 +3351,7 @@
35854 init_kmem_cache_node(struct kmem_cache_node *n)
35855 {
35856 n->nr_partial = 0;
35857- spin_lock_init(&n->list_lock);
35858+ raw_spin_lock_init(&n->list_lock);
35859 INIT_LIST_HEAD(&n->partial);
35860 #ifdef CONFIG_SLUB_DEBUG
35861 atomic_long_set(&n->nr_slabs, 0);
35862@@ -3640,6 +3705,10 @@
35863 const char *text)
35864 {
35865 #ifdef CONFIG_SLUB_DEBUG
35866+#ifdef CONFIG_PREEMPT_RT_BASE
35867+ /* XXX move out of irq-off section */
35868+ slab_err(s, page, text, s->name);
35869+#else
35870 void *addr = page_address(page);
35871 void *p;
35872 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
35873@@ -3660,6 +3729,7 @@
35874 slab_unlock(page);
35875 kfree(map);
35876 #endif
35877+#endif
35878 }
35879
35880 /*
35881@@ -3673,7 +3743,7 @@
35882 struct page *page, *h;
35883
35884 BUG_ON(irqs_disabled());
35885- spin_lock_irq(&n->list_lock);
35886+ raw_spin_lock_irq(&n->list_lock);
35887 list_for_each_entry_safe(page, h, &n->partial, lru) {
35888 if (!page->inuse) {
35889 remove_partial(n, page);
35890@@ -3683,7 +3753,7 @@
35891 "Objects remaining in %s on __kmem_cache_shutdown()");
35892 }
35893 }
35894- spin_unlock_irq(&n->list_lock);
35895+ raw_spin_unlock_irq(&n->list_lock);
35896
35897 list_for_each_entry_safe(page, h, &discard, lru)
35898 discard_slab(s, page);
35899@@ -3927,7 +3997,7 @@
35900 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
35901 INIT_LIST_HEAD(promote + i);
35902
35903- spin_lock_irqsave(&n->list_lock, flags);
35904+ raw_spin_lock_irqsave(&n->list_lock, flags);
35905
35906 /*
35907 * Build lists of slabs to discard or promote.
35908@@ -3958,7 +4028,7 @@
35909 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
35910 list_splice(promote + i, &n->partial);
35911
35912- spin_unlock_irqrestore(&n->list_lock, flags);
35913+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35914
35915 /* Release empty slabs */
35916 list_for_each_entry_safe(page, t, &discard, lru)
35917@@ -4171,6 +4241,12 @@
35918 {
35919 static __initdata struct kmem_cache boot_kmem_cache,
35920 boot_kmem_cache_node;
35921+ int cpu;
35922+
35923+ for_each_possible_cpu(cpu) {
35924+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
35925+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
35926+ }
35927
35928 if (debug_guardpage_minorder())
35929 slub_max_order = 0;
35930@@ -4379,7 +4455,7 @@
35931 struct page *page;
35932 unsigned long flags;
35933
35934- spin_lock_irqsave(&n->list_lock, flags);
35935+ raw_spin_lock_irqsave(&n->list_lock, flags);
35936
35937 list_for_each_entry(page, &n->partial, lru) {
35938 validate_slab_slab(s, page, map);
35939@@ -4401,7 +4477,7 @@
35940 s->name, count, atomic_long_read(&n->nr_slabs));
35941
35942 out:
35943- spin_unlock_irqrestore(&n->list_lock, flags);
35944+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35945 return count;
35946 }
35947
35948@@ -4589,12 +4665,12 @@
35949 if (!atomic_long_read(&n->nr_slabs))
35950 continue;
35951
35952- spin_lock_irqsave(&n->list_lock, flags);
35953+ raw_spin_lock_irqsave(&n->list_lock, flags);
35954 list_for_each_entry(page, &n->partial, lru)
35955 process_slab(&t, s, page, alloc, map);
35956 list_for_each_entry(page, &n->full, lru)
35957 process_slab(&t, s, page, alloc, map);
35958- spin_unlock_irqrestore(&n->list_lock, flags);
35959+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
35960 }
35961
35962 for (i = 0; i < t.count; i++) {
35963diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/swap.c linux-4.14/mm/swap.c
35964--- linux-4.14.orig/mm/swap.c 2017-11-12 19:46:13.000000000 +0100
35965+++ linux-4.14/mm/swap.c 2018-09-05 11:05:07.000000000 +0200
35966@@ -32,6 +32,7 @@
35967 #include <linux/memcontrol.h>
35968 #include <linux/gfp.h>
35969 #include <linux/uio.h>
35970+#include <linux/locallock.h>
35971 #include <linux/hugetlb.h>
35972 #include <linux/page_idle.h>
35973
35974@@ -50,6 +51,8 @@
35975 #ifdef CONFIG_SMP
35976 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
35977 #endif
35978+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
35979+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
35980
35981 /*
35982 * This path almost never happens for VM activity - pages are normally
35983@@ -252,11 +255,11 @@
35984 unsigned long flags;
35985
35986 get_page(page);
35987- local_irq_save(flags);
35988+ local_lock_irqsave(rotate_lock, flags);
35989 pvec = this_cpu_ptr(&lru_rotate_pvecs);
35990 if (!pagevec_add(pvec, page) || PageCompound(page))
35991 pagevec_move_tail(pvec);
35992- local_irq_restore(flags);
35993+ local_unlock_irqrestore(rotate_lock, flags);
35994 }
35995 }
35996
35997@@ -306,12 +309,13 @@
35998 {
35999 page = compound_head(page);
36000 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36001- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36002+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
36003+ activate_page_pvecs);
36004
36005 get_page(page);
36006 if (!pagevec_add(pvec, page) || PageCompound(page))
36007 pagevec_lru_move_fn(pvec, __activate_page, NULL);
36008- put_cpu_var(activate_page_pvecs);
36009+ put_locked_var(swapvec_lock, activate_page_pvecs);
36010 }
36011 }
36012
36013@@ -338,7 +342,7 @@
36014
36015 static void __lru_cache_activate_page(struct page *page)
36016 {
36017- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36018+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36019 int i;
36020
36021 /*
36022@@ -360,7 +364,7 @@
36023 }
36024 }
36025
36026- put_cpu_var(lru_add_pvec);
36027+ put_locked_var(swapvec_lock, lru_add_pvec);
36028 }
36029
36030 /*
36031@@ -402,12 +406,12 @@
36032
36033 static void __lru_cache_add(struct page *page)
36034 {
36035- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36036+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36037
36038 get_page(page);
36039 if (!pagevec_add(pvec, page) || PageCompound(page))
36040 __pagevec_lru_add(pvec);
36041- put_cpu_var(lru_add_pvec);
36042+ put_locked_var(swapvec_lock, lru_add_pvec);
36043 }
36044
36045 /**
36046@@ -613,9 +617,15 @@
36047 unsigned long flags;
36048
36049 /* No harm done if a racing interrupt already did this */
36050- local_irq_save(flags);
36051+#ifdef CONFIG_PREEMPT_RT_BASE
36052+ local_lock_irqsave_on(rotate_lock, flags, cpu);
36053 pagevec_move_tail(pvec);
36054- local_irq_restore(flags);
36055+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
36056+#else
36057+ local_lock_irqsave(rotate_lock, flags);
36058+ pagevec_move_tail(pvec);
36059+ local_unlock_irqrestore(rotate_lock, flags);
36060+#endif
36061 }
36062
36063 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
36064@@ -647,11 +657,12 @@
36065 return;
36066
36067 if (likely(get_page_unless_zero(page))) {
36068- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
36069+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
36070+ lru_deactivate_file_pvecs);
36071
36072 if (!pagevec_add(pvec, page) || PageCompound(page))
36073 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
36074- put_cpu_var(lru_deactivate_file_pvecs);
36075+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
36076 }
36077 }
36078
36079@@ -666,21 +677,32 @@
36080 {
36081 if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
36082 !PageSwapCache(page) && !PageUnevictable(page)) {
36083- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
36084+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
36085+ lru_lazyfree_pvecs);
36086
36087 get_page(page);
36088 if (!pagevec_add(pvec, page) || PageCompound(page))
36089 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
36090- put_cpu_var(lru_lazyfree_pvecs);
36091+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
36092 }
36093 }
36094
36095 void lru_add_drain(void)
36096 {
36097- lru_add_drain_cpu(get_cpu());
36098- put_cpu();
36099+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
36100+ local_unlock_cpu(swapvec_lock);
36101 }
36102
36103+#ifdef CONFIG_PREEMPT_RT_BASE
36104+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36105+{
36106+ local_lock_on(swapvec_lock, cpu);
36107+ lru_add_drain_cpu(cpu);
36108+ local_unlock_on(swapvec_lock, cpu);
36109+}
36110+
36111+#else
36112+
36113 static void lru_add_drain_per_cpu(struct work_struct *dummy)
36114 {
36115 lru_add_drain();
36116@@ -688,6 +710,16 @@
36117
36118 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
36119
36120+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36121+{
36122+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36123+
36124+ INIT_WORK(work, lru_add_drain_per_cpu);
36125+ queue_work_on(cpu, mm_percpu_wq, work);
36126+ cpumask_set_cpu(cpu, has_work);
36127+}
36128+#endif
36129+
36130 void lru_add_drain_all_cpuslocked(void)
36131 {
36132 static DEFINE_MUTEX(lock);
36133@@ -705,21 +737,19 @@
36134 cpumask_clear(&has_work);
36135
36136 for_each_online_cpu(cpu) {
36137- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36138
36139 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
36140 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
36141 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
36142 pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
36143- need_activate_page_drain(cpu)) {
36144- INIT_WORK(work, lru_add_drain_per_cpu);
36145- queue_work_on(cpu, mm_percpu_wq, work);
36146- cpumask_set_cpu(cpu, &has_work);
36147- }
36148+ need_activate_page_drain(cpu))
36149+ remote_lru_add_drain(cpu, &has_work);
36150 }
36151
36152+#ifndef CONFIG_PREEMPT_RT_BASE
36153 for_each_cpu(cpu, &has_work)
36154 flush_work(&per_cpu(lru_add_drain_work, cpu));
36155+#endif
36156
36157 mutex_unlock(&lock);
36158 }
36159diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/truncate.c linux-4.14/mm/truncate.c
36160--- linux-4.14.orig/mm/truncate.c 2017-11-12 19:46:13.000000000 +0100
36161+++ linux-4.14/mm/truncate.c 2018-09-05 11:05:07.000000000 +0200
36162@@ -41,8 +41,10 @@
36163 goto unlock;
36164 if (*slot != entry)
36165 goto unlock;
36166+ local_lock(shadow_nodes_lock);
36167 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
36168- workingset_update_node, mapping);
36169+ __workingset_update_node, mapping);
36170+ local_unlock(shadow_nodes_lock);
36171 mapping->nrexceptional--;
36172 unlock:
36173 spin_unlock_irq(&mapping->tree_lock);
36174diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmalloc.c linux-4.14/mm/vmalloc.c
36175--- linux-4.14.orig/mm/vmalloc.c 2018-09-05 11:03:25.000000000 +0200
36176+++ linux-4.14/mm/vmalloc.c 2018-09-05 11:05:07.000000000 +0200
36177@@ -865,7 +865,7 @@
36178 struct vmap_block *vb;
36179 struct vmap_area *va;
36180 unsigned long vb_idx;
36181- int node, err;
36182+ int node, err, cpu;
36183 void *vaddr;
36184
36185 node = numa_node_id();
36186@@ -908,11 +908,12 @@
36187 BUG_ON(err);
36188 radix_tree_preload_end();
36189
36190- vbq = &get_cpu_var(vmap_block_queue);
36191+ cpu = get_cpu_light();
36192+ vbq = this_cpu_ptr(&vmap_block_queue);
36193 spin_lock(&vbq->lock);
36194 list_add_tail_rcu(&vb->free_list, &vbq->free);
36195 spin_unlock(&vbq->lock);
36196- put_cpu_var(vmap_block_queue);
36197+ put_cpu_light();
36198
36199 return vaddr;
36200 }
36201@@ -981,6 +982,7 @@
36202 struct vmap_block *vb;
36203 void *vaddr = NULL;
36204 unsigned int order;
36205+ int cpu;
36206
36207 BUG_ON(offset_in_page(size));
36208 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
36209@@ -995,7 +997,8 @@
36210 order = get_order(size);
36211
36212 rcu_read_lock();
36213- vbq = &get_cpu_var(vmap_block_queue);
36214+ cpu = get_cpu_light();
36215+ vbq = this_cpu_ptr(&vmap_block_queue);
36216 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
36217 unsigned long pages_off;
36218
36219@@ -1018,7 +1021,7 @@
36220 break;
36221 }
36222
36223- put_cpu_var(vmap_block_queue);
36224+ put_cpu_light();
36225 rcu_read_unlock();
36226
36227 /* Allocate new block if nothing was found */
36228diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmstat.c linux-4.14/mm/vmstat.c
36229--- linux-4.14.orig/mm/vmstat.c 2017-11-12 19:46:13.000000000 +0100
36230+++ linux-4.14/mm/vmstat.c 2018-09-05 11:05:07.000000000 +0200
36231@@ -249,6 +249,7 @@
36232 long x;
36233 long t;
36234
36235+ preempt_disable_rt();
36236 x = delta + __this_cpu_read(*p);
36237
36238 t = __this_cpu_read(pcp->stat_threshold);
36239@@ -258,6 +259,7 @@
36240 x = 0;
36241 }
36242 __this_cpu_write(*p, x);
36243+ preempt_enable_rt();
36244 }
36245 EXPORT_SYMBOL(__mod_zone_page_state);
36246
36247@@ -269,6 +271,7 @@
36248 long x;
36249 long t;
36250
36251+ preempt_disable_rt();
36252 x = delta + __this_cpu_read(*p);
36253
36254 t = __this_cpu_read(pcp->stat_threshold);
36255@@ -278,6 +281,7 @@
36256 x = 0;
36257 }
36258 __this_cpu_write(*p, x);
36259+ preempt_enable_rt();
36260 }
36261 EXPORT_SYMBOL(__mod_node_page_state);
36262
36263@@ -310,6 +314,7 @@
36264 s8 __percpu *p = pcp->vm_stat_diff + item;
36265 s8 v, t;
36266
36267+ preempt_disable_rt();
36268 v = __this_cpu_inc_return(*p);
36269 t = __this_cpu_read(pcp->stat_threshold);
36270 if (unlikely(v > t)) {
36271@@ -318,6 +323,7 @@
36272 zone_page_state_add(v + overstep, zone, item);
36273 __this_cpu_write(*p, -overstep);
36274 }
36275+ preempt_enable_rt();
36276 }
36277
36278 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36279@@ -326,6 +332,7 @@
36280 s8 __percpu *p = pcp->vm_node_stat_diff + item;
36281 s8 v, t;
36282
36283+ preempt_disable_rt();
36284 v = __this_cpu_inc_return(*p);
36285 t = __this_cpu_read(pcp->stat_threshold);
36286 if (unlikely(v > t)) {
36287@@ -334,6 +341,7 @@
36288 node_page_state_add(v + overstep, pgdat, item);
36289 __this_cpu_write(*p, -overstep);
36290 }
36291+ preempt_enable_rt();
36292 }
36293
36294 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
36295@@ -354,6 +362,7 @@
36296 s8 __percpu *p = pcp->vm_stat_diff + item;
36297 s8 v, t;
36298
36299+ preempt_disable_rt();
36300 v = __this_cpu_dec_return(*p);
36301 t = __this_cpu_read(pcp->stat_threshold);
36302 if (unlikely(v < - t)) {
36303@@ -362,6 +371,7 @@
36304 zone_page_state_add(v - overstep, zone, item);
36305 __this_cpu_write(*p, overstep);
36306 }
36307+ preempt_enable_rt();
36308 }
36309
36310 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36311@@ -370,6 +380,7 @@
36312 s8 __percpu *p = pcp->vm_node_stat_diff + item;
36313 s8 v, t;
36314
36315+ preempt_disable_rt();
36316 v = __this_cpu_dec_return(*p);
36317 t = __this_cpu_read(pcp->stat_threshold);
36318 if (unlikely(v < - t)) {
36319@@ -378,6 +389,7 @@
36320 node_page_state_add(v - overstep, pgdat, item);
36321 __this_cpu_write(*p, overstep);
36322 }
36323+ preempt_enable_rt();
36324 }
36325
36326 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
36327diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/workingset.c linux-4.14/mm/workingset.c
36328--- linux-4.14.orig/mm/workingset.c 2017-11-12 19:46:13.000000000 +0100
36329+++ linux-4.14/mm/workingset.c 2018-09-05 11:05:07.000000000 +0200
36330@@ -338,9 +338,10 @@
36331 * point where they would still be useful.
36332 */
36333
36334-static struct list_lru shadow_nodes;
36335+static struct list_lru __shadow_nodes;
36336+DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
36337
36338-void workingset_update_node(struct radix_tree_node *node, void *private)
36339+void __workingset_update_node(struct radix_tree_node *node, void *private)
36340 {
36341 struct address_space *mapping = private;
36342
36343@@ -358,10 +359,10 @@
36344 */
36345 if (node->count && node->count == node->exceptional) {
36346 if (list_empty(&node->private_list))
36347- list_lru_add(&shadow_nodes, &node->private_list);
36348+ list_lru_add(&__shadow_nodes, &node->private_list);
36349 } else {
36350 if (!list_empty(&node->private_list))
36351- list_lru_del(&shadow_nodes, &node->private_list);
36352+ list_lru_del(&__shadow_nodes, &node->private_list);
36353 }
36354 }
36355
36356@@ -373,9 +374,9 @@
36357 unsigned long cache;
36358
36359 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36360- local_irq_disable();
36361- nodes = list_lru_shrink_count(&shadow_nodes, sc);
36362- local_irq_enable();
36363+ local_lock_irq(shadow_nodes_lock);
36364+ nodes = list_lru_shrink_count(&__shadow_nodes, sc);
36365+ local_unlock_irq(shadow_nodes_lock);
36366
36367 /*
36368 * Approximate a reasonable limit for the radix tree nodes
36369@@ -475,15 +476,15 @@
36370 goto out_invalid;
36371 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
36372 __radix_tree_delete_node(&mapping->page_tree, node,
36373- workingset_update_node, mapping);
36374+ __workingset_update_node, mapping);
36375
36376 out_invalid:
36377 spin_unlock(&mapping->tree_lock);
36378 ret = LRU_REMOVED_RETRY;
36379 out:
36380- local_irq_enable();
36381+ local_unlock_irq(shadow_nodes_lock);
36382 cond_resched();
36383- local_irq_disable();
36384+ local_lock_irq(shadow_nodes_lock);
36385 spin_lock(lru_lock);
36386 return ret;
36387 }
36388@@ -494,9 +495,9 @@
36389 unsigned long ret;
36390
36391 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36392- local_irq_disable();
36393- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
36394- local_irq_enable();
36395+ local_lock_irq(shadow_nodes_lock);
36396+ ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
36397+ local_unlock_irq(shadow_nodes_lock);
36398 return ret;
36399 }
36400
36401@@ -534,7 +535,7 @@
36402 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
36403 timestamp_bits, max_order, bucket_order);
36404
36405- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
36406+ ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
36407 if (ret)
36408 goto err;
36409 ret = register_shrinker(&workingset_shadow_shrinker);
36410@@ -542,7 +543,7 @@
36411 goto err_list_lru;
36412 return 0;
36413 err_list_lru:
36414- list_lru_destroy(&shadow_nodes);
36415+ list_lru_destroy(&__shadow_nodes);
36416 err:
36417 return ret;
36418 }
36419diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/zsmalloc.c linux-4.14/mm/zsmalloc.c
36420--- linux-4.14.orig/mm/zsmalloc.c 2018-09-05 11:03:25.000000000 +0200
36421+++ linux-4.14/mm/zsmalloc.c 2018-09-05 11:05:07.000000000 +0200
36422@@ -53,6 +53,7 @@
36423 #include <linux/mount.h>
36424 #include <linux/migrate.h>
36425 #include <linux/pagemap.h>
36426+#include <linux/locallock.h>
36427
36428 #define ZSPAGE_MAGIC 0x58
36429
36430@@ -70,9 +71,22 @@
36431 */
36432 #define ZS_MAX_ZSPAGE_ORDER 2
36433 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
36434-
36435 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
36436
36437+#ifdef CONFIG_PREEMPT_RT_FULL
36438+
36439+struct zsmalloc_handle {
36440+ unsigned long addr;
36441+ struct mutex lock;
36442+};
36443+
36444+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
36445+
36446+#else
36447+
36448+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
36449+#endif
36450+
36451 /*
36452 * Object location (<PFN>, <obj_idx>) is encoded as
36453 * as single (unsigned long) handle value.
36454@@ -320,7 +334,7 @@
36455
36456 static int create_cache(struct zs_pool *pool)
36457 {
36458- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
36459+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
36460 0, 0, NULL);
36461 if (!pool->handle_cachep)
36462 return 1;
36463@@ -344,9 +358,26 @@
36464
36465 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
36466 {
36467- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
36468- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36469+ void *p;
36470+
36471+ p = kmem_cache_alloc(pool->handle_cachep,
36472+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36473+#ifdef CONFIG_PREEMPT_RT_FULL
36474+ if (p) {
36475+ struct zsmalloc_handle *zh = p;
36476+
36477+ mutex_init(&zh->lock);
36478+ }
36479+#endif
36480+ return (unsigned long)p;
36481+}
36482+
36483+#ifdef CONFIG_PREEMPT_RT_FULL
36484+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
36485+{
36486+ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
36487 }
36488+#endif
36489
36490 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
36491 {
36492@@ -366,12 +397,18 @@
36493
36494 static void record_obj(unsigned long handle, unsigned long obj)
36495 {
36496+#ifdef CONFIG_PREEMPT_RT_FULL
36497+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36498+
36499+ WRITE_ONCE(zh->addr, obj);
36500+#else
36501 /*
36502 * lsb of @obj represents handle lock while other bits
36503 * represent object value the handle is pointing so
36504 * updating shouldn't do store tearing.
36505 */
36506 WRITE_ONCE(*(unsigned long *)handle, obj);
36507+#endif
36508 }
36509
36510 /* zpool driver */
36511@@ -460,6 +497,7 @@
36512
36513 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
36514 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
36515+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
36516
36517 static bool is_zspage_isolated(struct zspage *zspage)
36518 {
36519@@ -898,7 +936,13 @@
36520
36521 static unsigned long handle_to_obj(unsigned long handle)
36522 {
36523+#ifdef CONFIG_PREEMPT_RT_FULL
36524+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36525+
36526+ return zh->addr;
36527+#else
36528 return *(unsigned long *)handle;
36529+#endif
36530 }
36531
36532 static unsigned long obj_to_head(struct page *page, void *obj)
36533@@ -912,22 +956,46 @@
36534
36535 static inline int testpin_tag(unsigned long handle)
36536 {
36537+#ifdef CONFIG_PREEMPT_RT_FULL
36538+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36539+
36540+ return mutex_is_locked(&zh->lock);
36541+#else
36542 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
36543+#endif
36544 }
36545
36546 static inline int trypin_tag(unsigned long handle)
36547 {
36548+#ifdef CONFIG_PREEMPT_RT_FULL
36549+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36550+
36551+ return mutex_trylock(&zh->lock);
36552+#else
36553 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
36554+#endif
36555 }
36556
36557 static void pin_tag(unsigned long handle)
36558 {
36559+#ifdef CONFIG_PREEMPT_RT_FULL
36560+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36561+
36562+ return mutex_lock(&zh->lock);
36563+#else
36564 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
36565+#endif
36566 }
36567
36568 static void unpin_tag(unsigned long handle)
36569 {
36570+#ifdef CONFIG_PREEMPT_RT_FULL
36571+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36572+
36573+ return mutex_unlock(&zh->lock);
36574+#else
36575 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
36576+#endif
36577 }
36578
36579 static void reset_page(struct page *page)
36580@@ -1365,7 +1433,7 @@
36581 class = pool->size_class[class_idx];
36582 off = (class->size * obj_idx) & ~PAGE_MASK;
36583
36584- area = &get_cpu_var(zs_map_area);
36585+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
36586 area->vm_mm = mm;
36587 if (off + class->size <= PAGE_SIZE) {
36588 /* this object is contained entirely within a page */
36589@@ -1419,7 +1487,7 @@
36590
36591 __zs_unmap_object(area, pages, off, class->size);
36592 }
36593- put_cpu_var(zs_map_area);
36594+ put_locked_var(zs_map_area_lock, zs_map_area);
36595
36596 migrate_read_unlock(zspage);
36597 unpin_tag(handle);
36598diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/9p/trans_xen.c linux-4.14/net/9p/trans_xen.c
36599--- linux-4.14.orig/net/9p/trans_xen.c 2018-09-05 11:03:25.000000000 +0200
36600+++ linux-4.14/net/9p/trans_xen.c 2018-09-05 11:05:07.000000000 +0200
36601@@ -38,7 +38,6 @@
36602
36603 #include <linux/module.h>
36604 #include <linux/spinlock.h>
36605-#include <linux/rwlock.h>
36606 #include <net/9p/9p.h>
36607 #include <net/9p/client.h>
36608 #include <net/9p/transport.h>
36609diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/bluetooth/hci_sock.c linux-4.14/net/bluetooth/hci_sock.c
36610--- linux-4.14.orig/net/bluetooth/hci_sock.c 2017-11-12 19:46:13.000000000 +0100
36611+++ linux-4.14/net/bluetooth/hci_sock.c 2018-09-05 11:05:07.000000000 +0200
36612@@ -251,15 +251,13 @@
36613 }
36614
36615 /* Send frame to sockets with specific channel */
36616-void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36617- int flag, struct sock *skip_sk)
36618+static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36619+ int flag, struct sock *skip_sk)
36620 {
36621 struct sock *sk;
36622
36623 BT_DBG("channel %u len %d", channel, skb->len);
36624
36625- read_lock(&hci_sk_list.lock);
36626-
36627 sk_for_each(sk, &hci_sk_list.head) {
36628 struct sk_buff *nskb;
36629
36630@@ -285,6 +283,13 @@
36631 kfree_skb(nskb);
36632 }
36633
36634+}
36635+
36636+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36637+ int flag, struct sock *skip_sk)
36638+{
36639+ read_lock(&hci_sk_list.lock);
36640+ __hci_send_to_channel(channel, skb, flag, skip_sk);
36641 read_unlock(&hci_sk_list.lock);
36642 }
36643
36644@@ -388,8 +393,8 @@
36645 hdr->index = index;
36646 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
36647
36648- hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36649- HCI_SOCK_TRUSTED, NULL);
36650+ __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36651+ HCI_SOCK_TRUSTED, NULL);
36652 kfree_skb(skb);
36653 }
36654
36655diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/can/bcm.c linux-4.14/net/can/bcm.c
36656--- linux-4.14.orig/net/can/bcm.c 2017-11-12 19:46:13.000000000 +0100
36657+++ linux-4.14/net/can/bcm.c 2018-09-05 11:05:07.000000000 +0200
36658@@ -102,7 +102,6 @@
36659 unsigned long frames_abs, frames_filtered;
36660 struct bcm_timeval ival1, ival2;
36661 struct hrtimer timer, thrtimer;
36662- struct tasklet_struct tsklet, thrtsklet;
36663 ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
36664 int rx_ifindex;
36665 int cfsiz;
36666@@ -364,25 +363,34 @@
36667 }
36668 }
36669
36670-static void bcm_tx_start_timer(struct bcm_op *op)
36671+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
36672 {
36673+ ktime_t ival;
36674+
36675 if (op->kt_ival1 && op->count)
36676- hrtimer_start(&op->timer,
36677- ktime_add(ktime_get(), op->kt_ival1),
36678- HRTIMER_MODE_ABS);
36679+ ival = op->kt_ival1;
36680 else if (op->kt_ival2)
36681- hrtimer_start(&op->timer,
36682- ktime_add(ktime_get(), op->kt_ival2),
36683- HRTIMER_MODE_ABS);
36684+ ival = op->kt_ival2;
36685+ else
36686+ return false;
36687+
36688+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
36689+ return true;
36690 }
36691
36692-static void bcm_tx_timeout_tsklet(unsigned long data)
36693+static void bcm_tx_start_timer(struct bcm_op *op)
36694 {
36695- struct bcm_op *op = (struct bcm_op *)data;
36696+ if (bcm_tx_set_expiry(op, &op->timer))
36697+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
36698+}
36699+
36700+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
36701+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36702+{
36703+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36704 struct bcm_msg_head msg_head;
36705
36706 if (op->kt_ival1 && (op->count > 0)) {
36707-
36708 op->count--;
36709 if (!op->count && (op->flags & TX_COUNTEVT)) {
36710
36711@@ -399,22 +407,12 @@
36712 }
36713 bcm_can_tx(op);
36714
36715- } else if (op->kt_ival2)
36716+ } else if (op->kt_ival2) {
36717 bcm_can_tx(op);
36718+ }
36719
36720- bcm_tx_start_timer(op);
36721-}
36722-
36723-/*
36724- * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
36725- */
36726-static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36727-{
36728- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36729-
36730- tasklet_schedule(&op->tsklet);
36731-
36732- return HRTIMER_NORESTART;
36733+ return bcm_tx_set_expiry(op, &op->timer) ?
36734+ HRTIMER_RESTART : HRTIMER_NORESTART;
36735 }
36736
36737 /*
36738@@ -480,7 +478,7 @@
36739 /* do not send the saved data - only start throttle timer */
36740 hrtimer_start(&op->thrtimer,
36741 ktime_add(op->kt_lastmsg, op->kt_ival2),
36742- HRTIMER_MODE_ABS);
36743+ HRTIMER_MODE_ABS_SOFT);
36744 return;
36745 }
36746
36747@@ -539,14 +537,21 @@
36748 return;
36749
36750 if (op->kt_ival1)
36751- hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
36752+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
36753 }
36754
36755-static void bcm_rx_timeout_tsklet(unsigned long data)
36756+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
36757+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36758 {
36759- struct bcm_op *op = (struct bcm_op *)data;
36760+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36761 struct bcm_msg_head msg_head;
36762
36763+ /* if user wants to be informed, when cyclic CAN-Messages come back */
36764+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36765+ /* clear received CAN frames to indicate 'nothing received' */
36766+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
36767+ }
36768+
36769 /* create notification to user */
36770 msg_head.opcode = RX_TIMEOUT;
36771 msg_head.flags = op->flags;
36772@@ -557,25 +562,6 @@
36773 msg_head.nframes = 0;
36774
36775 bcm_send_to_user(op, &msg_head, NULL, 0);
36776-}
36777-
36778-/*
36779- * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
36780- */
36781-static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36782-{
36783- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36784-
36785- /* schedule before NET_RX_SOFTIRQ */
36786- tasklet_hi_schedule(&op->tsklet);
36787-
36788- /* no restart of the timer is done here! */
36789-
36790- /* if user wants to be informed, when cyclic CAN-Messages come back */
36791- if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36792- /* clear received CAN frames to indicate 'nothing received' */
36793- memset(op->last_frames, 0, op->nframes * op->cfsiz);
36794- }
36795
36796 return HRTIMER_NORESTART;
36797 }
36798@@ -583,14 +569,12 @@
36799 /*
36800 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
36801 */
36802-static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
36803- unsigned int index)
36804+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
36805 {
36806 struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
36807
36808 if ((op->last_frames) && (lcf->flags & RX_THR)) {
36809- if (update)
36810- bcm_rx_changed(op, lcf);
36811+ bcm_rx_changed(op, lcf);
36812 return 1;
36813 }
36814 return 0;
36815@@ -598,11 +582,8 @@
36816
36817 /*
36818 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
36819- *
36820- * update == 0 : just check if throttled data is available (any irq context)
36821- * update == 1 : check and send throttled data to userspace (soft_irq context)
36822 */
36823-static int bcm_rx_thr_flush(struct bcm_op *op, int update)
36824+static int bcm_rx_thr_flush(struct bcm_op *op)
36825 {
36826 int updated = 0;
36827
36828@@ -611,24 +592,16 @@
36829
36830 /* for MUX filter we start at index 1 */
36831 for (i = 1; i < op->nframes; i++)
36832- updated += bcm_rx_do_flush(op, update, i);
36833+ updated += bcm_rx_do_flush(op, i);
36834
36835 } else {
36836 /* for RX_FILTER_ID and simple filter */
36837- updated += bcm_rx_do_flush(op, update, 0);
36838+ updated += bcm_rx_do_flush(op, 0);
36839 }
36840
36841 return updated;
36842 }
36843
36844-static void bcm_rx_thr_tsklet(unsigned long data)
36845-{
36846- struct bcm_op *op = (struct bcm_op *)data;
36847-
36848- /* push the changed data to the userspace */
36849- bcm_rx_thr_flush(op, 1);
36850-}
36851-
36852 /*
36853 * bcm_rx_thr_handler - the time for blocked content updates is over now:
36854 * Check for throttled data and send it to the userspace
36855@@ -637,9 +610,7 @@
36856 {
36857 struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
36858
36859- tasklet_schedule(&op->thrtsklet);
36860-
36861- if (bcm_rx_thr_flush(op, 0)) {
36862+ if (bcm_rx_thr_flush(op)) {
36863 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
36864 return HRTIMER_RESTART;
36865 } else {
36866@@ -735,23 +706,8 @@
36867
36868 static void bcm_remove_op(struct bcm_op *op)
36869 {
36870- if (op->tsklet.func) {
36871- while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
36872- test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
36873- hrtimer_active(&op->timer)) {
36874- hrtimer_cancel(&op->timer);
36875- tasklet_kill(&op->tsklet);
36876- }
36877- }
36878-
36879- if (op->thrtsklet.func) {
36880- while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
36881- test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
36882- hrtimer_active(&op->thrtimer)) {
36883- hrtimer_cancel(&op->thrtimer);
36884- tasklet_kill(&op->thrtsklet);
36885- }
36886- }
36887+ hrtimer_cancel(&op->timer);
36888+ hrtimer_cancel(&op->thrtimer);
36889
36890 if ((op->frames) && (op->frames != &op->sframe))
36891 kfree(op->frames);
36892@@ -979,15 +935,13 @@
36893 op->ifindex = ifindex;
36894
36895 /* initialize uninitialized (kzalloc) structure */
36896- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36897+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36898+ HRTIMER_MODE_REL_SOFT);
36899 op->timer.function = bcm_tx_timeout_handler;
36900
36901- /* initialize tasklet for tx countevent notification */
36902- tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
36903- (unsigned long) op);
36904-
36905 /* currently unused in tx_ops */
36906- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36907+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36908+ HRTIMER_MODE_REL_SOFT);
36909
36910 /* add this bcm_op to the list of the tx_ops */
36911 list_add(&op->list, &bo->tx_ops);
36912@@ -1150,20 +1104,14 @@
36913 op->rx_ifindex = ifindex;
36914
36915 /* initialize uninitialized (kzalloc) structure */
36916- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36917+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36918+ HRTIMER_MODE_REL_SOFT);
36919 op->timer.function = bcm_rx_timeout_handler;
36920
36921- /* initialize tasklet for rx timeout notification */
36922- tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
36923- (unsigned long) op);
36924-
36925- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36926+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36927+ HRTIMER_MODE_REL_SOFT);
36928 op->thrtimer.function = bcm_rx_thr_handler;
36929
36930- /* initialize tasklet for rx throttle handling */
36931- tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
36932- (unsigned long) op);
36933-
36934 /* add this bcm_op to the list of the rx_ops */
36935 list_add(&op->list, &bo->rx_ops);
36936
36937@@ -1209,12 +1157,12 @@
36938 */
36939 op->kt_lastmsg = 0;
36940 hrtimer_cancel(&op->thrtimer);
36941- bcm_rx_thr_flush(op, 1);
36942+ bcm_rx_thr_flush(op);
36943 }
36944
36945 if ((op->flags & STARTTIMER) && op->kt_ival1)
36946 hrtimer_start(&op->timer, op->kt_ival1,
36947- HRTIMER_MODE_REL);
36948+ HRTIMER_MODE_REL_SOFT);
36949 }
36950
36951 /* now we can register for can_ids, if we added a new bcm_op */
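The net/can/bcm.c conversion above removes the tx/rx tasklets and lets the hrtimer callbacks do the work directly, rearming by returning HRTIMER_RESTART; initializing the timers with the HRTIMER_MODE_*_SOFT variants keeps the callbacks in softirq context, which is what the dropped tasklets used to provide. A condensed sketch of that pattern, with illustrative names (struct my_op, do_periodic_work() and my_op_start() are stand-ins, not the bcm code):

#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct my_op {				/* stand-in for struct bcm_op */
	struct hrtimer timer;
	ktime_t interval;
};

static void do_periodic_work(struct my_op *op)
{
	/* work that previously lived in the tasklet callback */
}

static enum hrtimer_restart periodic_cb(struct hrtimer *t)
{
	struct my_op *op = container_of(t, struct my_op, timer);

	do_periodic_work(op);
	hrtimer_forward_now(t, op->interval);	/* rearm instead of tasklet_schedule() */
	return HRTIMER_RESTART;
}

static void my_op_start(struct my_op *op)
{
	hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	op->timer.function = periodic_cb;
	hrtimer_start(&op->timer, op->interval, HRTIMER_MODE_REL_SOFT);
}

Cancellation then reduces to a plain hrtimer_cancel(), which is why bcm_remove_op() above no longer needs the tasklet_kill() retry loops.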
36952diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/dev.c linux-4.14/net/core/dev.c
36953--- linux-4.14.orig/net/core/dev.c 2018-09-05 11:03:25.000000000 +0200
36954+++ linux-4.14/net/core/dev.c 2018-09-05 11:05:07.000000000 +0200
36955@@ -195,6 +195,7 @@
36956 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
36957
36958 static seqcount_t devnet_rename_seq;
36959+static DEFINE_MUTEX(devnet_rename_mutex);
36960
36961 static inline void dev_base_seq_inc(struct net *net)
36962 {
e4b2b4a8 36963@@ -217,14 +218,14 @@
36964 static inline void rps_lock(struct softnet_data *sd)
36965 {
36966 #ifdef CONFIG_RPS
36967- spin_lock(&sd->input_pkt_queue.lock);
36968+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
36969 #endif
36970 }
36971
36972 static inline void rps_unlock(struct softnet_data *sd)
36973 {
36974 #ifdef CONFIG_RPS
36975- spin_unlock(&sd->input_pkt_queue.lock);
36976+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
36977 #endif
36978 }
36979
e4b2b4a8 36980@@ -920,7 +921,8 @@
36981 strcpy(name, dev->name);
36982 rcu_read_unlock();
36983 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
36984- cond_resched();
36985+ mutex_lock(&devnet_rename_mutex);
36986+ mutex_unlock(&devnet_rename_mutex);
36987 goto retry;
36988 }
36989
e4b2b4a8 36990@@ -1189,20 +1191,17 @@
36991 if (dev->flags & IFF_UP)
36992 return -EBUSY;
36993
36994- write_seqcount_begin(&devnet_rename_seq);
36995+ mutex_lock(&devnet_rename_mutex);
36996+ __raw_write_seqcount_begin(&devnet_rename_seq);
36997
36998- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
36999- write_seqcount_end(&devnet_rename_seq);
37000- return 0;
37001- }
37002+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
37003+ goto outunlock;
37004
37005 memcpy(oldname, dev->name, IFNAMSIZ);
37006
37007 err = dev_get_valid_name(net, dev, newname);
37008- if (err < 0) {
37009- write_seqcount_end(&devnet_rename_seq);
37010- return err;
37011- }
37012+ if (err < 0)
37013+ goto outunlock;
37014
37015 if (oldname[0] && !strchr(oldname, '%'))
37016 netdev_info(dev, "renamed from %s\n", oldname);
e4b2b4a8 37017@@ -1215,11 +1214,12 @@
37018 if (ret) {
37019 memcpy(dev->name, oldname, IFNAMSIZ);
37020 dev->name_assign_type = old_assign_type;
37021- write_seqcount_end(&devnet_rename_seq);
37022- return ret;
37023+ err = ret;
37024+ goto outunlock;
37025 }
37026
37027- write_seqcount_end(&devnet_rename_seq);
37028+ __raw_write_seqcount_end(&devnet_rename_seq);
37029+ mutex_unlock(&devnet_rename_mutex);
37030
37031 netdev_adjacent_rename_links(dev, oldname);
37032
e4b2b4a8 37033@@ -1240,7 +1240,8 @@
37034 /* err >= 0 after dev_alloc_name() or stores the first errno */
37035 if (err >= 0) {
37036 err = ret;
37037- write_seqcount_begin(&devnet_rename_seq);
37038+ mutex_lock(&devnet_rename_mutex);
37039+ __raw_write_seqcount_begin(&devnet_rename_seq);
37040 memcpy(dev->name, oldname, IFNAMSIZ);
37041 memcpy(oldname, newname, IFNAMSIZ);
37042 dev->name_assign_type = old_assign_type;
e4b2b4a8 37043@@ -1253,6 +1254,11 @@
37044 }
37045
37046 return err;
37047+
37048+outunlock:
37049+ __raw_write_seqcount_end(&devnet_rename_seq);
37050+ mutex_unlock(&devnet_rename_mutex);
37051+ return err;
37052 }
37053
37054 /**
e4b2b4a8 37055@@ -2438,6 +2444,7 @@
37056 sd->output_queue_tailp = &q->next_sched;
37057 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37058 local_irq_restore(flags);
37059+ preempt_check_resched_rt();
37060 }
37061
37062 void __netif_schedule(struct Qdisc *q)
e4b2b4a8 37063@@ -2500,6 +2507,7 @@
37064 __this_cpu_write(softnet_data.completion_queue, skb);
37065 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37066 local_irq_restore(flags);
37067+ preempt_check_resched_rt();
37068 }
37069 EXPORT_SYMBOL(__dev_kfree_skb_irq);
37070
e4b2b4a8 37071@@ -3175,7 +3183,11 @@
37072 * This permits qdisc->running owner to get the lock more
37073 * often and dequeue packets faster.
37074 */
37075+#ifdef CONFIG_PREEMPT_RT_FULL
37076+ contended = true;
37077+#else
37078 contended = qdisc_is_running(q);
37079+#endif
37080 if (unlikely(contended))
37081 spin_lock(&q->busylock);
37082
e4b2b4a8 37083@@ -3246,8 +3258,10 @@
37084 #define skb_update_prio(skb)
37085 #endif
37086
37087+#ifndef CONFIG_PREEMPT_RT_FULL
37088 DEFINE_PER_CPU(int, xmit_recursion);
37089 EXPORT_SYMBOL(xmit_recursion);
37090+#endif
37091
37092 /**
37093 * dev_loopback_xmit - loop back @skb
37094@@ -3487,9 +3501,12 @@
37095 if (dev->flags & IFF_UP) {
37096 int cpu = smp_processor_id(); /* ok because BHs are off */
37097
37098+#ifdef CONFIG_PREEMPT_RT_FULL
37099+ if (txq->xmit_lock_owner != current) {
37100+#else
37101 if (txq->xmit_lock_owner != cpu) {
37102- if (unlikely(__this_cpu_read(xmit_recursion) >
37103- XMIT_RECURSION_LIMIT))
e4b2b4a8 37104+#endif
37105+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
37106 goto recursion_alert;
37107
37108 skb = validate_xmit_skb(skb, dev);
e4b2b4a8 37109@@ -3499,9 +3516,9 @@
37110 HARD_TX_LOCK(dev, txq, cpu);
37111
37112 if (!netif_xmit_stopped(txq)) {
37113- __this_cpu_inc(xmit_recursion);
37114+ xmit_rec_inc();
37115 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
37116- __this_cpu_dec(xmit_recursion);
37117+ xmit_rec_dec();
37118 if (dev_xmit_complete(rc)) {
37119 HARD_TX_UNLOCK(dev, txq);
37120 goto out;
e4b2b4a8 37121@@ -3882,6 +3899,7 @@
37122 rps_unlock(sd);
37123
37124 local_irq_restore(flags);
37125+ preempt_check_resched_rt();
37126
37127 atomic_long_inc(&skb->dev->rx_dropped);
37128 kfree_skb(skb);
e4b2b4a8 37129@@ -4034,7 +4052,7 @@
37130 struct rps_dev_flow voidflow, *rflow = &voidflow;
37131 int cpu;
37132
37133- preempt_disable();
37134+ migrate_disable();
37135 rcu_read_lock();
37136
37137 cpu = get_rps_cpu(skb->dev, skb, &rflow);
e4b2b4a8 37138@@ -4044,14 +4062,14 @@
37139 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
37140
37141 rcu_read_unlock();
37142- preempt_enable();
37143+ migrate_enable();
37144 } else
37145 #endif
37146 {
37147 unsigned int qtail;
e4b2b4a8 37148
37149- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
37150- put_cpu();
37151+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
37152+ put_cpu_light();
37153 }
37154 return ret;
37155 }
e4b2b4a8 37156@@ -4085,11 +4103,9 @@
37157
37158 trace_netif_rx_ni_entry(skb);
37159
37160- preempt_disable();
37161+ local_bh_disable();
37162 err = netif_rx_internal(skb);
37163- if (local_softirq_pending())
37164- do_softirq();
37165- preempt_enable();
37166+ local_bh_enable();
37167
37168 return err;
37169 }
e4b2b4a8 37170@@ -4607,7 +4623,7 @@
1a6e0f06 37171 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
c7c16703 37172 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37173 __skb_unlink(skb, &sd->input_pkt_queue);
37174- kfree_skb(skb);
37175+ __skb_queue_tail(&sd->tofree_queue, skb);
37176 input_queue_head_incr(sd);
37177 }
37178 }
e4b2b4a8 37179@@ -4617,11 +4633,14 @@
1a6e0f06 37180 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
c7c16703 37181 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37182 __skb_unlink(skb, &sd->process_queue);
37183- kfree_skb(skb);
37184+ __skb_queue_tail(&sd->tofree_queue, skb);
37185 input_queue_head_incr(sd);
37186 }
37187 }
37188+ if (!skb_queue_empty(&sd->tofree_queue))
37189+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
37190 local_bh_enable();
37191+
37192 }
37193
c7c16703 37194 static void flush_all_backlogs(void)
e4b2b4a8 37195@@ -5131,12 +5150,14 @@
37196 sd->rps_ipi_list = NULL;
37197
37198 local_irq_enable();
37199+ preempt_check_resched_rt();
37200
37201 /* Send pending IPI's to kick RPS processing on remote cpus. */
e4b2b4a8 37202 net_rps_send_ipi(remsd);
37203 } else
37204 #endif
37205 local_irq_enable();
37206+ preempt_check_resched_rt();
37207 }
37208
37209 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
e4b2b4a8 37210@@ -5166,7 +5187,9 @@
37211 while (again) {
37212 struct sk_buff *skb;
37213
37214+ local_irq_disable();
37215 while ((skb = __skb_dequeue(&sd->process_queue))) {
37216+ local_irq_enable();
37217 rcu_read_lock();
37218 __netif_receive_skb(skb);
37219 rcu_read_unlock();
e4b2b4a8 37220@@ -5174,9 +5197,9 @@
37221 if (++work >= quota)
37222 return work;
37223
37224+ local_irq_disable();
37225 }
37226
37227- local_irq_disable();
37228 rps_lock(sd);
37229 if (skb_queue_empty(&sd->input_pkt_queue)) {
37230 /*
e4b2b4a8 37231@@ -5214,6 +5237,7 @@
37232 local_irq_save(flags);
37233 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37234 local_irq_restore(flags);
37235+ preempt_check_resched_rt();
37236 }
37237 EXPORT_SYMBOL(__napi_schedule);
37238
37239@@ -5250,6 +5274,7 @@
37240 }
37241 EXPORT_SYMBOL(napi_schedule_prep);
37242
37243+#ifndef CONFIG_PREEMPT_RT_FULL
37244 /**
37245 * __napi_schedule_irqoff - schedule for receive
37246 * @n: entry to schedule
e4b2b4a8 37247@@ -5261,6 +5286,7 @@
37248 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37249 }
37250 EXPORT_SYMBOL(__napi_schedule_irqoff);
37251+#endif
37252
e4b2b4a8 37253 bool napi_complete_done(struct napi_struct *n, int work_done)
c7c16703 37254 {
37255@@ -5615,13 +5641,21 @@
37256 unsigned long time_limit = jiffies +
37257 usecs_to_jiffies(netdev_budget_usecs);
37258 int budget = netdev_budget;
37259+ struct sk_buff_head tofree_q;
37260+ struct sk_buff *skb;
37261 LIST_HEAD(list);
37262 LIST_HEAD(repoll);
37263
37264+ __skb_queue_head_init(&tofree_q);
37265+
37266 local_irq_disable();
37267+ skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
37268 list_splice_init(&sd->poll_list, &list);
37269 local_irq_enable();
37270
37271+ while ((skb = __skb_dequeue(&tofree_q)))
37272+ kfree_skb(skb);
37273+
37274 for (;;) {
37275 struct napi_struct *n;
37276
e4b2b4a8 37277@@ -5651,7 +5685,7 @@
37278 list_splice_tail(&repoll, &list);
37279 list_splice(&list, &sd->poll_list);
37280 if (!list_empty(&sd->poll_list))
37281- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
37282+ __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
37283
37284 net_rps_action_and_irq_enable(sd);
37285 out:
37286@@ -7478,7 +7512,7 @@
37287 /* Initialize queue lock */
37288 spin_lock_init(&queue->_xmit_lock);
37289 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
37290- queue->xmit_lock_owner = -1;
37291+ netdev_queue_clear_owner(queue);
37292 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
37293 queue->dev = dev;
37294 #ifdef CONFIG_BQL
37295@@ -8418,6 +8452,7 @@
37296
37297 raise_softirq_irqoff(NET_TX_SOFTIRQ);
37298 local_irq_enable();
37299+ preempt_check_resched_rt();
37300
37301 #ifdef CONFIG_RPS
37302 remsd = oldsd->rps_ipi_list;
37303@@ -8431,10 +8466,13 @@
37304 netif_rx_ni(skb);
37305 input_queue_head_incr(oldsd);
37306 }
37307- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
37308+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
37309 netif_rx_ni(skb);
37310 input_queue_head_incr(oldsd);
37311 }
37312+ while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
37313+ kfree_skb(skb);
37314+ }
37315
e4b2b4a8 37316 return 0;
1a6e0f06 37317 }
e4b2b4a8 37318@@ -8738,8 +8776,9 @@
37319
37320 INIT_WORK(flush, flush_backlog);
37321
37322- skb_queue_head_init(&sd->input_pkt_queue);
37323- skb_queue_head_init(&sd->process_queue);
37324+ skb_queue_head_init_raw(&sd->input_pkt_queue);
37325+ skb_queue_head_init_raw(&sd->process_queue);
37326+ skb_queue_head_init_raw(&sd->tofree_queue);
37327 INIT_LIST_HEAD(&sd->poll_list);
37328 sd->output_queue_tailp = &sd->output_queue;
37329 #ifdef CONFIG_RPS
37330diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/filter.c linux-4.14/net/core/filter.c
37331--- linux-4.14.orig/net/core/filter.c 2018-09-05 11:03:25.000000000 +0200
37332+++ linux-4.14/net/core/filter.c 2018-09-05 11:05:07.000000000 +0200
37333@@ -1696,7 +1696,7 @@
37334 {
37335 int ret;
37336
37337- if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
37338+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
37339 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
37340 kfree_skb(skb);
37341 return -ENETDOWN;
e4b2b4a8 37342@@ -1704,9 +1704,9 @@
37343
37344 skb->dev = dev;
37345
37346- __this_cpu_inc(xmit_recursion);
37347+ xmit_rec_inc();
37348 ret = dev_queue_xmit(skb);
37349- __this_cpu_dec(xmit_recursion);
37350+ xmit_rec_dec();
37351
37352 return ret;
37353 }
37354diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_estimator.c linux-4.14/net/core/gen_estimator.c
37355--- linux-4.14.orig/net/core/gen_estimator.c 2018-09-05 11:03:25.000000000 +0200
37356+++ linux-4.14/net/core/gen_estimator.c 2018-09-05 11:05:07.000000000 +0200
37357@@ -46,7 +46,7 @@
37358 struct net_rate_estimator {
1a6e0f06 37359 struct gnet_stats_basic_packed *bstats;
37360 spinlock_t *stats_lock;
37361- seqcount_t *running;
37362+ net_seqlock_t *running;
37363 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
37364 u8 ewma_log;
37365 u8 intvl_log; /* period : (250ms << intvl_log) */
37366@@ -129,7 +129,7 @@
1a6e0f06 37367 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
e4b2b4a8 37368 struct net_rate_estimator __rcu **rate_est,
37369 spinlock_t *stats_lock,
37370- seqcount_t *running,
37371+ net_seqlock_t *running,
37372 struct nlattr *opt)
37373 {
37374 struct gnet_estimator *parm = nla_data(opt);
37375@@ -222,7 +222,7 @@
1a6e0f06 37376 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
e4b2b4a8 37377 struct net_rate_estimator __rcu **rate_est,
37378 spinlock_t *stats_lock,
37379- seqcount_t *running, struct nlattr *opt)
37380+ net_seqlock_t *running, struct nlattr *opt)
37381 {
37382 return gen_new_estimator(bstats, cpu_bstats, rate_est,
37383 stats_lock, running, opt);
37384diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_stats.c linux-4.14/net/core/gen_stats.c
37385--- linux-4.14.orig/net/core/gen_stats.c 2018-09-05 11:03:25.000000000 +0200
37386+++ linux-4.14/net/core/gen_stats.c 2018-09-05 11:05:07.000000000 +0200
37387@@ -142,7 +142,7 @@
37388 }
37389
37390 void
37391-__gnet_stats_copy_basic(const seqcount_t *running,
37392+__gnet_stats_copy_basic(net_seqlock_t *running,
37393 struct gnet_stats_basic_packed *bstats,
37394 struct gnet_stats_basic_cpu __percpu *cpu,
37395 struct gnet_stats_basic_packed *b)
e4b2b4a8 37396@@ -155,10 +155,10 @@
37397 }
37398 do {
37399 if (running)
37400- seq = read_seqcount_begin(running);
37401+ seq = net_seq_begin(running);
37402 bstats->bytes = b->bytes;
37403 bstats->packets = b->packets;
37404- } while (running && read_seqcount_retry(running, seq));
37405+ } while (running && net_seq_retry(running, seq));
37406 }
37407 EXPORT_SYMBOL(__gnet_stats_copy_basic);
37408
e4b2b4a8 37409@@ -176,7 +176,7 @@
37410 * if the room in the socket buffer was not sufficient.
37411 */
37412 int
37413-gnet_stats_copy_basic(const seqcount_t *running,
37414+gnet_stats_copy_basic(net_seqlock_t *running,
37415 struct gnet_dump *d,
37416 struct gnet_stats_basic_cpu __percpu *cpu,
37417 struct gnet_stats_basic_packed *b)
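The gen_estimator.c and gen_stats.c hunks above (and the sch_api.c hunk further down) change the qdisc "running" tracking from a bare seqcount_t to net_seqlock_t, a type introduced elsewhere in this patch: on !RT it stays a seqcount, on RT it becomes a full seqlock so the write side is serialized by a sleeping lock. The reader macros used above are assumed to expand roughly as follows (reconstructed, not copied from the tree):

#ifdef CONFIG_PREEMPT_RT_BASE
typedef seqlock_t net_seqlock_t;
# define net_seq_begin(l)	read_seqbegin(l)
# define net_seq_retry(l, s)	read_seqretry(l, s)
#else
typedef seqcount_t net_seqlock_t;
# define net_seq_begin(l)	read_seqcount_begin(l)
# define net_seq_retry(l, s)	read_seqcount_retry(l, s)
#endif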
37418diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/pktgen.c linux-4.14/net/core/pktgen.c
37419--- linux-4.14.orig/net/core/pktgen.c 2017-11-12 19:46:13.000000000 +0100
37420+++ linux-4.14/net/core/pktgen.c 2018-09-05 11:05:07.000000000 +0200
37421@@ -2252,7 +2252,8 @@
37422 s64 remaining;
37423 struct hrtimer_sleeper t;
37424
37425- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
37426+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
37427+ current);
37428 hrtimer_set_expires(&t.timer, spin_until);
37429
37430 remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
37431@@ -2267,7 +2268,6 @@
37432 } while (ktime_compare(end_time, spin_until) < 0);
37433 } else {
37434 /* see do_nanosleep */
37435- hrtimer_init_sleeper(&t, current);
37436 do {
37437 set_current_state(TASK_INTERRUPTIBLE);
37438 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
37439diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/skbuff.c linux-4.14/net/core/skbuff.c
37440--- linux-4.14.orig/net/core/skbuff.c 2018-09-05 11:03:25.000000000 +0200
37441+++ linux-4.14/net/core/skbuff.c 2018-09-05 11:05:07.000000000 +0200
37442@@ -63,6 +63,7 @@
37443 #include <linux/errqueue.h>
37444 #include <linux/prefetch.h>
37445 #include <linux/if_vlan.h>
37446+#include <linux/locallock.h>
37447
37448 #include <net/protocol.h>
37449 #include <net/dst.h>
e4b2b4a8 37450@@ -330,6 +331,8 @@
37451
37452 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
37453 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
37454+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
37455+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
37456
37457 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37458 {
e4b2b4a8 37459@@ -337,10 +340,10 @@
37460 unsigned long flags;
37461 void *data;
37462
37463- local_irq_save(flags);
37464+ local_lock_irqsave(netdev_alloc_lock, flags);
37465 nc = this_cpu_ptr(&netdev_alloc_cache);
e4b2b4a8 37466 data = page_frag_alloc(nc, fragsz, gfp_mask);
37467- local_irq_restore(flags);
37468+ local_unlock_irqrestore(netdev_alloc_lock, flags);
37469 return data;
37470 }
37471
e4b2b4a8 37472@@ -359,9 +362,13 @@
37473
37474 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37475 {
37476- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37477+ struct napi_alloc_cache *nc;
37478+ void *data;
37479
e4b2b4a8 37480- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
1a6e0f06 37481+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
e4b2b4a8 37482+ data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
37483+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37484+ return data;
37485 }
37486
37487 void *napi_alloc_frag(unsigned int fragsz)
e4b2b4a8 37488@@ -408,13 +415,13 @@
37489 if (sk_memalloc_socks())
37490 gfp_mask |= __GFP_MEMALLOC;
37491
37492- local_irq_save(flags);
37493+ local_lock_irqsave(netdev_alloc_lock, flags);
37494
37495 nc = this_cpu_ptr(&netdev_alloc_cache);
e4b2b4a8 37496 data = page_frag_alloc(nc, len, gfp_mask);
37497 pfmemalloc = nc->pfmemalloc;
37498
37499- local_irq_restore(flags);
37500+ local_unlock_irqrestore(netdev_alloc_lock, flags);
37501
37502 if (unlikely(!data))
37503 return NULL;
e4b2b4a8 37504@@ -455,9 +462,10 @@
37505 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
37506 gfp_t gfp_mask)
37507 {
37508- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37509+ struct napi_alloc_cache *nc;
37510 struct sk_buff *skb;
37511 void *data;
37512+ bool pfmemalloc;
37513
37514 len += NET_SKB_PAD + NET_IP_ALIGN;
37515
e4b2b4a8 37516@@ -475,7 +483,10 @@
37517 if (sk_memalloc_socks())
37518 gfp_mask |= __GFP_MEMALLOC;
37519
37520+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
e4b2b4a8 37521 data = page_frag_alloc(&nc->page, len, gfp_mask);
37522+ pfmemalloc = nc->page.pfmemalloc;
37523+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37524 if (unlikely(!data))
37525 return NULL;
37526
e4b2b4a8 37527@@ -486,7 +497,7 @@
37528 }
37529
37530 /* use OR instead of assignment to avoid clearing of bits in mask */
37531- if (nc->page.pfmemalloc)
37532+ if (pfmemalloc)
37533 skb->pfmemalloc = 1;
37534 skb->head_frag = 1;
37535
e4b2b4a8 37536@@ -718,23 +729,26 @@
1a6e0f06
JK
37537
37538 void __kfree_skb_flush(void)
37539 {
37540- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37541+ struct napi_alloc_cache *nc;
37542
37543+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37544 /* flush skb_cache if containing objects */
37545 if (nc->skb_count) {
37546 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
37547 nc->skb_cache);
37548 nc->skb_count = 0;
37549 }
37550+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37551 }
37552
37553 static inline void _kfree_skb_defer(struct sk_buff *skb)
37554 {
37555- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37556+ struct napi_alloc_cache *nc;
37557
37558 /* drop skb->head and call any destructors for packet */
37559 skb_release_all(skb);
37560
37561+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37562 /* record skb to CPU local list */
37563 nc->skb_cache[nc->skb_count++] = skb;
37564
e4b2b4a8 37565@@ -749,6 +763,7 @@
37566 nc->skb_cache);
37567 nc->skb_count = 0;
37568 }
37569+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37570 }
37571 void __kfree_skb_defer(struct sk_buff *skb)
37572 {
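The skbuff.c hunks above put the per-CPU fragment and skb caches behind local locks instead of bare local_irq_save()/this_cpu_ptr(): on !RT a local lock still just disables interrupts, but on RT it is a per-CPU sleeping lock, so the allocation paths stay preemptible. A minimal sketch of the pattern with illustrative names (my_cache_lock, struct my_cache and use_cache() are not from the tree):

#include <linux/locallock.h>
#include <linux/percpu.h>

struct my_cache { int count; };

static DEFINE_PER_CPU(struct my_cache, my_cache);
static DEFINE_LOCAL_IRQ_LOCK(my_cache_lock);

static void use_cache(void)
{
	struct my_cache *c;
	unsigned long flags;

	local_lock_irqsave(my_cache_lock, flags);	/* plain local_irq_save() on !RT */
	c = this_cpu_ptr(&my_cache);
	c->count++;					/* touch the per-CPU data */
	local_unlock_irqrestore(my_cache_lock, flags);
}

get_locked_var()/put_locked_var(), used in the napi_alloc_cache hunks, are the same idea expressed as "take the local lock and hand back the per-CPU variable".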
37573diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/sock.c linux-4.14/net/core/sock.c
37574--- linux-4.14.orig/net/core/sock.c 2018-09-05 11:03:25.000000000 +0200
37575+++ linux-4.14/net/core/sock.c 2018-09-05 11:05:07.000000000 +0200
37576@@ -2757,12 +2757,11 @@
1a6e0f06
JK
37577 if (sk->sk_lock.owned)
37578 __lock_sock(sk);
37579 sk->sk_lock.owned = 1;
37580- spin_unlock(&sk->sk_lock.slock);
37581+ spin_unlock_bh(&sk->sk_lock.slock);
37582 /*
37583 * The sk_lock has mutex_lock() semantics here:
37584 */
37585 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
37586- local_bh_enable();
37587 }
37588 EXPORT_SYMBOL(lock_sock_nested);
37589
37590diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/icmp.c linux-4.14/net/ipv4/icmp.c
37591--- linux-4.14.orig/net/ipv4/icmp.c 2018-09-05 11:03:25.000000000 +0200
37592+++ linux-4.14/net/ipv4/icmp.c 2018-09-05 11:05:07.000000000 +0200
37593@@ -77,6 +77,7 @@
1a6e0f06
JK
37594 #include <linux/string.h>
37595 #include <linux/netfilter_ipv4.h>
37596 #include <linux/slab.h>
37597+#include <linux/locallock.h>
37598 #include <net/snmp.h>
37599 #include <net/ip.h>
37600 #include <net/route.h>
e4b2b4a8 37601@@ -204,6 +205,8 @@
37602 *
37603 * On SMP we have one ICMP socket per-cpu.
37604 */
37605+static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
37606+
37607 static struct sock *icmp_sk(struct net *net)
37608 {
37609 return *this_cpu_ptr(net->ipv4.icmp_sk);
37610@@ -214,12 +217,16 @@
37611 {
37612 struct sock *sk;
1a6e0f06 37613
37614+ if (!local_trylock(icmp_sk_lock))
37615+ return NULL;
37616+
37617 sk = icmp_sk(net);
37618
37619 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
37620 /* This can happen if the output path signals a
37621 * dst_link_failure() for an outgoing ICMP packet.
37622 */
37623+ local_unlock(icmp_sk_lock);
37624 return NULL;
37625 }
37626 return sk;
37627@@ -228,6 +235,7 @@
37628 static inline void icmp_xmit_unlock(struct sock *sk)
37629 {
e4b2b4a8 37630 spin_unlock(&sk->sk_lock.slock);
37631+ local_unlock(icmp_sk_lock);
37632 }
37633
37634 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
37635diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/tcp_ipv4.c linux-4.14/net/ipv4/tcp_ipv4.c
37636--- linux-4.14.orig/net/ipv4/tcp_ipv4.c 2018-09-05 11:03:25.000000000 +0200
37637+++ linux-4.14/net/ipv4/tcp_ipv4.c 2018-09-05 11:05:07.000000000 +0200
37638@@ -62,6 +62,7 @@
37639 #include <linux/init.h>
37640 #include <linux/times.h>
37641 #include <linux/slab.h>
37642+#include <linux/locallock.h>
37643
37644 #include <net/net_namespace.h>
37645 #include <net/icmp.h>
e4b2b4a8 37646@@ -580,6 +581,7 @@
37647 }
37648 EXPORT_SYMBOL(tcp_v4_send_check);
37649
37650+static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
37651 /*
37652 * This routine will send an RST to the other tcp.
37653 *
e4b2b4a8 37654@@ -710,6 +712,7 @@
1a6e0f06 37655 arg.tos = ip_hdr(skb)->tos;
e4b2b4a8 37656 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
1a6e0f06 37657 local_bh_disable();
e4b2b4a8 37658+ local_lock(tcp_sk_lock);
37659 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37660 skb, &TCP_SKB_CB(skb)->header.h4.opt,
37661 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37662@@ -717,6 +720,7 @@
37663
37664 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
37665 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
1a6e0f06 37666+ local_unlock(tcp_sk_lock);
e4b2b4a8 37667 local_bh_enable();
37668
37669 #ifdef CONFIG_TCP_MD5SIG
e4b2b4a8 37670@@ -796,12 +800,14 @@
1a6e0f06 37671 arg.tos = tos;
e4b2b4a8 37672 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1a6e0f06 37673 local_bh_disable();
e4b2b4a8 37674+ local_lock(tcp_sk_lock);
37675 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37676 skb, &TCP_SKB_CB(skb)->header.h4.opt,
37677 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37678 &arg, arg.iov[0].iov_len);
37679
37680 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1a6e0f06 37681+ local_unlock(tcp_sk_lock);
e4b2b4a8 37682 local_bh_enable();
37683 }
37684
37685diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/Kconfig linux-4.14/net/Kconfig
37686--- linux-4.14.orig/net/Kconfig 2017-11-12 19:46:13.000000000 +0100
37687+++ linux-4.14/net/Kconfig 2018-09-05 11:05:07.000000000 +0200
37688@@ -272,7 +272,7 @@
37689
37690 config NET_RX_BUSY_POLL
37691 bool
37692- default y
37693+ default y if !PREEMPT_RT_FULL
37694
37695 config BQL
37696 bool
37697diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/mac80211/rx.c linux-4.14/net/mac80211/rx.c
37698--- linux-4.14.orig/net/mac80211/rx.c 2018-09-05 11:03:25.000000000 +0200
37699+++ linux-4.14/net/mac80211/rx.c 2018-09-05 11:05:07.000000000 +0200
37700@@ -4252,7 +4252,7 @@
37701 struct ieee80211_supported_band *sband;
37702 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
37703
37704- WARN_ON_ONCE(softirq_count() == 0);
37705+ WARN_ON_ONCE_NONRT(softirq_count() == 0);
37706
37707 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
37708 goto drop;
37709diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/netfilter/core.c linux-4.14/net/netfilter/core.c
37710--- linux-4.14.orig/net/netfilter/core.c 2017-11-12 19:46:13.000000000 +0100
37711+++ linux-4.14/net/netfilter/core.c 2018-09-05 11:05:07.000000000 +0200
37712@@ -21,6 +21,7 @@
37713 #include <linux/inetdevice.h>
37714 #include <linux/proc_fs.h>
37715 #include <linux/mutex.h>
1a6e0f06 37716+#include <linux/locallock.h>
e4b2b4a8 37717 #include <linux/mm.h>
c7c16703 37718 #include <linux/rcupdate.h>
1a6e0f06 37719 #include <net/net_namespace.h>
e4b2b4a8 37720@@ -28,6 +29,11 @@
37721
37722 #include "nf_internals.h"
37723
37724+#ifdef CONFIG_PREEMPT_RT_BASE
37725+DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
37726+EXPORT_PER_CPU_SYMBOL(xt_write_lock);
37727+#endif
37728+
37729 static DEFINE_MUTEX(afinfo_mutex);
37730
37731 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
37732diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/packet/af_packet.c linux-4.14/net/packet/af_packet.c
37733--- linux-4.14.orig/net/packet/af_packet.c 2018-09-05 11:03:25.000000000 +0200
37734+++ linux-4.14/net/packet/af_packet.c 2018-09-05 11:05:07.000000000 +0200
37735@@ -63,6 +63,7 @@
37736 #include <linux/if_packet.h>
37737 #include <linux/wireless.h>
37738 #include <linux/kernel.h>
37739+#include <linux/delay.h>
37740 #include <linux/kmod.h>
37741 #include <linux/slab.h>
37742 #include <linux/vmalloc.h>
e4b2b4a8 37743@@ -707,7 +708,7 @@
37744 if (BLOCK_NUM_PKTS(pbd)) {
37745 while (atomic_read(&pkc->blk_fill_in_prog)) {
37746 /* Waiting for skb_copy_bits to finish... */
37747- cpu_relax();
37748+ cpu_chill();
37749 }
37750 }
37751
e4b2b4a8 37752@@ -969,7 +970,7 @@
37753 if (!(status & TP_STATUS_BLK_TMO)) {
37754 while (atomic_read(&pkc->blk_fill_in_prog)) {
37755 /* Waiting for skb_copy_bits to finish... */
37756- cpu_relax();
37757+ cpu_chill();
37758 }
37759 }
37760 prb_close_block(pkc, pbd, po, status);
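The af_packet.c hunks above, like the rds/ib_rdma.c hunk that follows, replace cpu_relax() busy-wait loops with cpu_chill(): on RT the bit being polled may be held by a preempted, lower-priority task, so spinning could stall indefinitely, and a short sleep is used instead. The interface is assumed to look roughly like this; the actual RT implementation sleeps for about a tick and lives in the hrtimer changes of this patch:

#ifdef CONFIG_PREEMPT_RT_BASE
extern void cpu_chill(void);	/* brief sleep; defined elsewhere in this patch */
#else
# define cpu_chill()	cpu_relax()
#endif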
37761diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rds/ib_rdma.c linux-4.14/net/rds/ib_rdma.c
37762--- linux-4.14.orig/net/rds/ib_rdma.c 2017-11-12 19:46:13.000000000 +0100
37763+++ linux-4.14/net/rds/ib_rdma.c 2018-09-05 11:05:07.000000000 +0200
37764@@ -34,6 +34,7 @@
37765 #include <linux/slab.h>
37766 #include <linux/rculist.h>
37767 #include <linux/llist.h>
37768+#include <linux/delay.h>
37769
37770 #include "rds_single_path.h"
37771 #include "ib_mr.h"
e4b2b4a8 37772@@ -210,7 +211,7 @@
37773 for_each_online_cpu(cpu) {
37774 flag = &per_cpu(clean_list_grace, cpu);
37775 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
37776- cpu_relax();
37777+ cpu_chill();
37778 }
37779 }
37780
37781diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rxrpc/security.c linux-4.14/net/rxrpc/security.c
37782--- linux-4.14.orig/net/rxrpc/security.c 2017-11-12 19:46:13.000000000 +0100
37783+++ linux-4.14/net/rxrpc/security.c 2018-09-05 11:05:07.000000000 +0200
37784@@ -19,9 +19,6 @@
37785 #include <keys/rxrpc-type.h>
37786 #include "ar-internal.h"
37787
37788-static LIST_HEAD(rxrpc_security_methods);
37789-static DECLARE_RWSEM(rxrpc_security_sem);
37790-
37791 static const struct rxrpc_security *rxrpc_security_types[] = {
37792 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
37793 #ifdef CONFIG_RXKAD
37794diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_api.c linux-4.14/net/sched/sch_api.c
37795--- linux-4.14.orig/net/sched/sch_api.c 2017-11-12 19:46:13.000000000 +0100
37796+++ linux-4.14/net/sched/sch_api.c 2018-09-05 11:05:07.000000000 +0200
37797@@ -1081,7 +1081,7 @@
37798 rcu_assign_pointer(sch->stab, stab);
37799 }
37800 if (tca[TCA_RATE]) {
37801- seqcount_t *running;
37802+ net_seqlock_t *running;
37803
37804 err = -EOPNOTSUPP;
37805 if (sch->flags & TCQ_F_MQROOT)
37806diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_generic.c linux-4.14/net/sched/sch_generic.c
37807--- linux-4.14.orig/net/sched/sch_generic.c 2018-09-05 11:03:25.000000000 +0200
37808+++ linux-4.14/net/sched/sch_generic.c 2018-09-05 11:05:07.000000000 +0200
37809@@ -429,7 +429,11 @@
c7c16703 37810 .ops = &noop_qdisc_ops,
37811 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
37812 .dev_queue = &noop_netdev_queue,
37813+#ifdef CONFIG_PREEMPT_RT_BASE
37814+ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
37815+#else
37816 .running = SEQCNT_ZERO(noop_qdisc.running),
37817+#endif
37818 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
37819 };
37820 EXPORT_SYMBOL(noop_qdisc);
e4b2b4a8 37821@@ -628,9 +632,17 @@
37822 lockdep_set_class(&sch->busylock,
37823 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
37824
37825+#ifdef CONFIG_PREEMPT_RT_BASE
37826+ seqlock_init(&sch->running);
37827+ lockdep_set_class(&sch->running.seqcount,
37828+ dev->qdisc_running_key ?: &qdisc_running_key);
37829+ lockdep_set_class(&sch->running.lock,
37830+ dev->qdisc_running_key ?: &qdisc_running_key);
37831+#else
37832 seqcount_init(&sch->running);
37833 lockdep_set_class(&sch->running,
37834 dev->qdisc_running_key ?: &qdisc_running_key);
37835+#endif
37836
37837 sch->ops = ops;
37838 sch->enqueue = ops->enqueue;
e4b2b4a8 37839@@ -933,7 +945,7 @@
1a6e0f06 37840 /* Wait for outstanding qdisc_run calls. */
e4b2b4a8 37841 list_for_each_entry(dev, head, close_list) {
37842 while (some_qdisc_is_busy(dev))
37843- yield();
37844+ msleep(1);
37845 /* The new qdisc is assigned at this point so we can safely
37846 * unwind stale skb lists and qdisc statistics
37847 */
37848diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sunrpc/svc_xprt.c linux-4.14/net/sunrpc/svc_xprt.c
37849--- linux-4.14.orig/net/sunrpc/svc_xprt.c 2017-11-12 19:46:13.000000000 +0100
37850+++ linux-4.14/net/sunrpc/svc_xprt.c 2018-09-05 11:05:07.000000000 +0200
37851@@ -396,7 +396,7 @@
37852 goto out;
37853 }
37854
37855- cpu = get_cpu();
37856+ cpu = get_cpu_light();
37857 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
37858
37859 atomic_long_inc(&pool->sp_stats.packets);
e4b2b4a8 37860@@ -432,7 +432,7 @@
37861
37862 atomic_long_inc(&pool->sp_stats.threads_woken);
37863 wake_up_process(rqstp->rq_task);
37864- put_cpu();
37865+ put_cpu_light();
37866 goto out;
37867 }
37868 rcu_read_unlock();
e4b2b4a8 37869@@ -453,7 +453,7 @@
37870 goto redo_search;
37871 }
37872 rqstp = NULL;
37873- put_cpu();
37874+ put_cpu_light();
37875 out:
37876 trace_svc_xprt_do_enqueue(xprt, rqstp);
37877 }
37878diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/xfrm/xfrm_state.c linux-4.14/net/xfrm/xfrm_state.c
37879--- linux-4.14.orig/net/xfrm/xfrm_state.c 2018-09-05 11:03:25.000000000 +0200
37880+++ linux-4.14/net/xfrm/xfrm_state.c 2018-09-05 11:05:07.000000000 +0200
37881@@ -427,7 +427,7 @@
37882
37883 static void xfrm_state_gc_destroy(struct xfrm_state *x)
37884 {
37885- tasklet_hrtimer_cancel(&x->mtimer);
37886+ hrtimer_cancel(&x->mtimer);
37887 del_timer_sync(&x->rtimer);
37888 kfree(x->aead);
37889 kfree(x->aalg);
37890@@ -472,8 +472,8 @@
37891
37892 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
37893 {
37894- struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
37895- struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
37896+ struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
37897+ enum hrtimer_restart ret = HRTIMER_NORESTART;
37898 unsigned long now = get_seconds();
37899 long next = LONG_MAX;
37900 int warn = 0;
37901@@ -537,7 +537,8 @@
37902 km_state_expired(x, 0, 0);
37903 resched:
37904 if (next != LONG_MAX) {
37905- tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
37906+ hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
37907+ ret = HRTIMER_RESTART;
37908 }
37909
37910 goto out;
37911@@ -554,7 +555,7 @@
37912
37913 out:
37914 spin_unlock(&x->lock);
37915- return HRTIMER_NORESTART;
37916+ return ret;
37917 }
37918
37919 static void xfrm_replay_timer_handler(unsigned long data);
37920@@ -573,8 +574,8 @@
37921 INIT_HLIST_NODE(&x->bydst);
37922 INIT_HLIST_NODE(&x->bysrc);
37923 INIT_HLIST_NODE(&x->byspi);
37924- tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
37925- CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
37926+ hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
37927+ x->mtimer.function = xfrm_timer_handler;
37928 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
37929 (unsigned long)x);
37930 x->curlft.add_time = get_seconds();
37931@@ -1031,7 +1032,9 @@
37932 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37933 }
37934 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37935- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37936+ hrtimer_start(&x->mtimer,
37937+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
37938+ HRTIMER_MODE_REL_SOFT);
37939 net->xfrm.state_num++;
37940 xfrm_hash_grow_check(net, x->bydst.next != NULL);
37941 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
37942@@ -1142,7 +1145,7 @@
37943 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37944 }
37945
37946- tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37947+ hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
37948 if (x->replay_maxage)
37949 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
37950
37951@@ -1246,7 +1249,9 @@
37952 x->mark.m = m->m;
37953 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37954 xfrm_state_hold(x);
37955- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37956+ hrtimer_start(&x->mtimer,
37957+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
37958+ HRTIMER_MODE_REL_SOFT);
37959 list_add(&x->km.all, &net->xfrm.state_all);
37960 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
37961 h = xfrm_src_hash(net, daddr, saddr, family);
37962@@ -1546,7 +1551,8 @@
37963 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
37964 x1->km.dying = 0;
37965
37966- tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37967+ hrtimer_start(&x1->mtimer, ktime_set(1, 0),
37968+ HRTIMER_MODE_REL_SOFT);
37969 if (x1->curlft.use_time)
37970 xfrm_state_check_expire(x1);
37971
37972@@ -1570,7 +1576,7 @@
37973 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
37974 x->curlft.packets >= x->lft.hard_packet_limit) {
37975 x->km.state = XFRM_STATE_EXPIRED;
37976- tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
37977+ hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
37978 return -EINVAL;
37979 }
37980
37981diff -durN -x '*~' -x '*.orig' linux-4.14.orig/samples/trace_events/trace-events-sample.c linux-4.14/samples/trace_events/trace-events-sample.c
37982--- linux-4.14.orig/samples/trace_events/trace-events-sample.c 2017-11-12 19:46:13.000000000 +0100
37983+++ linux-4.14/samples/trace_events/trace-events-sample.c 2018-09-05 11:05:07.000000000 +0200
37984@@ -33,7 +33,7 @@
37985
37986 /* Silly tracepoints */
37987 trace_foo_bar("hello", cnt, array, random_strings[len],
37988- &current->cpus_allowed);
37989+ current->cpus_ptr);
37990
37991 trace_foo_with_template_simple("HELLO", cnt);
37992
37993diff -durN -x '*~' -x '*.orig' linux-4.14.orig/scripts/mkcompile_h linux-4.14/scripts/mkcompile_h
37994--- linux-4.14.orig/scripts/mkcompile_h 2017-11-12 19:46:13.000000000 +0100
37995+++ linux-4.14/scripts/mkcompile_h 2018-09-05 11:05:07.000000000 +0200
37996@@ -5,7 +5,8 @@
37997 ARCH=$2
37998 SMP=$3
37999 PREEMPT=$4
38000-CC=$5
38001+RT=$5
38002+CC=$6
38003
38004 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
38005
e4b2b4a8 38006@@ -58,6 +59,7 @@
38007 CONFIG_FLAGS=""
38008 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
38009 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
38010+if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
38011 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
38012
38013 # Truncate to maximum length
38014diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/include/path.h linux-4.14/security/apparmor/include/path.h
38015--- linux-4.14.orig/security/apparmor/include/path.h 2017-11-12 19:46:13.000000000 +0100
38016+++ linux-4.14/security/apparmor/include/path.h 2018-09-05 11:05:07.000000000 +0200
38017@@ -39,9 +39,10 @@
38018 };
38019
38020 #include <linux/percpu.h>
38021-#include <linux/preempt.h>
38022+#include <linux/locallock.h>
38023
38024 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
38025+DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38026
38027 #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
38028 #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
38029@@ -55,12 +56,24 @@
38030
38031 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
38032
38033-#ifdef CONFIG_DEBUG_PREEMPT
38034+#ifdef CONFIG_PREEMPT_RT_BASE
38035+
38036+static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
38037+{
38038+ struct local_irq_lock *lv;
38039+
38040+ lv = this_cpu_ptr(&aa_buffers_lock);
38041+ WARN_ONCE(lv->owner != current,
38042+ "__get_buffer without aa_buffers_lock\n");
38043+}
38044+
38045+#elif defined(CONFIG_DEBUG_PREEMPT)
38046 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
38047 #else
38048 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
38049 #endif
38050
38051+
38052 #define __get_buffer(N) ({ \
38053 struct aa_buffers *__cpu_var; \
38054 AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \
38055@@ -73,14 +86,14 @@
38056
38057 #define get_buffers(X...) \
38058 do { \
38059- preempt_disable(); \
38060+ local_lock(aa_buffers_lock); \
38061 __get_buffers(X); \
38062 } while (0)
38063
38064 #define put_buffers(X, Y...) \
38065 do { \
38066 __put_buffers(X, Y); \
38067- preempt_enable(); \
38068+ local_unlock(aa_buffers_lock); \
38069 } while (0)
38070
38071 #endif /* __AA_PATH_H */
38072diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/lsm.c linux-4.14/security/apparmor/lsm.c
38073--- linux-4.14.orig/security/apparmor/lsm.c 2017-11-12 19:46:13.000000000 +0100
38074+++ linux-4.14/security/apparmor/lsm.c 2018-09-05 11:05:07.000000000 +0200
38075@@ -44,7 +44,7 @@
38076 int apparmor_initialized;
38077
38078 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
38079-
38080+DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38081
38082 /*
38083 * LSM hook functions
38084diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/core/pcm_native.c linux-4.14/sound/core/pcm_native.c
38085--- linux-4.14.orig/sound/core/pcm_native.c 2018-09-05 11:03:25.000000000 +0200
38086+++ linux-4.14/sound/core/pcm_native.c 2018-09-05 11:05:07.000000000 +0200
38087@@ -148,7 +148,7 @@
38088 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
38089 {
38090 if (!substream->pcm->nonatomic)
38091- local_irq_disable();
38092+ local_irq_disable_nort();
38093 snd_pcm_stream_lock(substream);
38094 }
38095 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
e4b2b4a8 38096@@ -163,7 +163,7 @@
38097 {
38098 snd_pcm_stream_unlock(substream);
38099 if (!substream->pcm->nonatomic)
38100- local_irq_enable();
38101+ local_irq_enable_nort();
38102 }
38103 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
38104
e4b2b4a8 38105@@ -171,7 +171,7 @@
38106 {
38107 unsigned long flags = 0;
38108 if (!substream->pcm->nonatomic)
38109- local_irq_save(flags);
38110+ local_irq_save_nort(flags);
38111 snd_pcm_stream_lock(substream);
38112 return flags;
38113 }
e4b2b4a8 38114@@ -189,7 +189,7 @@
38115 {
38116 snd_pcm_stream_unlock(substream);
38117 if (!substream->pcm->nonatomic)
38118- local_irq_restore(flags);
38119+ local_irq_restore_nort(flags);
38120 }
38121 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
38122
38123diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/drivers/dummy.c linux-4.14/sound/drivers/dummy.c
38124--- linux-4.14.orig/sound/drivers/dummy.c 2017-11-12 19:46:13.000000000 +0100
38125+++ linux-4.14/sound/drivers/dummy.c 2018-09-05 11:05:07.000000000 +0200
38126@@ -376,17 +376,9 @@
38127 ktime_t period_time;
38128 atomic_t running;
38129 struct hrtimer timer;
38130- struct tasklet_struct tasklet;
38131 struct snd_pcm_substream *substream;
38132 };
38133
38134-static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
38135-{
38136- struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
38137- if (atomic_read(&dpcm->running))
38138- snd_pcm_period_elapsed(dpcm->substream);
38139-}
38140-
38141 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
38142 {
38143 struct dummy_hrtimer_pcm *dpcm;
38144@@ -394,7 +386,14 @@
38145 dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
38146 if (!atomic_read(&dpcm->running))
38147 return HRTIMER_NORESTART;
38148- tasklet_schedule(&dpcm->tasklet);
38149+ /*
38150+ * In cases of XRUN and draining, this calls .trigger to stop PCM
38151+ * substream.
38152+ */
38153+ snd_pcm_period_elapsed(dpcm->substream);
38154+ if (!atomic_read(&dpcm->running))
38155+ return HRTIMER_NORESTART;
38156+
38157 hrtimer_forward_now(timer, dpcm->period_time);
38158 return HRTIMER_RESTART;
38159 }
38160@@ -404,7 +403,7 @@
38161 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38162
38163 dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
38164- hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
38165+ hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
38166 atomic_set(&dpcm->running, 1);
38167 return 0;
38168 }
38169@@ -414,14 +413,14 @@
38170 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38171
38172 atomic_set(&dpcm->running, 0);
38173- hrtimer_cancel(&dpcm->timer);
38174+ if (!hrtimer_callback_running(&dpcm->timer))
38175+ hrtimer_cancel(&dpcm->timer);
38176 return 0;
38177 }
38178
38179 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
38180 {
38181 hrtimer_cancel(&dpcm->timer);
38182- tasklet_kill(&dpcm->tasklet);
38183 }
38184
38185 static snd_pcm_uframes_t
38186@@ -466,12 +465,10 @@
38187 if (!dpcm)
38188 return -ENOMEM;
38189 substream->runtime->private_data = dpcm;
38190- hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
38191+ hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
38192 dpcm->timer.function = dummy_hrtimer_callback;
38193 dpcm->substream = substream;
38194 atomic_set(&dpcm->running, 0);
38195- tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
38196- (unsigned long)dpcm);
38197 return 0;
38198 }
38199
38200diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions linux-4.14/tools/testing/selftests/ftrace/test.d/functions
38201--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions 2018-09-05 11:03:25.000000000 +0200
38202+++ linux-4.14/tools/testing/selftests/ftrace/test.d/functions 2018-09-05 11:05:07.000000000 +0200
38203@@ -70,6 +70,13 @@
38204 echo 0 > events/enable
38205 }
38206
38207+clear_synthetic_events() { # reset all current synthetic events
38208+ grep -v ^# synthetic_events |
38209+ while read line; do
38210+ echo "!$line" >> synthetic_events
38211+ done
38212+}
38213+
38214 initialize_ftrace() { # Reset ftrace to initial-state
38215 # As the initial state, ftrace will be set to nop tracer,
38216 # no events, no triggers, no filters, no function filters,
38217diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
38218--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc 1970-01-01 01:00:00.000000000 +0100
38219+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc 2018-09-05 11:05:07.000000000 +0200
38220@@ -0,0 +1,39 @@
38221+#!/bin/sh
38222+# description: event trigger - test extended error support
38223+
38224+
38225+do_reset() {
38226+ reset_trigger
38227+ echo > set_event
38228+ clear_trace
38229+}
38230+
38231+fail() { #msg
38232+ do_reset
38233+ echo $1
38234+ exit_fail
38235+}
38236+
38237+if [ ! -f set_event ]; then
38238+ echo "event tracing is not supported"
38239+ exit_unsupported
38240+fi
38241+
38242+if [ ! -f synthetic_events ]; then
38243+ echo "synthetic event is not supported"
38244+ exit_unsupported
38245+fi
38246+
38247+reset_tracer
38248+do_reset
38249+
38250+echo "Test extended error support"
38251+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38252+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger 2> /dev/null
38253+if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
38254+ fail "Failed to generate extended error in histogram"
38255+fi
38256+
38257+do_reset
38258+
38259+exit 0
38260diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
38261--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc 1970-01-01 01:00:00.000000000 +0100
38262+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc 2018-09-05 11:05:07.000000000 +0200
38263@@ -0,0 +1,54 @@
38264+#!/bin/sh
38265+# description: event trigger - test field variable support
38266+
38267+do_reset() {
38268+ reset_trigger
38269+ echo > set_event
38270+ clear_trace
38271+}
38272+
38273+fail() { #msg
38274+ do_reset
38275+ echo $1
38276+ exit_fail
38277+}
38278+
38279+if [ ! -f set_event ]; then
38280+ echo "event tracing is not supported"
38281+ exit_unsupported
38282+fi
38283+
38284+if [ ! -f synthetic_events ]; then
38285+ echo "synthetic event is not supported"
38286+ exit_unsupported
38287+fi
38288+
38289+clear_synthetic_events
38290+reset_tracer
38291+do_reset
38292+
38293+echo "Test field variable support"
38294+
38295+echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
38296+echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38297+echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38298+echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
38299+
38300+ping localhost -c 3
38301+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38302+ fail "Failed to create inter-event histogram"
38303+fi
38304+
38305+if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38306+ fail "Failed to create histogram with field variable"
38307+fi
38308+
38309+echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38310+
38311+if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38312+ fail "Failed to remove histogram with field variable"
38313+fi
38314+
38315+do_reset
38316+
38317+exit 0
38318diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
38319--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc 1970-01-01 01:00:00.000000000 +0100
38320+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc 2018-09-05 11:05:07.000000000 +0200
38321@@ -0,0 +1,58 @@
38322+#!/bin/sh
38323+# description: event trigger - test inter-event combined histogram trigger
38324+
38325+do_reset() {
38326+ reset_trigger
38327+ echo > set_event
38328+ clear_trace
38329+}
38330+
38331+fail() { #msg
38332+ do_reset
38333+ echo $1
38334+ exit_fail
38335+}
38336+
38337+if [ ! -f set_event ]; then
38338+ echo "event tracing is not supported"
38339+ exit_unsupported
38340+fi
38341+
38342+if [ ! -f synthetic_events ]; then
38343+ echo "synthetic event is not supported"
38344+ exit_unsupported
38345+fi
38346+
38347+reset_tracer
38348+do_reset
38349+clear_synthetic_events
38350+
38351+echo "Test create synthetic event"
38352+
38353+echo 'waking_latency u64 lat pid_t pid' > synthetic_events
38354+if [ ! -d events/synthetic/waking_latency ]; then
38355+ fail "Failed to create waking_latency synthetic event"
38356+fi
38357+
38358+echo "Test combined histogram"
38359+
38360+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38361+echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
38362+echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
38363+
38364+echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
38365+echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
38366+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
38367+
38368+echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
38369+echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
38370+echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
38371+
38372+ping localhost -c 3
38373+if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
38374+ fail "Failed to create combined histogram"
38375+fi
38376+
38377+do_reset
38378+
38379+exit 0
38380diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
38381--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38382+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38383@@ -0,0 +1,50 @@
38384+#!/bin/sh
38385+# description: event trigger - test inter-event histogram trigger onmatch action
38386+
38387+do_reset() {
38388+ reset_trigger
38389+ echo > set_event
38390+ clear_trace
38391+}
38392+
38393+fail() { #msg
38394+ do_reset
38395+ echo $1
38396+ exit_fail
38397+}
38398+
38399+if [ ! -f set_event ]; then
38400+ echo "event tracing is not supported"
38401+ exit_unsupported
38402+fi
38403+
38404+if [ ! -f synthetic_events ]; then
38405+ echo "synthetic event is not supported"
38406+ exit_unsupported
38407+fi
38408+
38409+clear_synthetic_events
38410+reset_tracer
38411+do_reset
38412+
38413+echo "Test create synthetic event"
38414+
38415+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38416+if [ ! -d events/synthetic/wakeup_latency ]; then
38417+ fail "Failed to create wakeup_latency synthetic event"
38418+fi
38419+
38420+echo "Test create histogram for synthetic event"
38421+echo "Test histogram variables,simple expression support and onmatch action"
38422+
38423+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38424+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38425+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38426+ping localhost -c 5
38427+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38428+ fail "Failed to create onmatch action inter-event histogram"
38429+fi
38430+
38431+do_reset
38432+
38433+exit 0
38434diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
38435--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38436+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38437@@ -0,0 +1,50 @@
38438+#!/bin/sh
38439+# description: event trigger - test inter-event histogram trigger onmatch-onmax action
38440+
38441+do_reset() {
38442+ reset_trigger
38443+ echo > set_event
38444+ clear_trace
38445+}
38446+
38447+fail() { #msg
38448+ do_reset
38449+ echo $1
38450+ exit_fail
38451+}
38452+
38453+if [ ! -f set_event ]; then
38454+ echo "event tracing is not supported"
38455+ exit_unsupported
38456+fi
38457+
38458+if [ ! -f synthetic_events ]; then
38459+ echo "synthetic event is not supported"
38460+ exit_unsupported
38461+fi
38462+
38463+clear_synthetic_events
38464+reset_tracer
38465+do_reset
38466+
38467+echo "Test create synthetic event"
38468+
38469+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38470+if [ ! -d events/synthetic/wakeup_latency ]; then
38471+ fail "Failed to create wakeup_latency synthetic event"
38472+fi
38473+
38474+echo "Test create histogram for synthetic event"
38475+echo "Test histogram variables,simple expression support and onmatch-onmax action"
38476+
38477+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38478+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38479+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38480+ping localhost -c 5
38481+if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
38482+ fail "Failed to create onmatch-onmax action inter-event histogram"
38483+fi
38484+
38485+do_reset
38486+
38487+exit 0
38488diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
38489--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc 1970-01-01 01:00:00.000000000 +0100
38490+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38491@@ -0,0 +1,48 @@
38492+#!/bin/sh
38493+# description: event trigger - test inter-event histogram trigger onmax action
38494+
38495+do_reset() {
38496+ reset_trigger
38497+ echo > set_event
38498+ clear_trace
38499+}
38500+
38501+fail() { #msg
38502+ do_reset
38503+ echo $1
38504+ exit_fail
38505+}
38506+
38507+if [ ! -f set_event ]; then
38508+ echo "event tracing is not supported"
38509+ exit_unsupported
38510+fi
38511+
38512+if [ ! -f synthetic_events ]; then
38513+ echo "synthetic event is not supported"
38514+ exit_unsupported
38515+fi
38516+
38517+clear_synthetic_events
38518+reset_tracer
38519+do_reset
38520+
38521+echo "Test create synthetic event"
38522+
38523+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38524+if [ ! -d events/synthetic/wakeup_latency ]; then
38525+ fail "Failed to create wakeup_latency synthetic event"
38526+fi
38527+
38528+echo "Test onmax action"
38529+
38530+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
38531+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38532+ping localhost -c 3
38533+if ! grep -q "max:" events/sched/sched_switch/hist; then
38534+ fail "Failed to create onmax action inter-event histogram"
38535+fi
38536+
38537+do_reset
38538+
38539+exit 0
38540diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
38541--- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc 1970-01-01 01:00:00.000000000 +0100
38542+++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc 2018-09-05 11:05:07.000000000 +0200
38543@@ -0,0 +1,54 @@
38544+#!/bin/sh
38545+# description: event trigger - test synthetic event create remove
38546+do_reset() {
38547+ reset_trigger
38548+ echo > set_event
38549+ clear_trace
38550+}
38551+
38552+fail() { #msg
38553+ do_reset
38554+ echo $1
38555+ exit_fail
38556+}
38557+
38558+if [ ! -f set_event ]; then
38559+ echo "event tracing is not supported"
38560+ exit_unsupported
38561+fi
38562+
38563+if [ ! -f synthetic_events ]; then
38564+ echo "synthetic event is not supported"
38565+ exit_unsupported
38566+fi
38567+
38568+clear_synthetic_events
38569+reset_tracer
38570+do_reset
38571+
38572+echo "Test create synthetic event"
38573+
38574+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38575+if [ ! -d events/synthetic/wakeup_latency ]; then
38576+ fail "Failed to create wakeup_latency synthetic event"
38577+fi
38578+
38579+reset_trigger
38580+
38581+echo "Test create synthetic event with an error"
38582+echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events 2> /dev/null
38583+if [ -d events/synthetic/wakeup_latency ]; then
38584+ fail "Created wakeup_latency synthetic event with an invalid format"
38585+fi
38586+
38587+reset_trigger
38588+
38589+echo "Test remove synthetic event"
38590+echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
38591+if [ -d events/synthetic/wakeup_latency ]; then
38592+ fail "Failed to delete wakeup_latency synthetic event"
38593+fi
38594+
38595+do_reset
38596+
38597+exit 0
38598diff -durN -x '*~' -x '*.orig' linux-4.14.orig/virt/kvm/arm/arm.c linux-4.14/virt/kvm/arm/arm.c
38599--- linux-4.14.orig/virt/kvm/arm/arm.c 2018-09-05 11:03:25.000000000 +0200
38600+++ linux-4.14/virt/kvm/arm/arm.c 2018-09-05 11:05:07.000000000 +0200
38601@@ -69,7 +69,6 @@
38602
38603 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
38604 {
38605- BUG_ON(preemptible());
38606 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
38607 }
38608
38609@@ -79,7 +78,6 @@
38610 */
38611 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
38612 {
38613- BUG_ON(preemptible());
38614 return __this_cpu_read(kvm_arm_running_vcpu);
38615 }
38616
38617@@ -653,7 +651,7 @@
38618 * involves poking the GIC, which must be done in a
38619 * non-preemptible context.
38620 */
38621- preempt_disable();
38622+ migrate_disable();
38623
38624 kvm_pmu_flush_hwstate(vcpu);
38625
38626@@ -690,7 +688,7 @@
38627 kvm_pmu_sync_hwstate(vcpu);
38628 kvm_timer_sync_hwstate(vcpu);
38629 kvm_vgic_sync_hwstate(vcpu);
38630- preempt_enable();
38631+ migrate_enable();
38632 continue;
38633 }
38634
38635@@ -745,7 +743,7 @@
38636
38637 kvm_vgic_sync_hwstate(vcpu);
38638
38639- preempt_enable();
38640+ migrate_enable();
38641
38642 ret = handle_exit(vcpu, run, ret);
38643 }
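For reference, a minimal sketch (not the KVM code itself) of the pattern the arm.c hunks above switch to: on PREEMPT_RT, migrate_disable() pins the task to its current CPU so per-CPU state such as the running-vcpu pointer stays consistent, while the section remains preemptible by higher-priority RT tasks; preempt_disable() would forbid preemption entirely. The per-CPU variable and helper names (my_counter, do_percpu_work, rt_friendly_section) are illustrative, not taken from the patch.

/*
 * Sketch only: a migrate-disabled (but still preemptible) section
 * touching per-CPU data, as used in place of preempt_disable() above.
 */
#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);

static void do_percpu_work(void)
{
	/* stays on one CPU for the whole migrate_disable()d section */
	this_cpu_inc(my_counter);
}

static void rt_friendly_section(void)
{
	migrate_disable();	/* pin to this CPU, stay preemptible */
	do_percpu_work();
	migrate_enable();
}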