kernel-rt.patch: PREEMPT_RT (real-time) patch for Linux 4.14, from packages/kernel.git (4.14.72)
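The hunks in this part of the patch cover, among other things, the architecture-specific RT changes: lazy preemption support (TIF_NEED_RESCHED_LAZY, preempt_lazy_count and the matching asm-offsets/entry-code updates) for arm, arm64 and powerpc; conversion of low-level locks (patch_lock, unwind_lock and the various SMP boot_lock instances) from spinlock_t to raw_spinlock_t; an RT-aware kmap_atomic scheme with switch_kmaps() for ARM highmem; cpus_allowed -> cpus_mask/cpus_ptr updates on ia64 and MIPS; removal of the "please don't include this file directly" guards from the per-arch spinlock_types.h headers; and Kconfig restrictions that disable kernel-mode NEON, the ARMv8 crypto extensions, Xen guest support, MIPS HIGHMEM and OProfile when PREEMPT_RT is enabled.

As a rough orientation for the entry-armv.S and entry.S hunks, the kernel-exit preemption check they implement corresponds approximately to the C-style sketch below. This is illustrative only and not part of the patch: the flag and field names are taken from the hunks, while the preempt() helper merely stands in for svc_preempt/el1_preempt.

    /* Sketch of the lazy-preemption test done on return from an exception,
     * assuming ti points at the current thread_info. Not compiled code. */
    if (ti->preempt_count == 0 &&
        ((ti->flags & _TIF_NEED_RESCHED) ||
         (ti->preempt_lazy_count == 0 &&
          (ti->flags & _TIF_NEED_RESCHED_LAZY))))
            preempt();      /* svc_preempt on arm, el1_preempt on arm64 */
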
1 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h linux-4.14/arch/alpha/include/asm/spinlock_types.h
2 --- linux-4.14.orig/arch/alpha/include/asm/spinlock_types.h     2017-11-12 19:46:13.000000000 +0100
3 +++ linux-4.14/arch/alpha/include/asm/spinlock_types.h  2018-09-05 11:05:07.000000000 +0200
4 @@ -2,10 +2,6 @@
5  #ifndef _ALPHA_SPINLOCK_TYPES_H
6  #define _ALPHA_SPINLOCK_TYPES_H
7  
8 -#ifndef __LINUX_SPINLOCK_TYPES_H
9 -# error "please don't include this file directly"
10 -#endif
11 -
12  typedef struct {
13         volatile unsigned int lock;
14  } arch_spinlock_t;
15 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/irq.h linux-4.14/arch/arm/include/asm/irq.h
16 --- linux-4.14.orig/arch/arm/include/asm/irq.h  2017-11-12 19:46:13.000000000 +0100
17 +++ linux-4.14/arch/arm/include/asm/irq.h       2018-09-05 11:05:07.000000000 +0200
18 @@ -23,6 +23,8 @@
19  #endif
20  
21  #ifndef __ASSEMBLY__
22 +#include <linux/cpumask.h>
23 +
24  struct irqaction;
25  struct pt_regs;
26  extern void migrate_irqs(void);
27 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/spinlock_types.h linux-4.14/arch/arm/include/asm/spinlock_types.h
28 --- linux-4.14.orig/arch/arm/include/asm/spinlock_types.h       2017-11-12 19:46:13.000000000 +0100
29 +++ linux-4.14/arch/arm/include/asm/spinlock_types.h    2018-09-05 11:05:07.000000000 +0200
30 @@ -2,10 +2,6 @@
31  #ifndef __ASM_SPINLOCK_TYPES_H
32  #define __ASM_SPINLOCK_TYPES_H
33  
34 -#ifndef __LINUX_SPINLOCK_TYPES_H
35 -# error "please don't include this file directly"
36 -#endif
37 -
38  #define TICKET_SHIFT   16
39  
40  typedef struct {
41 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/switch_to.h linux-4.14/arch/arm/include/asm/switch_to.h
42 --- linux-4.14.orig/arch/arm/include/asm/switch_to.h    2017-11-12 19:46:13.000000000 +0100
43 +++ linux-4.14/arch/arm/include/asm/switch_to.h 2018-09-05 11:05:07.000000000 +0200
44 @@ -4,6 +4,13 @@
45  
46  #include <linux/thread_info.h>
47  
48 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
49 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
50 +#else
51 +static inline void
52 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
53 +#endif
54 +
55  /*
56   * For v7 SMP cores running a preemptible kernel we may be pre-empted
57   * during a TLB maintenance operation, so execute an inner-shareable dsb
58 @@ -26,6 +33,7 @@
59  #define switch_to(prev,next,last)                                      \
60  do {                                                                   \
61         __complete_pending_tlbi();                                      \
62 +       switch_kmaps(prev, next);                                       \
63         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
64  } while (0)
65  
66 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/include/asm/thread_info.h linux-4.14/arch/arm/include/asm/thread_info.h
67 --- linux-4.14.orig/arch/arm/include/asm/thread_info.h  2017-11-12 19:46:13.000000000 +0100
68 +++ linux-4.14/arch/arm/include/asm/thread_info.h       2018-09-05 11:05:07.000000000 +0200
69 @@ -49,6 +49,7 @@
70  struct thread_info {
71         unsigned long           flags;          /* low level flags */
72         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
73 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
74         mm_segment_t            addr_limit;     /* address limit */
75         struct task_struct      *task;          /* main task structure */
76         __u32                   cpu;            /* cpu */
77 @@ -142,7 +143,8 @@
78  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
79  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
80  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
81 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
82 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
83 +#define TIF_NEED_RESCHED_LAZY  7
84  
85  #define TIF_NOHZ               12      /* in adaptive nohz mode */
86  #define TIF_USING_IWMMXT       17
87 @@ -152,6 +154,7 @@
88  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
89  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
90  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
91 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
92  #define _TIF_UPROBE            (1 << TIF_UPROBE)
93  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
94  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
95 @@ -167,7 +170,8 @@
96   * Change these and you break ASM code in entry-common.S
97   */
98  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
99 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
100 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
101 +                                _TIF_NEED_RESCHED_LAZY)
102  
103  #endif /* __KERNEL__ */
104  #endif /* __ASM_ARM_THREAD_INFO_H */
105 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/Kconfig linux-4.14/arch/arm/Kconfig
106 --- linux-4.14.orig/arch/arm/Kconfig    2017-11-12 19:46:13.000000000 +0100
107 +++ linux-4.14/arch/arm/Kconfig 2018-09-05 11:05:07.000000000 +0200
108 @@ -45,7 +45,7 @@
109         select HARDIRQS_SW_RESEND
110         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
111         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
112 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
113 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
114         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
115         select HAVE_ARCH_MMAP_RND_BITS if MMU
116         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
117 @@ -85,6 +85,7 @@
118         select HAVE_PERF_EVENTS
119         select HAVE_PERF_REGS
120         select HAVE_PERF_USER_STACK_DUMP
121 +       select HAVE_PREEMPT_LAZY
122         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
123         select HAVE_REGS_AND_STACK_ACCESS_API
124         select HAVE_SYSCALL_TRACEPOINTS
125 @@ -2164,7 +2165,7 @@
126  
127  config KERNEL_MODE_NEON
128         bool "Support for NEON in kernel mode"
129 -       depends on NEON && AEABI
130 +       depends on NEON && AEABI && !PREEMPT_RT_BASE
131         help
132           Say Y to include support for NEON in kernel mode.
133  
134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/asm-offsets.c linux-4.14/arch/arm/kernel/asm-offsets.c
135 --- linux-4.14.orig/arch/arm/kernel/asm-offsets.c       2017-11-12 19:46:13.000000000 +0100
136 +++ linux-4.14/arch/arm/kernel/asm-offsets.c    2018-09-05 11:05:07.000000000 +0200
137 @@ -65,6 +65,7 @@
138    BLANK();
139    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
140    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
141 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
142    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
143    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
144    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
145 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-armv.S linux-4.14/arch/arm/kernel/entry-armv.S
146 --- linux-4.14.orig/arch/arm/kernel/entry-armv.S        2017-11-12 19:46:13.000000000 +0100
147 +++ linux-4.14/arch/arm/kernel/entry-armv.S     2018-09-05 11:05:07.000000000 +0200
148 @@ -220,11 +220,18 @@
149  
150  #ifdef CONFIG_PREEMPT
151         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
152 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
153         teq     r8, #0                          @ if preempt count != 0
154 +       bne     1f                              @ return from exception
155 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
156 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
157 +       blne    svc_preempt                     @ preempt!
158 +
159 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
160 +       teq     r8, #0                          @ if preempt lazy count != 0
161         movne   r0, #0                          @ force flags to 0
162 -       tst     r0, #_TIF_NEED_RESCHED
163 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
164         blne    svc_preempt
165 +1:
166  #endif
167  
168         svc_exit r5, irq = 1                    @ return from exception
169 @@ -239,8 +246,14 @@
170  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
171         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
172         tst     r0, #_TIF_NEED_RESCHED
173 +       bne     1b
174 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
175         reteq   r8                              @ go again
176 -       b       1b
177 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
178 +       teq     r0, #0                          @ if preempt lazy count != 0
179 +       beq     1b
180 +       ret     r8                              @ go again
181 +
182  #endif
183  
184  __und_fault:
185 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/entry-common.S linux-4.14/arch/arm/kernel/entry-common.S
186 --- linux-4.14.orig/arch/arm/kernel/entry-common.S      2017-11-12 19:46:13.000000000 +0100
187 +++ linux-4.14/arch/arm/kernel/entry-common.S   2018-09-05 11:05:07.000000000 +0200
188 @@ -53,7 +53,9 @@
189         cmp     r2, #TASK_SIZE
190         blne    addr_limit_check_failed
191         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
192 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
193 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
194 +       bne     fast_work_pending
195 +       tst     r1, #_TIF_SECCOMP
196         bne     fast_work_pending
197  
198  
199 @@ -83,8 +85,11 @@
200         cmp     r2, #TASK_SIZE
201         blne    addr_limit_check_failed
202         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
203 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
204 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
205 +       bne     do_slower_path
206 +       tst     r1, #_TIF_SECCOMP
207         beq     no_work_pending
208 +do_slower_path:
209   UNWIND(.fnend         )
210  ENDPROC(ret_fast_syscall)
211  
212 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/patch.c linux-4.14/arch/arm/kernel/patch.c
213 --- linux-4.14.orig/arch/arm/kernel/patch.c     2017-11-12 19:46:13.000000000 +0100
214 +++ linux-4.14/arch/arm/kernel/patch.c  2018-09-05 11:05:07.000000000 +0200
215 @@ -16,7 +16,7 @@
216         unsigned int insn;
217  };
218  
219 -static DEFINE_SPINLOCK(patch_lock);
220 +static DEFINE_RAW_SPINLOCK(patch_lock);
221  
222  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
223         __acquires(&patch_lock)
224 @@ -33,7 +33,7 @@
225                 return addr;
226  
227         if (flags)
228 -               spin_lock_irqsave(&patch_lock, *flags);
229 +               raw_spin_lock_irqsave(&patch_lock, *flags);
230         else
231                 __acquire(&patch_lock);
232  
233 @@ -48,7 +48,7 @@
234         clear_fixmap(fixmap);
235  
236         if (flags)
237 -               spin_unlock_irqrestore(&patch_lock, *flags);
238 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
239         else
240                 __release(&patch_lock);
241  }
242 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/process.c linux-4.14/arch/arm/kernel/process.c
243 --- linux-4.14.orig/arch/arm/kernel/process.c   2017-11-12 19:46:13.000000000 +0100
244 +++ linux-4.14/arch/arm/kernel/process.c        2018-09-05 11:05:07.000000000 +0200
245 @@ -325,6 +325,30 @@
246  }
247  
248  #ifdef CONFIG_MMU
249 +/*
250 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
251 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
252 + * fail.
253 + */
254 +static int __init vectors_user_mapping_init_page(void)
255 +{
256 +       struct page *page;
257 +       unsigned long addr = 0xffff0000;
258 +       pgd_t *pgd;
259 +       pud_t *pud;
260 +       pmd_t *pmd;
261 +
262 +       pgd = pgd_offset_k(addr);
263 +       pud = pud_offset(pgd, addr);
264 +       pmd = pmd_offset(pud, addr);
265 +       page = pmd_page(*(pmd));
266 +
267 +       pgtable_page_ctor(page);
268 +
269 +       return 0;
270 +}
271 +late_initcall(vectors_user_mapping_init_page);
272 +
273  #ifdef CONFIG_KUSER_HELPERS
274  /*
275   * The vectors page is always readable from user space for the
276 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/signal.c linux-4.14/arch/arm/kernel/signal.c
277 --- linux-4.14.orig/arch/arm/kernel/signal.c    2017-11-12 19:46:13.000000000 +0100
278 +++ linux-4.14/arch/arm/kernel/signal.c 2018-09-05 11:05:07.000000000 +0200
279 @@ -615,7 +615,8 @@
280          */
281         trace_hardirqs_off();
282         do {
283 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
284 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
285 +                                          _TIF_NEED_RESCHED_LAZY))) {
286                         schedule();
287                 } else {
288                         if (unlikely(!user_mode(regs)))
289 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/smp.c linux-4.14/arch/arm/kernel/smp.c
290 --- linux-4.14.orig/arch/arm/kernel/smp.c       2017-11-12 19:46:13.000000000 +0100
291 +++ linux-4.14/arch/arm/kernel/smp.c    2018-09-05 11:05:07.000000000 +0200
292 @@ -236,8 +236,6 @@
293         flush_cache_louis();
294         local_flush_tlb_all();
295  
296 -       clear_tasks_mm_cpumask(cpu);
297 -
298         return 0;
299  }
300  
301 @@ -255,6 +253,7 @@
302         }
303         pr_debug("CPU%u: shutdown\n", cpu);
304  
305 +       clear_tasks_mm_cpumask(cpu);
306         /*
307          * platform_cpu_kill() is generally expected to do the powering off
308          * and/or cutting of clocks to the dying CPU.  Optionally, this may
309 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/kernel/unwind.c linux-4.14/arch/arm/kernel/unwind.c
310 --- linux-4.14.orig/arch/arm/kernel/unwind.c    2017-11-12 19:46:13.000000000 +0100
311 +++ linux-4.14/arch/arm/kernel/unwind.c 2018-09-05 11:05:07.000000000 +0200
312 @@ -93,7 +93,7 @@
313  static const struct unwind_idx *__origin_unwind_idx;
314  extern const struct unwind_idx __stop_unwind_idx[];
315  
316 -static DEFINE_SPINLOCK(unwind_lock);
317 +static DEFINE_RAW_SPINLOCK(unwind_lock);
318  static LIST_HEAD(unwind_tables);
319  
320  /* Convert a prel31 symbol to an absolute address */
321 @@ -201,7 +201,7 @@
322                 /* module unwind tables */
323                 struct unwind_table *table;
324  
325 -               spin_lock_irqsave(&unwind_lock, flags);
326 +               raw_spin_lock_irqsave(&unwind_lock, flags);
327                 list_for_each_entry(table, &unwind_tables, list) {
328                         if (addr >= table->begin_addr &&
329                             addr < table->end_addr) {
330 @@ -213,7 +213,7 @@
331                                 break;
332                         }
333                 }
334 -               spin_unlock_irqrestore(&unwind_lock, flags);
335 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
336         }
337  
338         pr_debug("%s: idx = %p\n", __func__, idx);
339 @@ -529,9 +529,9 @@
340         tab->begin_addr = text_addr;
341         tab->end_addr = text_addr + text_size;
342  
343 -       spin_lock_irqsave(&unwind_lock, flags);
344 +       raw_spin_lock_irqsave(&unwind_lock, flags);
345         list_add_tail(&tab->list, &unwind_tables);
346 -       spin_unlock_irqrestore(&unwind_lock, flags);
347 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
348  
349         return tab;
350  }
351 @@ -543,9 +543,9 @@
352         if (!tab)
353                 return;
354  
355 -       spin_lock_irqsave(&unwind_lock, flags);
356 +       raw_spin_lock_irqsave(&unwind_lock, flags);
357         list_del(&tab->list);
358 -       spin_unlock_irqrestore(&unwind_lock, flags);
359 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
360  
361         kfree(tab);
362  }
363 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-exynos/platsmp.c linux-4.14/arch/arm/mach-exynos/platsmp.c
364 --- linux-4.14.orig/arch/arm/mach-exynos/platsmp.c      2017-11-12 19:46:13.000000000 +0100
365 +++ linux-4.14/arch/arm/mach-exynos/platsmp.c   2018-09-05 11:05:07.000000000 +0200
366 @@ -229,7 +229,7 @@
367         return (void __iomem *)(S5P_VA_SCU);
368  }
369  
370 -static DEFINE_SPINLOCK(boot_lock);
371 +static DEFINE_RAW_SPINLOCK(boot_lock);
372  
373  static void exynos_secondary_init(unsigned int cpu)
374  {
375 @@ -242,8 +242,8 @@
376         /*
377          * Synchronise with the boot thread.
378          */
379 -       spin_lock(&boot_lock);
380 -       spin_unlock(&boot_lock);
381 +       raw_spin_lock(&boot_lock);
382 +       raw_spin_unlock(&boot_lock);
383  }
384  
385  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
386 @@ -307,7 +307,7 @@
387          * Set synchronisation state between this boot processor
388          * and the secondary one
389          */
390 -       spin_lock(&boot_lock);
391 +       raw_spin_lock(&boot_lock);
392  
393         /*
394          * The secondary processor is waiting to be released from
395 @@ -334,7 +334,7 @@
396  
397                 if (timeout == 0) {
398                         printk(KERN_ERR "cpu1 power enable failed");
399 -                       spin_unlock(&boot_lock);
400 +                       raw_spin_unlock(&boot_lock);
401                         return -ETIMEDOUT;
402                 }
403         }
404 @@ -380,7 +380,7 @@
405          * calibrations, then wait for it to finish
406          */
407  fail:
408 -       spin_unlock(&boot_lock);
409 +       raw_spin_unlock(&boot_lock);
410  
411         return pen_release != -1 ? ret : 0;
412  }
413 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c linux-4.14/arch/arm/mach-hisi/platmcpm.c
414 --- linux-4.14.orig/arch/arm/mach-hisi/platmcpm.c       2017-11-12 19:46:13.000000000 +0100
415 +++ linux-4.14/arch/arm/mach-hisi/platmcpm.c    2018-09-05 11:05:07.000000000 +0200
416 @@ -61,7 +61,7 @@
417  
418  static void __iomem *sysctrl, *fabric;
419  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
420 -static DEFINE_SPINLOCK(boot_lock);
421 +static DEFINE_RAW_SPINLOCK(boot_lock);
422  static u32 fabric_phys_addr;
423  /*
424   * [0]: bootwrapper physical address
425 @@ -113,7 +113,7 @@
426         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
427                 return -EINVAL;
428  
429 -       spin_lock_irq(&boot_lock);
430 +       raw_spin_lock_irq(&boot_lock);
431  
432         if (hip04_cpu_table[cluster][cpu])
433                 goto out;
434 @@ -147,7 +147,7 @@
435  
436  out:
437         hip04_cpu_table[cluster][cpu]++;
438 -       spin_unlock_irq(&boot_lock);
439 +       raw_spin_unlock_irq(&boot_lock);
440  
441         return 0;
442  }
443 @@ -162,11 +162,11 @@
444         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
445         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
446  
447 -       spin_lock(&boot_lock);
448 +       raw_spin_lock(&boot_lock);
449         hip04_cpu_table[cluster][cpu]--;
450         if (hip04_cpu_table[cluster][cpu] == 1) {
451                 /* A power_up request went ahead of us. */
452 -               spin_unlock(&boot_lock);
453 +               raw_spin_unlock(&boot_lock);
454                 return;
455         } else if (hip04_cpu_table[cluster][cpu] > 1) {
456                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
457 @@ -174,7 +174,7 @@
458         }
459  
460         last_man = hip04_cluster_is_down(cluster);
461 -       spin_unlock(&boot_lock);
462 +       raw_spin_unlock(&boot_lock);
463         if (last_man) {
464                 /* Since it's Cortex A15, disable L2 prefetching. */
465                 asm volatile(
466 @@ -203,7 +203,7 @@
467                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
468  
469         count = TIMEOUT_MSEC / POLL_MSEC;
470 -       spin_lock_irq(&boot_lock);
471 +       raw_spin_lock_irq(&boot_lock);
472         for (tries = 0; tries < count; tries++) {
473                 if (hip04_cpu_table[cluster][cpu])
474                         goto err;
475 @@ -211,10 +211,10 @@
476                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
477                 if (data & CORE_WFI_STATUS(cpu))
478                         break;
479 -               spin_unlock_irq(&boot_lock);
480 +               raw_spin_unlock_irq(&boot_lock);
481                 /* Wait for clean L2 when the whole cluster is down. */
482                 msleep(POLL_MSEC);
483 -               spin_lock_irq(&boot_lock);
484 +               raw_spin_lock_irq(&boot_lock);
485         }
486         if (tries >= count)
487                 goto err;
488 @@ -231,10 +231,10 @@
489                 goto err;
490         if (hip04_cluster_is_down(cluster))
491                 hip04_set_snoop_filter(cluster, 0);
492 -       spin_unlock_irq(&boot_lock);
493 +       raw_spin_unlock_irq(&boot_lock);
494         return 1;
495  err:
496 -       spin_unlock_irq(&boot_lock);
497 +       raw_spin_unlock_irq(&boot_lock);
498         return 0;
499  }
500  #endif
501 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c linux-4.14/arch/arm/mach-omap2/omap-smp.c
502 --- linux-4.14.orig/arch/arm/mach-omap2/omap-smp.c      2018-09-05 11:03:20.000000000 +0200
503 +++ linux-4.14/arch/arm/mach-omap2/omap-smp.c   2018-09-05 11:05:07.000000000 +0200
504 @@ -69,7 +69,7 @@
505         .startup_addr = omap5_secondary_startup,
506  };
507  
508 -static DEFINE_SPINLOCK(boot_lock);
509 +static DEFINE_RAW_SPINLOCK(boot_lock);
510  
511  void __iomem *omap4_get_scu_base(void)
512  {
513 @@ -177,8 +177,8 @@
514         /*
515          * Synchronise with the boot thread.
516          */
517 -       spin_lock(&boot_lock);
518 -       spin_unlock(&boot_lock);
519 +       raw_spin_lock(&boot_lock);
520 +       raw_spin_unlock(&boot_lock);
521  }
522  
523  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
524 @@ -191,7 +191,7 @@
525          * Set synchronisation state between this boot processor
526          * and the secondary one
527          */
528 -       spin_lock(&boot_lock);
529 +       raw_spin_lock(&boot_lock);
530  
531         /*
532          * Update the AuxCoreBoot0 with boot state for secondary core.
533 @@ -270,7 +270,7 @@
534          * Now the secondary core is starting up let it run its
535          * calibrations, then wait for it to finish
536          */
537 -       spin_unlock(&boot_lock);
538 +       raw_spin_unlock(&boot_lock);
539  
540         return 0;
541  }
542 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-prima2/platsmp.c linux-4.14/arch/arm/mach-prima2/platsmp.c
543 --- linux-4.14.orig/arch/arm/mach-prima2/platsmp.c      2017-11-12 19:46:13.000000000 +0100
544 +++ linux-4.14/arch/arm/mach-prima2/platsmp.c   2018-09-05 11:05:07.000000000 +0200
545 @@ -22,7 +22,7 @@
546  
547  static void __iomem *clk_base;
548  
549 -static DEFINE_SPINLOCK(boot_lock);
550 +static DEFINE_RAW_SPINLOCK(boot_lock);
551  
552  static void sirfsoc_secondary_init(unsigned int cpu)
553  {
554 @@ -36,8 +36,8 @@
555         /*
556          * Synchronise with the boot thread.
557          */
558 -       spin_lock(&boot_lock);
559 -       spin_unlock(&boot_lock);
560 +       raw_spin_lock(&boot_lock);
561 +       raw_spin_unlock(&boot_lock);
562  }
563  
564  static const struct of_device_id clk_ids[]  = {
565 @@ -75,7 +75,7 @@
566         /* make sure write buffer is drained */
567         mb();
568  
569 -       spin_lock(&boot_lock);
570 +       raw_spin_lock(&boot_lock);
571  
572         /*
573          * The secondary processor is waiting to be released from
574 @@ -107,7 +107,7 @@
575          * now the secondary core is starting up let it run its
576          * calibrations, then wait for it to finish
577          */
578 -       spin_unlock(&boot_lock);
579 +       raw_spin_unlock(&boot_lock);
580  
581         return pen_release != -1 ? -ENOSYS : 0;
582  }
583 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-qcom/platsmp.c linux-4.14/arch/arm/mach-qcom/platsmp.c
584 --- linux-4.14.orig/arch/arm/mach-qcom/platsmp.c        2017-11-12 19:46:13.000000000 +0100
585 +++ linux-4.14/arch/arm/mach-qcom/platsmp.c     2018-09-05 11:05:07.000000000 +0200
586 @@ -46,7 +46,7 @@
587  
588  extern void secondary_startup_arm(void);
589  
590 -static DEFINE_SPINLOCK(boot_lock);
591 +static DEFINE_RAW_SPINLOCK(boot_lock);
592  
593  #ifdef CONFIG_HOTPLUG_CPU
594  static void qcom_cpu_die(unsigned int cpu)
595 @@ -60,8 +60,8 @@
596         /*
597          * Synchronise with the boot thread.
598          */
599 -       spin_lock(&boot_lock);
600 -       spin_unlock(&boot_lock);
601 +       raw_spin_lock(&boot_lock);
602 +       raw_spin_unlock(&boot_lock);
603  }
604  
605  static int scss_release_secondary(unsigned int cpu)
606 @@ -284,7 +284,7 @@
607          * set synchronisation state between this boot processor
608          * and the secondary one
609          */
610 -       spin_lock(&boot_lock);
611 +       raw_spin_lock(&boot_lock);
612  
613         /*
614          * Send the secondary CPU a soft interrupt, thereby causing
615 @@ -297,7 +297,7 @@
616          * now the secondary core is starting up let it run its
617          * calibrations, then wait for it to finish
618          */
619 -       spin_unlock(&boot_lock);
620 +       raw_spin_unlock(&boot_lock);
621  
622         return ret;
623  }
624 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-spear/platsmp.c linux-4.14/arch/arm/mach-spear/platsmp.c
625 --- linux-4.14.orig/arch/arm/mach-spear/platsmp.c       2017-11-12 19:46:13.000000000 +0100
626 +++ linux-4.14/arch/arm/mach-spear/platsmp.c    2018-09-05 11:05:07.000000000 +0200
627 @@ -32,7 +32,7 @@
628         sync_cache_w(&pen_release);
629  }
630  
631 -static DEFINE_SPINLOCK(boot_lock);
632 +static DEFINE_RAW_SPINLOCK(boot_lock);
633  
634  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
635  
636 @@ -47,8 +47,8 @@
637         /*
638          * Synchronise with the boot thread.
639          */
640 -       spin_lock(&boot_lock);
641 -       spin_unlock(&boot_lock);
642 +       raw_spin_lock(&boot_lock);
643 +       raw_spin_unlock(&boot_lock);
644  }
645  
646  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
647 @@ -59,7 +59,7 @@
648          * set synchronisation state between this boot processor
649          * and the secondary one
650          */
651 -       spin_lock(&boot_lock);
652 +       raw_spin_lock(&boot_lock);
653  
654         /*
655          * The secondary processor is waiting to be released from
656 @@ -84,7 +84,7 @@
657          * now the secondary core is starting up let it run its
658          * calibrations, then wait for it to finish
659          */
660 -       spin_unlock(&boot_lock);
661 +       raw_spin_unlock(&boot_lock);
662  
663         return pen_release != -1 ? -ENOSYS : 0;
664  }
665 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mach-sti/platsmp.c linux-4.14/arch/arm/mach-sti/platsmp.c
666 --- linux-4.14.orig/arch/arm/mach-sti/platsmp.c 2017-11-12 19:46:13.000000000 +0100
667 +++ linux-4.14/arch/arm/mach-sti/platsmp.c      2018-09-05 11:05:07.000000000 +0200
668 @@ -35,7 +35,7 @@
669         sync_cache_w(&pen_release);
670  }
671  
672 -static DEFINE_SPINLOCK(boot_lock);
673 +static DEFINE_RAW_SPINLOCK(boot_lock);
674  
675  static void sti_secondary_init(unsigned int cpu)
676  {
677 @@ -48,8 +48,8 @@
678         /*
679          * Synchronise with the boot thread.
680          */
681 -       spin_lock(&boot_lock);
682 -       spin_unlock(&boot_lock);
683 +       raw_spin_lock(&boot_lock);
684 +       raw_spin_unlock(&boot_lock);
685  }
686  
687  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
688 @@ -60,7 +60,7 @@
689          * set synchronisation state between this boot processor
690          * and the secondary one
691          */
692 -       spin_lock(&boot_lock);
693 +       raw_spin_lock(&boot_lock);
694  
695         /*
696          * The secondary processor is waiting to be released from
697 @@ -91,7 +91,7 @@
698          * now the secondary core is starting up let it run its
699          * calibrations, then wait for it to finish
700          */
701 -       spin_unlock(&boot_lock);
702 +       raw_spin_unlock(&boot_lock);
703  
704         return pen_release != -1 ? -ENOSYS : 0;
705  }
706 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/fault.c linux-4.14/arch/arm/mm/fault.c
707 --- linux-4.14.orig/arch/arm/mm/fault.c 2017-11-12 19:46:13.000000000 +0100
708 +++ linux-4.14/arch/arm/mm/fault.c      2018-09-05 11:05:07.000000000 +0200
709 @@ -434,6 +434,9 @@
710         if (addr < TASK_SIZE)
711                 return do_page_fault(addr, fsr, regs);
712  
713 +       if (interrupts_enabled(regs))
714 +               local_irq_enable();
715 +
716         if (user_mode(regs))
717                 goto bad_area;
718  
719 @@ -501,6 +504,9 @@
720  static int
721  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
722  {
723 +       if (interrupts_enabled(regs))
724 +               local_irq_enable();
725 +
726         do_bad_area(addr, fsr, regs);
727         return 0;
728  }
729 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/mm/highmem.c linux-4.14/arch/arm/mm/highmem.c
730 --- linux-4.14.orig/arch/arm/mm/highmem.c       2017-11-12 19:46:13.000000000 +0100
731 +++ linux-4.14/arch/arm/mm/highmem.c    2018-09-05 11:05:07.000000000 +0200
732 @@ -34,6 +34,11 @@
733         return *ptep;
734  }
735  
736 +static unsigned int fixmap_idx(int type)
737 +{
738 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
739 +}
740 +
741  void *kmap(struct page *page)
742  {
743         might_sleep();
744 @@ -54,12 +59,13 @@
745  
746  void *kmap_atomic(struct page *page)
747  {
748 +       pte_t pte = mk_pte(page, kmap_prot);
749         unsigned int idx;
750         unsigned long vaddr;
751         void *kmap;
752         int type;
753  
754 -       preempt_disable();
755 +       preempt_disable_nort();
756         pagefault_disable();
757         if (!PageHighMem(page))
758                 return page_address(page);
759 @@ -79,7 +85,7 @@
760  
761         type = kmap_atomic_idx_push();
762  
763 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
764 +       idx = fixmap_idx(type);
765         vaddr = __fix_to_virt(idx);
766  #ifdef CONFIG_DEBUG_HIGHMEM
767         /*
768 @@ -93,7 +99,10 @@
769          * in place, so the contained TLB flush ensures the TLB is updated
770          * with the new mapping.
771          */
772 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
773 +#ifdef CONFIG_PREEMPT_RT_FULL
774 +       current->kmap_pte[type] = pte;
775 +#endif
776 +       set_fixmap_pte(idx, pte);
777  
778         return (void *)vaddr;
779  }
780 @@ -106,44 +115,75 @@
781  
782         if (kvaddr >= (void *)FIXADDR_START) {
783                 type = kmap_atomic_idx();
784 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
785 +               idx = fixmap_idx(type);
786  
787                 if (cache_is_vivt())
788                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
789 +#ifdef CONFIG_PREEMPT_RT_FULL
790 +               current->kmap_pte[type] = __pte(0);
791 +#endif
792  #ifdef CONFIG_DEBUG_HIGHMEM
793                 BUG_ON(vaddr != __fix_to_virt(idx));
794 -               set_fixmap_pte(idx, __pte(0));
795  #else
796                 (void) idx;  /* to kill a warning */
797  #endif
798 +               set_fixmap_pte(idx, __pte(0));
799                 kmap_atomic_idx_pop();
800         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
801                 /* this address was obtained through kmap_high_get() */
802                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
803         }
804         pagefault_enable();
805 -       preempt_enable();
806 +       preempt_enable_nort();
807  }
808  EXPORT_SYMBOL(__kunmap_atomic);
809  
810  void *kmap_atomic_pfn(unsigned long pfn)
811  {
812 +       pte_t pte = pfn_pte(pfn, kmap_prot);
813         unsigned long vaddr;
814         int idx, type;
815         struct page *page = pfn_to_page(pfn);
816  
817 -       preempt_disable();
818 +       preempt_disable_nort();
819         pagefault_disable();
820         if (!PageHighMem(page))
821                 return page_address(page);
822  
823         type = kmap_atomic_idx_push();
824 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
825 +       idx = fixmap_idx(type);
826         vaddr = __fix_to_virt(idx);
827  #ifdef CONFIG_DEBUG_HIGHMEM
828         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
829  #endif
830 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
831 +#ifdef CONFIG_PREEMPT_RT_FULL
832 +       current->kmap_pte[type] = pte;
833 +#endif
834 +       set_fixmap_pte(idx, pte);
835  
836         return (void *)vaddr;
837  }
838 +#if defined CONFIG_PREEMPT_RT_FULL
839 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
840 +{
841 +       int i;
842 +
843 +       /*
844 +        * Clear @prev's kmap_atomic mappings
845 +        */
846 +       for (i = 0; i < prev_p->kmap_idx; i++) {
847 +               int idx = fixmap_idx(i);
848 +
849 +               set_fixmap_pte(idx, __pte(0));
850 +       }
851 +       /*
852 +        * Restore @next_p's kmap_atomic mappings
853 +        */
854 +       for (i = 0; i < next_p->kmap_idx; i++) {
855 +               int idx = fixmap_idx(i);
856 +
857 +               if (!pte_none(next_p->kmap_pte[i]))
858 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
859 +       }
860 +}
861 +#endif
862 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm/plat-versatile/platsmp.c linux-4.14/arch/arm/plat-versatile/platsmp.c
863 --- linux-4.14.orig/arch/arm/plat-versatile/platsmp.c   2017-11-12 19:46:13.000000000 +0100
864 +++ linux-4.14/arch/arm/plat-versatile/platsmp.c        2018-09-05 11:05:07.000000000 +0200
865 @@ -32,7 +32,7 @@
866         sync_cache_w(&pen_release);
867  }
868  
869 -static DEFINE_SPINLOCK(boot_lock);
870 +static DEFINE_RAW_SPINLOCK(boot_lock);
871  
872  void versatile_secondary_init(unsigned int cpu)
873  {
874 @@ -45,8 +45,8 @@
875         /*
876          * Synchronise with the boot thread.
877          */
878 -       spin_lock(&boot_lock);
879 -       spin_unlock(&boot_lock);
880 +       raw_spin_lock(&boot_lock);
881 +       raw_spin_unlock(&boot_lock);
882  }
883  
884  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
885 @@ -57,7 +57,7 @@
886          * Set synchronisation state between this boot processor
887          * and the secondary one
888          */
889 -       spin_lock(&boot_lock);
890 +       raw_spin_lock(&boot_lock);
891  
892         /*
893          * This is really belt and braces; we hold unintended secondary
894 @@ -87,7 +87,7 @@
895          * now the secondary core is starting up let it run its
896          * calibrations, then wait for it to finish
897          */
898 -       spin_unlock(&boot_lock);
899 +       raw_spin_unlock(&boot_lock);
900  
901         return pen_release != -1 ? -ENOSYS : 0;
902  }
903 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c linux-4.14/arch/arm64/crypto/crc32-ce-glue.c
904 --- linux-4.14.orig/arch/arm64/crypto/crc32-ce-glue.c   2018-09-05 11:03:20.000000000 +0200
905 +++ linux-4.14/arch/arm64/crypto/crc32-ce-glue.c        2018-09-05 11:05:07.000000000 +0200
906 @@ -208,7 +208,8 @@
907  
908  static int __init crc32_pmull_mod_init(void)
909  {
910 -       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
911 +       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
912 +           !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
913                 crc32_pmull_algs[0].update = crc32_pmull_update;
914                 crc32_pmull_algs[1].update = crc32c_pmull_update;
915  
916 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/crypto/Kconfig linux-4.14/arch/arm64/crypto/Kconfig
917 --- linux-4.14.orig/arch/arm64/crypto/Kconfig   2017-11-12 19:46:13.000000000 +0100
918 +++ linux-4.14/arch/arm64/crypto/Kconfig        2018-09-05 11:05:07.000000000 +0200
919 @@ -19,19 +19,19 @@
920  
921  config CRYPTO_SHA1_ARM64_CE
922         tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
923 -       depends on KERNEL_MODE_NEON
924 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
925         select CRYPTO_HASH
926         select CRYPTO_SHA1
927  
928  config CRYPTO_SHA2_ARM64_CE
929         tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
930 -       depends on KERNEL_MODE_NEON
931 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
932         select CRYPTO_HASH
933         select CRYPTO_SHA256_ARM64
934  
935  config CRYPTO_GHASH_ARM64_CE
936         tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
937 -       depends on KERNEL_MODE_NEON
938 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
939         select CRYPTO_HASH
940         select CRYPTO_GF128MUL
941         select CRYPTO_AES
942 @@ -39,7 +39,7 @@
943  
944  config CRYPTO_CRCT10DIF_ARM64_CE
945         tristate "CRCT10DIF digest algorithm using PMULL instructions"
946 -       depends on KERNEL_MODE_NEON && CRC_T10DIF
947 +       depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
948         select CRYPTO_HASH
949  
950  config CRYPTO_CRC32_ARM64_CE
951 @@ -53,13 +53,13 @@
952  
953  config CRYPTO_AES_ARM64_CE
954         tristate "AES core cipher using ARMv8 Crypto Extensions"
955 -       depends on ARM64 && KERNEL_MODE_NEON
956 +       depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
957         select CRYPTO_ALGAPI
958         select CRYPTO_AES_ARM64
959  
960  config CRYPTO_AES_ARM64_CE_CCM
961         tristate "AES in CCM mode using ARMv8 Crypto Extensions"
962 -       depends on ARM64 && KERNEL_MODE_NEON
963 +       depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
964         select CRYPTO_ALGAPI
965         select CRYPTO_AES_ARM64_CE
966         select CRYPTO_AES_ARM64
967 @@ -67,7 +67,7 @@
968  
969  config CRYPTO_AES_ARM64_CE_BLK
970         tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
971 -       depends on KERNEL_MODE_NEON
972 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
973         select CRYPTO_BLKCIPHER
974         select CRYPTO_AES_ARM64_CE
975         select CRYPTO_AES_ARM64
976 @@ -75,7 +75,7 @@
977  
978  config CRYPTO_AES_ARM64_NEON_BLK
979         tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
980 -       depends on KERNEL_MODE_NEON
981 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
982         select CRYPTO_BLKCIPHER
983         select CRYPTO_AES_ARM64
984         select CRYPTO_AES
985 @@ -83,13 +83,13 @@
986  
987  config CRYPTO_CHACHA20_NEON
988         tristate "NEON accelerated ChaCha20 symmetric cipher"
989 -       depends on KERNEL_MODE_NEON
990 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
991         select CRYPTO_BLKCIPHER
992         select CRYPTO_CHACHA20
993  
994  config CRYPTO_AES_ARM64_BS
995         tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
996 -       depends on KERNEL_MODE_NEON
997 +       depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
998         select CRYPTO_BLKCIPHER
999         select CRYPTO_AES_ARM64_NEON_BLK
1000         select CRYPTO_AES_ARM64
1001 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h linux-4.14/arch/arm64/include/asm/spinlock_types.h
1002 --- linux-4.14.orig/arch/arm64/include/asm/spinlock_types.h     2017-11-12 19:46:13.000000000 +0100
1003 +++ linux-4.14/arch/arm64/include/asm/spinlock_types.h  2018-09-05 11:05:07.000000000 +0200
1004 @@ -16,10 +16,6 @@
1005  #ifndef __ASM_SPINLOCK_TYPES_H
1006  #define __ASM_SPINLOCK_TYPES_H
1007  
1008 -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
1009 -# error "please don't include this file directly"
1010 -#endif
1011 -
1012  #include <linux/types.h>
1013  
1014  #define TICKET_SHIFT   16
1015 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/include/asm/thread_info.h linux-4.14/arch/arm64/include/asm/thread_info.h
1016 --- linux-4.14.orig/arch/arm64/include/asm/thread_info.h        2018-09-05 11:03:20.000000000 +0200
1017 +++ linux-4.14/arch/arm64/include/asm/thread_info.h     2018-09-05 11:05:07.000000000 +0200
1018 @@ -43,6 +43,7 @@
1019         u64                     ttbr0;          /* saved TTBR0_EL1 */
1020  #endif
1021         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1022 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1023  };
1024  
1025  #define INIT_THREAD_INFO(tsk)                                          \
1026 @@ -82,6 +83,7 @@
1027  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1028  #define TIF_UPROBE             4       /* uprobe breakpoint or singlestep */
1029  #define TIF_FSCHECK            5       /* Check FS is USER_DS on return */
1030 +#define TIF_NEED_RESCHED_LAZY  6
1031  #define TIF_NOHZ               7
1032  #define TIF_SYSCALL_TRACE      8
1033  #define TIF_SYSCALL_AUDIT      9
1034 @@ -98,6 +100,7 @@
1035  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1036  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1037  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1038 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1039  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1040  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1041  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1042 @@ -109,8 +112,9 @@
1043  
1044  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1045                                  _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1046 -                                _TIF_UPROBE | _TIF_FSCHECK)
1047 +                                _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
1048  
1049 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1050  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1051                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1052                                  _TIF_NOHZ)
1053 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/Kconfig linux-4.14/arch/arm64/Kconfig
1054 --- linux-4.14.orig/arch/arm64/Kconfig  2018-09-05 11:03:20.000000000 +0200
1055 +++ linux-4.14/arch/arm64/Kconfig       2018-09-05 11:05:07.000000000 +0200
1056 @@ -103,6 +103,7 @@
1057         select HAVE_PERF_EVENTS
1058         select HAVE_PERF_REGS
1059         select HAVE_PERF_USER_STACK_DUMP
1060 +       select HAVE_PREEMPT_LAZY
1061         select HAVE_REGS_AND_STACK_ACCESS_API
1062         select HAVE_RCU_TABLE_FREE
1063         select HAVE_SYSCALL_TRACEPOINTS
1064 @@ -791,7 +792,7 @@
1065  
1066  config XEN
1067         bool "Xen guest support on ARM64"
1068 -       depends on ARM64 && OF
1069 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1070         select SWIOTLB_XEN
1071         select PARAVIRT
1072         help
1073 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/asm-offsets.c linux-4.14/arch/arm64/kernel/asm-offsets.c
1074 --- linux-4.14.orig/arch/arm64/kernel/asm-offsets.c     2018-09-05 11:03:20.000000000 +0200
1075 +++ linux-4.14/arch/arm64/kernel/asm-offsets.c  2018-09-05 11:05:07.000000000 +0200
1076 @@ -39,6 +39,7 @@
1077    BLANK();
1078    DEFINE(TSK_TI_FLAGS,         offsetof(struct task_struct, thread_info.flags));
1079    DEFINE(TSK_TI_PREEMPT,       offsetof(struct task_struct, thread_info.preempt_count));
1080 +  DEFINE(TSK_TI_PREEMPT_LAZY,  offsetof(struct task_struct, thread_info.preempt_lazy_count));
1081    DEFINE(TSK_TI_ADDR_LIMIT,    offsetof(struct task_struct, thread_info.addr_limit));
1082  #ifdef CONFIG_ARM64_SW_TTBR0_PAN
1083    DEFINE(TSK_TI_TTBR0,         offsetof(struct task_struct, thread_info.ttbr0));
1084 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/entry.S linux-4.14/arch/arm64/kernel/entry.S
1085 --- linux-4.14.orig/arch/arm64/kernel/entry.S   2018-09-05 11:03:20.000000000 +0200
1086 +++ linux-4.14/arch/arm64/kernel/entry.S        2018-09-05 11:05:07.000000000 +0200
1087 @@ -637,11 +637,16 @@
1088  
1089  #ifdef CONFIG_PREEMPT
1090         ldr     w24, [tsk, #TSK_TI_PREEMPT]     // get preempt count
1091 -       cbnz    w24, 1f                         // preempt count != 0
1092 +       cbnz    w24, 2f                         // preempt count != 0
1093         ldr     x0, [tsk, #TSK_TI_FLAGS]        // get flags
1094 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1095 -       bl      el1_preempt
1096 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1097 +
1098 +       ldr     w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
1099 +       cbnz    w24, 2f                         // preempt lazy count != 0
1100 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1101  1:
1102 +       bl      el1_preempt
1103 +2:
1104  #endif
1105  #ifdef CONFIG_TRACE_IRQFLAGS
1106         bl      trace_hardirqs_on
1107 @@ -655,6 +660,7 @@
1108  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1109         ldr     x0, [tsk, #TSK_TI_FLAGS]        // get new tasks TI_FLAGS
1110         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1111 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1112         ret     x24
1113  #endif
1114  
1115 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/arm64/kernel/signal.c linux-4.14/arch/arm64/kernel/signal.c
1116 --- linux-4.14.orig/arch/arm64/kernel/signal.c  2018-09-05 11:03:20.000000000 +0200
1117 +++ linux-4.14/arch/arm64/kernel/signal.c       2018-09-05 11:05:07.000000000 +0200
1118 @@ -756,7 +756,7 @@
1119                 /* Check valid user FS if needed */
1120                 addr_limit_user_check();
1121  
1122 -               if (thread_flags & _TIF_NEED_RESCHED) {
1123 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1124                         schedule();
1125                 } else {
1126                         local_irq_enable();
1127 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h linux-4.14/arch/blackfin/include/asm/spinlock_types.h
1128 --- linux-4.14.orig/arch/blackfin/include/asm/spinlock_types.h  2017-11-12 19:46:13.000000000 +0100
1129 +++ linux-4.14/arch/blackfin/include/asm/spinlock_types.h       2018-09-05 11:05:07.000000000 +0200
1130 @@ -7,10 +7,6 @@
1131  #ifndef __ASM_SPINLOCK_TYPES_H
1132  #define __ASM_SPINLOCK_TYPES_H
1133  
1134 -#ifndef __LINUX_SPINLOCK_TYPES_H
1135 -# error "please don't include this file directly"
1136 -#endif
1137 -
1138  #include <asm/rwlock.h>
1139  
1140  typedef struct {
1141 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h linux-4.14/arch/hexagon/include/asm/spinlock_types.h
1142 --- linux-4.14.orig/arch/hexagon/include/asm/spinlock_types.h   2017-11-12 19:46:13.000000000 +0100
1143 +++ linux-4.14/arch/hexagon/include/asm/spinlock_types.h        2018-09-05 11:05:07.000000000 +0200
1144 @@ -21,10 +21,6 @@
1145  #ifndef _ASM_SPINLOCK_TYPES_H
1146  #define _ASM_SPINLOCK_TYPES_H
1147  
1148 -#ifndef __LINUX_SPINLOCK_TYPES_H
1149 -# error "please don't include this file directly"
1150 -#endif
1151 -
1152  typedef struct {
1153         volatile unsigned int lock;
1154  } arch_spinlock_t;
1155 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h linux-4.14/arch/ia64/include/asm/spinlock_types.h
1156 --- linux-4.14.orig/arch/ia64/include/asm/spinlock_types.h      2017-11-12 19:46:13.000000000 +0100
1157 +++ linux-4.14/arch/ia64/include/asm/spinlock_types.h   2018-09-05 11:05:07.000000000 +0200
1158 @@ -2,10 +2,6 @@
1159  #ifndef _ASM_IA64_SPINLOCK_TYPES_H
1160  #define _ASM_IA64_SPINLOCK_TYPES_H
1161  
1162 -#ifndef __LINUX_SPINLOCK_TYPES_H
1163 -# error "please don't include this file directly"
1164 -#endif
1165 -
1166  typedef struct {
1167         volatile unsigned int lock;
1168  } arch_spinlock_t;
1169 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/ia64/kernel/mca.c linux-4.14/arch/ia64/kernel/mca.c
1170 --- linux-4.14.orig/arch/ia64/kernel/mca.c      2017-11-12 19:46:13.000000000 +0100
1171 +++ linux-4.14/arch/ia64/kernel/mca.c   2018-09-05 11:05:07.000000000 +0200
1172 @@ -1824,7 +1824,7 @@
1173         ti->cpu = cpu;
1174         p->stack = ti;
1175         p->state = TASK_UNINTERRUPTIBLE;
1176 -       cpumask_set_cpu(cpu, &p->cpus_allowed);
1177 +       cpumask_set_cpu(cpu, &p->cpus_mask);
1178         INIT_LIST_HEAD(&p->tasks);
1179         p->parent = p->real_parent = p->group_leader = p;
1180         INIT_LIST_HEAD(&p->children);
1181 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/Kconfig linux-4.14/arch/Kconfig
1182 --- linux-4.14.orig/arch/Kconfig        2018-09-05 11:03:20.000000000 +0200
1183 +++ linux-4.14/arch/Kconfig     2018-09-05 11:05:07.000000000 +0200
1184 @@ -20,6 +20,7 @@
1185         tristate "OProfile system profiling"
1186         depends on PROFILING
1187         depends on HAVE_OPROFILE
1188 +       depends on !PREEMPT_RT_FULL
1189         select RING_BUFFER
1190         select RING_BUFFER_ALLOW_SWAP
1191         help
1192 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h linux-4.14/arch/m32r/include/asm/spinlock_types.h
1193 --- linux-4.14.orig/arch/m32r/include/asm/spinlock_types.h      2017-11-12 19:46:13.000000000 +0100
1194 +++ linux-4.14/arch/m32r/include/asm/spinlock_types.h   2018-09-05 11:05:07.000000000 +0200
1195 @@ -2,10 +2,6 @@
1196  #ifndef _ASM_M32R_SPINLOCK_TYPES_H
1197  #define _ASM_M32R_SPINLOCK_TYPES_H
1198  
1199 -#ifndef __LINUX_SPINLOCK_TYPES_H
1200 -# error "please don't include this file directly"
1201 -#endif
1202 -
1203  typedef struct {
1204         volatile int slock;
1205  } arch_spinlock_t;
1206 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/metag/include/asm/spinlock_types.h linux-4.14/arch/metag/include/asm/spinlock_types.h
1207 --- linux-4.14.orig/arch/metag/include/asm/spinlock_types.h     2017-11-12 19:46:13.000000000 +0100
1208 +++ linux-4.14/arch/metag/include/asm/spinlock_types.h  2018-09-05 11:05:07.000000000 +0200
1209 @@ -2,10 +2,6 @@
1210  #ifndef _ASM_METAG_SPINLOCK_TYPES_H
1211  #define _ASM_METAG_SPINLOCK_TYPES_H
1212  
1213 -#ifndef __LINUX_SPINLOCK_TYPES_H
1214 -# error "please don't include this file directly"
1215 -#endif
1216 -
1217  typedef struct {
1218         volatile unsigned int lock;
1219  } arch_spinlock_t;
1220 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/include/asm/switch_to.h linux-4.14/arch/mips/include/asm/switch_to.h
1221 --- linux-4.14.orig/arch/mips/include/asm/switch_to.h   2017-11-12 19:46:13.000000000 +0100
1222 +++ linux-4.14/arch/mips/include/asm/switch_to.h        2018-09-05 11:05:07.000000000 +0200
1223 @@ -42,7 +42,7 @@
1224   * inline to try to keep the overhead down. If we have been forced to run on
1225   * a "CPU" with an FPU because of a previous high level of FP computation,
1226   * but did not actually use the FPU during the most recent time-slice (CU1
1227 - * isn't set), we undo the restriction on cpus_allowed.
1228 + * isn't set), we undo the restriction on cpus_mask.
1229   *
1230   * We're not calling set_cpus_allowed() here, because we have no need to
1231   * force prompt migration - we're already switching the current CPU to a
1232 @@ -57,7 +57,7 @@
1233             test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) &&             \
1234             (!(KSTK_STATUS(prev) & ST0_CU1))) {                         \
1235                 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND);          \
1236 -               prev->cpus_allowed = prev->thread.user_cpus_allowed;    \
1237 +               prev->cpus_mask = prev->thread.user_cpus_allowed;       \
1238         }                                                               \
1239         next->thread.emulated_fp = 0;                                   \
1240  } while(0)
1241 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/Kconfig linux-4.14/arch/mips/Kconfig
1242 --- linux-4.14.orig/arch/mips/Kconfig   2018-09-05 11:03:20.000000000 +0200
1243 +++ linux-4.14/arch/mips/Kconfig        2018-09-05 11:05:07.000000000 +0200
1244 @@ -2519,7 +2519,7 @@
1245  #
1246  config HIGHMEM
1247         bool "High Memory Support"
1248 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1249 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1250  
1251  config CPU_SUPPORTS_HIGHMEM
1252         bool
1253 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c linux-4.14/arch/mips/kernel/mips-mt-fpaff.c
1254 --- linux-4.14.orig/arch/mips/kernel/mips-mt-fpaff.c    2017-11-12 19:46:13.000000000 +0100
1255 +++ linux-4.14/arch/mips/kernel/mips-mt-fpaff.c 2018-09-05 11:05:07.000000000 +0200
1256 @@ -177,7 +177,7 @@
1257         if (retval)
1258                 goto out_unlock;
1259  
1260 -       cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
1261 +       cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
1262         cpumask_and(&mask, &allowed, cpu_active_mask);
1263  
1264  out_unlock:
1265 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mips/kernel/traps.c linux-4.14/arch/mips/kernel/traps.c
1266 --- linux-4.14.orig/arch/mips/kernel/traps.c    2018-09-05 11:03:20.000000000 +0200
1267 +++ linux-4.14/arch/mips/kernel/traps.c 2018-09-05 11:05:07.000000000 +0200
1268 @@ -1193,12 +1193,12 @@
1269                  * restricted the allowed set to exclude any CPUs with FPUs,
1270                  * we'll skip the procedure.
1271                  */
1272 -               if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
1273 +               if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
1274                         cpumask_t tmask;
1275  
1276                         current->thread.user_cpus_allowed
1277 -                               = current->cpus_allowed;
1278 -                       cpumask_and(&tmask, &current->cpus_allowed,
1279 +                               = current->cpus_mask;
1280 +                       cpumask_and(&tmask, &current->cpus_mask,
1281                                     &mt_fpu_cpumask);
1282                         set_cpus_allowed_ptr(current, &tmask);
1283                         set_thread_flag(TIF_FPUBOUND);
1284 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h linux-4.14/arch/mn10300/include/asm/spinlock_types.h
1285 --- linux-4.14.orig/arch/mn10300/include/asm/spinlock_types.h   2017-11-12 19:46:13.000000000 +0100
1286 +++ linux-4.14/arch/mn10300/include/asm/spinlock_types.h        2018-09-05 11:05:07.000000000 +0200
1287 @@ -2,10 +2,6 @@
1288  #ifndef _ASM_SPINLOCK_TYPES_H
1289  #define _ASM_SPINLOCK_TYPES_H
1290  
1291 -#ifndef __LINUX_SPINLOCK_TYPES_H
1292 -# error "please don't include this file directly"
1293 -#endif
1294 -
1295  typedef struct arch_spinlock {
1296         unsigned int slock;
1297  } arch_spinlock_t;
1298 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h linux-4.14/arch/powerpc/include/asm/spinlock_types.h
1299 --- linux-4.14.orig/arch/powerpc/include/asm/spinlock_types.h   2017-11-12 19:46:13.000000000 +0100
1300 +++ linux-4.14/arch/powerpc/include/asm/spinlock_types.h        2018-09-05 11:05:07.000000000 +0200
1301 @@ -2,10 +2,6 @@
1302  #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
1303  #define _ASM_POWERPC_SPINLOCK_TYPES_H
1304  
1305 -#ifndef __LINUX_SPINLOCK_TYPES_H
1306 -# error "please don't include this file directly"
1307 -#endif
1308 -
1309  typedef struct {
1310         volatile unsigned int slock;
1311  } arch_spinlock_t;
1312 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/include/asm/thread_info.h linux-4.14/arch/powerpc/include/asm/thread_info.h
1313 --- linux-4.14.orig/arch/powerpc/include/asm/thread_info.h      2017-11-12 19:46:13.000000000 +0100
1314 +++ linux-4.14/arch/powerpc/include/asm/thread_info.h   2018-09-05 11:05:07.000000000 +0200
1315 @@ -36,6 +36,8 @@
1316         int             cpu;                    /* cpu we're on */
1317         int             preempt_count;          /* 0 => preemptable,
1318                                                    <0 => BUG */
1319 +       int             preempt_lazy_count;     /* 0 => preemptable,
1320 +                                                  <0 => BUG */
1321         unsigned long   local_flags;            /* private flags for thread */
1322  #ifdef CONFIG_LIVEPATCH
1323         unsigned long *livepatch_sp;
1324 @@ -81,8 +83,7 @@
1325  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1326  #define TIF_SIGPENDING         1       /* signal pending */
1327  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1328 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1329 -                                          TIF_NEED_RESCHED */
1330 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1331  #define TIF_32BIT              4       /* 32 bit binary */
1332  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1333  #define TIF_PATCH_PENDING      6       /* pending live patching update */
1334 @@ -101,6 +102,8 @@
1335  #if defined(CONFIG_PPC64)
1336  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1337  #endif
1338 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1339 +                                          TIF_NEED_RESCHED */
1340  
1341  /* as above, but as bit values */
1342  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1343 @@ -120,14 +123,16 @@
1344  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1345  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1346  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1347 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1348  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1349                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1350                                  _TIF_NOHZ)
1351  
1352  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1353                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1354 -                                _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
1355 +                                _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
1356  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1357 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1358  
1359  /* Bits in local_flags */
1360  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1361 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/Kconfig linux-4.14/arch/powerpc/Kconfig
1362 --- linux-4.14.orig/arch/powerpc/Kconfig        2018-09-05 11:03:20.000000000 +0200
1363 +++ linux-4.14/arch/powerpc/Kconfig     2018-09-05 11:05:07.000000000 +0200
1364 @@ -111,10 +111,11 @@
1365  
1366  config RWSEM_GENERIC_SPINLOCK
1367         bool
1368 +       default y if PREEMPT_RT_FULL
1369  
1370  config RWSEM_XCHGADD_ALGORITHM
1371         bool
1372 -       default y
1373 +       default y if !PREEMPT_RT_FULL
1374  
1375  config GENERIC_LOCKBREAK
1376         bool
1377 @@ -215,6 +216,7 @@
1378         select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
1379         select HAVE_PERF_REGS
1380         select HAVE_PERF_USER_STACK_DUMP
1381 +       select HAVE_PREEMPT_LAZY
1382         select HAVE_RCU_TABLE_FREE              if SMP
1383         select HAVE_REGS_AND_STACK_ACCESS_API
1384         select HAVE_SYSCALL_TRACEPOINTS
1385 @@ -390,7 +392,7 @@
1386  
1387  config HIGHMEM
1388         bool "High memory support"
1389 -       depends on PPC32
1390 +       depends on PPC32 && !PREEMPT_RT_FULL
1391  
1392  source kernel/Kconfig.hz
1393  source kernel/Kconfig.preempt
1394 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c linux-4.14/arch/powerpc/kernel/asm-offsets.c
1395 --- linux-4.14.orig/arch/powerpc/kernel/asm-offsets.c   2018-09-05 11:03:20.000000000 +0200
1396 +++ linux-4.14/arch/powerpc/kernel/asm-offsets.c        2018-09-05 11:05:07.000000000 +0200
1397 @@ -156,6 +156,7 @@
1398         OFFSET(TI_FLAGS, thread_info, flags);
1399         OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
1400         OFFSET(TI_PREEMPT, thread_info, preempt_count);
1401 +       OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
1402         OFFSET(TI_TASK, thread_info, task);
1403         OFFSET(TI_CPU, thread_info, cpu);
1404  
1405 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_32.S linux-4.14/arch/powerpc/kernel/entry_32.S
1406 --- linux-4.14.orig/arch/powerpc/kernel/entry_32.S      2017-11-12 19:46:13.000000000 +0100
1407 +++ linux-4.14/arch/powerpc/kernel/entry_32.S   2018-09-05 11:05:07.000000000 +0200
1408 @@ -866,7 +866,14 @@
1409         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1410         bne     restore
1411         andi.   r8,r8,_TIF_NEED_RESCHED
1412 +       bne+    1f
1413 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1414 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1415 +       bne     restore
1416 +       lwz     r0,TI_FLAGS(r9)
1417 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1418         beq+    restore
1419 +1:
1420         lwz     r3,_MSR(r1)
1421         andi.   r0,r3,MSR_EE    /* interrupts off? */
1422         beq     restore         /* don't schedule if so */
1423 @@ -877,11 +884,11 @@
1424          */
1425         bl      trace_hardirqs_off
1426  #endif
1427 -1:     bl      preempt_schedule_irq
1428 +2:     bl      preempt_schedule_irq
1429         CURRENT_THREAD_INFO(r9, r1)
1430         lwz     r3,TI_FLAGS(r9)
1431 -       andi.   r0,r3,_TIF_NEED_RESCHED
1432 -       bne-    1b
1433 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1434 +       bne-    2b
1435  #ifdef CONFIG_TRACE_IRQFLAGS
1436         /* And now, to properly rebalance the above, we tell lockdep they
1437          * are being turned back on, which will happen when we return
1438 @@ -1204,7 +1211,7 @@
1439  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1440  
1441  do_work:                       /* r10 contains MSR_KERNEL here */
1442 -       andi.   r0,r9,_TIF_NEED_RESCHED
1443 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1444         beq     do_user_signal
1445  
1446  do_resched:                    /* r10 contains MSR_KERNEL here */
1447 @@ -1225,7 +1232,7 @@
1448         MTMSRD(r10)             /* disable interrupts */
1449         CURRENT_THREAD_INFO(r9, r1)
1450         lwz     r9,TI_FLAGS(r9)
1451 -       andi.   r0,r9,_TIF_NEED_RESCHED
1452 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1453         bne-    do_resched
1454         andi.   r0,r9,_TIF_USER_WORK_MASK
1455         beq     restore_user
1456 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/entry_64.S linux-4.14/arch/powerpc/kernel/entry_64.S
1457 --- linux-4.14.orig/arch/powerpc/kernel/entry_64.S      2018-09-05 11:03:20.000000000 +0200
1458 +++ linux-4.14/arch/powerpc/kernel/entry_64.S   2018-09-05 11:05:07.000000000 +0200
1459 @@ -690,7 +690,7 @@
1460         bl      restore_math
1461         b       restore
1462  #endif
1463 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1464 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1465         beq     2f
1466         bl      restore_interrupts
1467         SCHEDULE_USER
1468 @@ -752,10 +752,18 @@
1469  
1470  #ifdef CONFIG_PREEMPT
1471         /* Check if we need to preempt */
1472 +       lwz     r8,TI_PREEMPT(r9)
1473 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1474 +       bne     restore
1475         andi.   r0,r4,_TIF_NEED_RESCHED
1476 +       bne+    check_count
1477 +
1478 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1479         beq+    restore
1480 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1481 +
1482         /* Check that preempt_count() == 0 and interrupts are enabled */
1483 -       lwz     r8,TI_PREEMPT(r9)
1484 +check_count:
1485         cmpwi   cr1,r8,0
1486         ld      r0,SOFTE(r1)
1487         cmpdi   r0,0
1488 @@ -772,7 +780,7 @@
1489         /* Re-test flags and eventually loop */
1490         CURRENT_THREAD_INFO(r9, r1)
1491         ld      r4,TI_FLAGS(r9)
1492 -       andi.   r0,r4,_TIF_NEED_RESCHED
1493 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1494         bne     1b
1495  
1496         /*
1497 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/irq.c linux-4.14/arch/powerpc/kernel/irq.c
1498 --- linux-4.14.orig/arch/powerpc/kernel/irq.c   2018-09-05 11:03:20.000000000 +0200
1499 +++ linux-4.14/arch/powerpc/kernel/irq.c        2018-09-05 11:05:07.000000000 +0200
1500 @@ -693,6 +693,7 @@
1501         }
1502  }
1503  
1504 +#ifndef CONFIG_PREEMPT_RT_FULL
1505  void do_softirq_own_stack(void)
1506  {
1507         struct thread_info *curtp, *irqtp;
1508 @@ -710,6 +711,7 @@
1509         if (irqtp->flags)
1510                 set_bits(irqtp->flags, &curtp->flags);
1511  }
1512 +#endif
1513  
1514  irq_hw_number_t virq_to_hw(unsigned int virq)
1515  {
1516 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_32.S linux-4.14/arch/powerpc/kernel/misc_32.S
1517 --- linux-4.14.orig/arch/powerpc/kernel/misc_32.S       2017-11-12 19:46:13.000000000 +0100
1518 +++ linux-4.14/arch/powerpc/kernel/misc_32.S    2018-09-05 11:05:07.000000000 +0200
1519 @@ -41,6 +41,7 @@
1520   * We store the saved ksp_limit in the unused part
1521   * of the STACK_FRAME_OVERHEAD
1522   */
1523 +#ifndef CONFIG_PREEMPT_RT_FULL
1524  _GLOBAL(call_do_softirq)
1525         mflr    r0
1526         stw     r0,4(r1)
1527 @@ -57,6 +58,7 @@
1528         stw     r10,THREAD+KSP_LIMIT(r2)
1529         mtlr    r0
1530         blr
1531 +#endif
1532  
1533  /*
1534   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1535 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kernel/misc_64.S linux-4.14/arch/powerpc/kernel/misc_64.S
1536 --- linux-4.14.orig/arch/powerpc/kernel/misc_64.S       2018-09-05 11:03:20.000000000 +0200
1537 +++ linux-4.14/arch/powerpc/kernel/misc_64.S    2018-09-05 11:05:07.000000000 +0200
1538 @@ -31,6 +31,7 @@
1539  
1540         .text
1541  
1542 +#ifndef CONFIG_PREEMPT_RT_FULL
1543  _GLOBAL(call_do_softirq)
1544         mflr    r0
1545         std     r0,16(r1)
1546 @@ -41,6 +42,7 @@
1547         ld      r0,16(r1)
1548         mtlr    r0
1549         blr
1550 +#endif
1551  
1552  _GLOBAL(call_do_irq)
1553         mflr    r0
1554 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/kvm/Kconfig linux-4.14/arch/powerpc/kvm/Kconfig
1555 --- linux-4.14.orig/arch/powerpc/kvm/Kconfig    2018-09-05 11:03:20.000000000 +0200
1556 +++ linux-4.14/arch/powerpc/kvm/Kconfig 2018-09-05 11:05:07.000000000 +0200
1557 @@ -177,6 +177,7 @@
1558  config KVM_MPIC
1559         bool "KVM in-kernel MPIC emulation"
1560         depends on KVM && E500
1561 +       depends on !PREEMPT_RT_FULL
1562         select HAVE_KVM_IRQCHIP
1563         select HAVE_KVM_IRQFD
1564         select HAVE_KVM_IRQ_ROUTING
1565 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c
1566 --- linux-4.14.orig/arch/powerpc/platforms/cell/spufs/sched.c   2017-11-12 19:46:13.000000000 +0100
1567 +++ linux-4.14/arch/powerpc/platforms/cell/spufs/sched.c        2018-09-05 11:05:07.000000000 +0200
1568 @@ -141,7 +141,7 @@
1569          * runqueue. The context will be rescheduled on the proper node
1570          * if it is timesliced or preempted.
1571          */
1572 -       cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
1573 +       cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
1574  
1575         /* Save the current cpu id for spu interrupt routing. */
1576         ctx->last_ran = raw_smp_processor_id();
1577 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c linux-4.14/arch/powerpc/platforms/ps3/device-init.c
1578 --- linux-4.14.orig/arch/powerpc/platforms/ps3/device-init.c    2017-11-12 19:46:13.000000000 +0100
1579 +++ linux-4.14/arch/powerpc/platforms/ps3/device-init.c 2018-09-05 11:05:07.000000000 +0200
1580 @@ -752,7 +752,7 @@
1581         }
1582         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1583  
1584 -       res = wait_event_interruptible(dev->done.wait,
1585 +       res = swait_event_interruptible(dev->done.wait,
1586                                        dev->done.done || kthread_should_stop());
1587         if (kthread_should_stop())
1588                 res = -EINTR;
1589 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/s390/include/asm/spinlock_types.h linux-4.14/arch/s390/include/asm/spinlock_types.h
1590 --- linux-4.14.orig/arch/s390/include/asm/spinlock_types.h      2017-11-12 19:46:13.000000000 +0100
1591 +++ linux-4.14/arch/s390/include/asm/spinlock_types.h   2018-09-05 11:05:07.000000000 +0200
1592 @@ -2,10 +2,6 @@
1593  #ifndef __ASM_SPINLOCK_TYPES_H
1594  #define __ASM_SPINLOCK_TYPES_H
1595  
1596 -#ifndef __LINUX_SPINLOCK_TYPES_H
1597 -# error "please don't include this file directly"
1598 -#endif
1599 -
1600  typedef struct {
1601         int lock;
1602  } __attribute__ ((aligned (4))) arch_spinlock_t;
1603 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/include/asm/spinlock_types.h linux-4.14/arch/sh/include/asm/spinlock_types.h
1604 --- linux-4.14.orig/arch/sh/include/asm/spinlock_types.h        2017-11-12 19:46:13.000000000 +0100
1605 +++ linux-4.14/arch/sh/include/asm/spinlock_types.h     2018-09-05 11:05:07.000000000 +0200
1606 @@ -2,10 +2,6 @@
1607  #ifndef __ASM_SH_SPINLOCK_TYPES_H
1608  #define __ASM_SH_SPINLOCK_TYPES_H
1609  
1610 -#ifndef __LINUX_SPINLOCK_TYPES_H
1611 -# error "please don't include this file directly"
1612 -#endif
1613 -
1614  typedef struct {
1615         volatile unsigned int lock;
1616  } arch_spinlock_t;
1617 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sh/kernel/irq.c linux-4.14/arch/sh/kernel/irq.c
1618 --- linux-4.14.orig/arch/sh/kernel/irq.c        2017-11-12 19:46:13.000000000 +0100
1619 +++ linux-4.14/arch/sh/kernel/irq.c     2018-09-05 11:05:07.000000000 +0200
1620 @@ -148,6 +148,7 @@
1621         hardirq_ctx[cpu] = NULL;
1622  }
1623  
1624 +#ifndef CONFIG_PREEMPT_RT_FULL
1625  void do_softirq_own_stack(void)
1626  {
1627         struct thread_info *curctx;
1628 @@ -175,6 +176,7 @@
1629                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1630         );
1631  }
1632 +#endif
1633  #else
1634  static inline void handle_one_irq(unsigned int irq)
1635  {
1636 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/Kconfig linux-4.14/arch/sparc/Kconfig
1637 --- linux-4.14.orig/arch/sparc/Kconfig  2017-11-12 19:46:13.000000000 +0100
1638 +++ linux-4.14/arch/sparc/Kconfig       2018-09-05 11:05:07.000000000 +0200
1639 @@ -206,12 +206,10 @@
1640  source kernel/Kconfig.hz
1641  
1642  config RWSEM_GENERIC_SPINLOCK
1643 -       bool
1644 -       default y if SPARC32
1645 +       def_bool PREEMPT_RT_FULL
1646  
1647  config RWSEM_XCHGADD_ALGORITHM
1648 -       bool
1649 -       default y if SPARC64
1650 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1651  
1652  config GENERIC_HWEIGHT
1653         bool
1654 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/sparc/kernel/irq_64.c linux-4.14/arch/sparc/kernel/irq_64.c
1655 --- linux-4.14.orig/arch/sparc/kernel/irq_64.c  2017-11-12 19:46:13.000000000 +0100
1656 +++ linux-4.14/arch/sparc/kernel/irq_64.c       2018-09-05 11:05:07.000000000 +0200
1657 @@ -855,6 +855,7 @@
1658         set_irq_regs(old_regs);
1659  }
1660  
1661 +#ifndef CONFIG_PREEMPT_RT_FULL
1662  void do_softirq_own_stack(void)
1663  {
1664         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1665 @@ -869,6 +870,7 @@
1666         __asm__ __volatile__("mov %0, %%sp"
1667                              : : "r" (orig_sp));
1668  }
1669 +#endif
1670  
1671  #ifdef CONFIG_HOTPLUG_CPU
1672  void fixup_irqs(void)
1673 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/setup.h linux-4.14/arch/tile/include/asm/setup.h
1674 --- linux-4.14.orig/arch/tile/include/asm/setup.h       2017-11-12 19:46:13.000000000 +0100
1675 +++ linux-4.14/arch/tile/include/asm/setup.h    2018-09-05 11:05:07.000000000 +0200
1676 @@ -49,7 +49,7 @@
1677  
1678  /* Hook hardwall code into changes in affinity. */
1679  #define arch_set_cpus_allowed(p, new_mask) do { \
1680 -       if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
1681 +       if (!cpumask_equal(p->cpus_ptr, new_mask)) \
1682                 hardwall_deactivate_all(p); \
1683  } while (0)
1684  #endif
1685 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/include/asm/spinlock_types.h linux-4.14/arch/tile/include/asm/spinlock_types.h
1686 --- linux-4.14.orig/arch/tile/include/asm/spinlock_types.h      2017-11-12 19:46:13.000000000 +0100
1687 +++ linux-4.14/arch/tile/include/asm/spinlock_types.h   2018-09-05 11:05:07.000000000 +0200
1688 @@ -15,10 +15,6 @@
1689  #ifndef _ASM_TILE_SPINLOCK_TYPES_H
1690  #define _ASM_TILE_SPINLOCK_TYPES_H
1691  
1692 -#ifndef __LINUX_SPINLOCK_TYPES_H
1693 -# error "please don't include this file directly"
1694 -#endif
1695 -
1696  #ifdef __tilegx__
1697  
1698  /* Low 15 bits are "next"; high 15 bits are "current". */
1699 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/tile/kernel/hardwall.c linux-4.14/arch/tile/kernel/hardwall.c
1700 --- linux-4.14.orig/arch/tile/kernel/hardwall.c 2017-11-12 19:46:13.000000000 +0100
1701 +++ linux-4.14/arch/tile/kernel/hardwall.c      2018-09-05 11:05:07.000000000 +0200
1702 @@ -590,12 +590,12 @@
1703          * Get our affinity; if we're not bound to this tile uniquely,
1704          * we can't access the network registers.
1705          */
1706 -       if (cpumask_weight(&p->cpus_allowed) != 1)
1707 +       if (p->nr_cpus_allowed != 1)
1708                 return -EPERM;
1709  
1710         /* Make sure we are bound to a cpu assigned to this resource. */
1711         cpu = smp_processor_id();
1712 -       BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
1713 +       BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
1714         if (!cpumask_test_cpu(cpu, &info->cpumask))
1715                 return -EINVAL;
1716  
1717 @@ -621,17 +621,17 @@
1718   * Deactivate a task's hardwall.  Must hold lock for hardwall_type.
1719   * This method may be called from exit_thread(), so we don't want to
1720   * rely on too many fields of struct task_struct still being valid.
1721 - * We assume the cpus_allowed, pid, and comm fields are still valid.
1722 + * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
1723   */
1724  static void _hardwall_deactivate(struct hardwall_type *hwt,
1725                                  struct task_struct *task)
1726  {
1727         struct thread_struct *ts = &task->thread;
1728  
1729 -       if (cpumask_weight(&task->cpus_allowed) != 1) {
1730 +       if (task->nr_cpus_allowed != 1) {
1731                 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
1732                        task->pid, task->comm, hwt->name,
1733 -                      cpumask_weight(&task->cpus_allowed));
1734 +                      task->nr_cpus_allowed);
1735                 BUG();
1736         }
1737  
1738 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c linux-4.14/arch/x86/crypto/aesni-intel_glue.c
1739 --- linux-4.14.orig/arch/x86/crypto/aesni-intel_glue.c  2018-09-05 11:03:20.000000000 +0200
1740 +++ linux-4.14/arch/x86/crypto/aesni-intel_glue.c       2018-09-05 11:05:07.000000000 +0200
1741 @@ -387,14 +387,14 @@
1742  
1743         err = skcipher_walk_virt(&walk, req, true);
1744  
1745 -       kernel_fpu_begin();
1746         while ((nbytes = walk.nbytes)) {
1747 +               kernel_fpu_begin();
1748                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1749                               nbytes & AES_BLOCK_MASK);
1750 +               kernel_fpu_end();
1751                 nbytes &= AES_BLOCK_SIZE - 1;
1752                 err = skcipher_walk_done(&walk, nbytes);
1753         }
1754 -       kernel_fpu_end();
1755  
1756         return err;
1757  }
1758 @@ -409,14 +409,14 @@
1759  
1760         err = skcipher_walk_virt(&walk, req, true);
1761  
1762 -       kernel_fpu_begin();
1763         while ((nbytes = walk.nbytes)) {
1764 +               kernel_fpu_begin();
1765                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1766                               nbytes & AES_BLOCK_MASK);
1767 +               kernel_fpu_end();
1768                 nbytes &= AES_BLOCK_SIZE - 1;
1769                 err = skcipher_walk_done(&walk, nbytes);
1770         }
1771 -       kernel_fpu_end();
1772  
1773         return err;
1774  }
1775 @@ -431,14 +431,14 @@
1776  
1777         err = skcipher_walk_virt(&walk, req, true);
1778  
1779 -       kernel_fpu_begin();
1780         while ((nbytes = walk.nbytes)) {
1781 +               kernel_fpu_begin();
1782                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1783                               nbytes & AES_BLOCK_MASK, walk.iv);
1784 +               kernel_fpu_end();
1785                 nbytes &= AES_BLOCK_SIZE - 1;
1786                 err = skcipher_walk_done(&walk, nbytes);
1787         }
1788 -       kernel_fpu_end();
1789  
1790         return err;
1791  }
1792 @@ -453,14 +453,14 @@
1793  
1794         err = skcipher_walk_virt(&walk, req, true);
1795  
1796 -       kernel_fpu_begin();
1797         while ((nbytes = walk.nbytes)) {
1798 +               kernel_fpu_begin();
1799                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1800                               nbytes & AES_BLOCK_MASK, walk.iv);
1801 +               kernel_fpu_end();
1802                 nbytes &= AES_BLOCK_SIZE - 1;
1803                 err = skcipher_walk_done(&walk, nbytes);
1804         }
1805 -       kernel_fpu_end();
1806  
1807         return err;
1808  }
1809 @@ -510,18 +510,20 @@
1810  
1811         err = skcipher_walk_virt(&walk, req, true);
1812  
1813 -       kernel_fpu_begin();
1814         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1815 +               kernel_fpu_begin();
1816                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1817                                       nbytes & AES_BLOCK_MASK, walk.iv);
1818 +               kernel_fpu_end();
1819                 nbytes &= AES_BLOCK_SIZE - 1;
1820                 err = skcipher_walk_done(&walk, nbytes);
1821         }
1822         if (walk.nbytes) {
1823 +               kernel_fpu_begin();
1824                 ctr_crypt_final(ctx, &walk);
1825 +               kernel_fpu_end();
1826                 err = skcipher_walk_done(&walk, 0);
1827         }
1828 -       kernel_fpu_end();
1829  
1830         return err;
1831  }
1832 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c
1833 --- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx2_glue.c  2017-11-12 19:46:13.000000000 +0100
1834 +++ linux-4.14/arch/x86/crypto/camellia_aesni_avx2_glue.c       2018-09-05 11:05:07.000000000 +0200
1835 @@ -206,6 +206,20 @@
1836         bool fpu_enabled;
1837  };
1838  
1839 +#ifdef CONFIG_PREEMPT_RT_FULL
1840 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1841 +{
1842 +       bool fpu_enabled = ctx->fpu_enabled;
1843 +
1844 +       if (!fpu_enabled)
1845 +               return;
1846 +       camellia_fpu_end(fpu_enabled);
1847 +       ctx->fpu_enabled = false;
1848 +}
1849 +#else
1850 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1851 +#endif
1852 +
1853  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1854  {
1855         const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1856 @@ -221,16 +235,19 @@
1857         }
1858  
1859         if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1860 +               kernel_fpu_resched();
1861                 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
1862                 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1863                 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1864         }
1865  
1866         while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1867 +               kernel_fpu_resched();
1868                 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1869                 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1870                 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1871         }
1872 +       camellia_fpu_end_rt(ctx);
1873  
1874         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1875                 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1876 @@ -251,16 +268,19 @@
1877         }
1878  
1879         if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
1880 +               kernel_fpu_resched();
1881                 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
1882                 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1883                 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
1884         }
1885  
1886         while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1887 +               kernel_fpu_resched();
1888                 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1889                 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1890                 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1891         }
1892 +       camellia_fpu_end_rt(ctx);
1893  
1894         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1895                 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
1896 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c
1897 --- linux-4.14.orig/arch/x86/crypto/camellia_aesni_avx_glue.c   2017-11-12 19:46:13.000000000 +0100
1898 +++ linux-4.14/arch/x86/crypto/camellia_aesni_avx_glue.c        2018-09-05 11:05:07.000000000 +0200
1899 @@ -210,6 +210,21 @@
1900         bool fpu_enabled;
1901  };
1902  
1903 +#ifdef CONFIG_PREEMPT_RT_FULL
1904 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
1905 +{
1906 +       bool fpu_enabled = ctx->fpu_enabled;
1907 +
1908 +       if (!fpu_enabled)
1909 +               return;
1910 +       camellia_fpu_end(fpu_enabled);
1911 +       ctx->fpu_enabled = false;
1912 +}
1913 +
1914 +#else
1915 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
1916 +#endif
1917 +
1918  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1919  {
1920         const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1921 @@ -225,10 +240,12 @@
1922         }
1923  
1924         while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1925 +               kernel_fpu_resched();
1926                 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
1927                 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1928                 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1929         }
1930 +       camellia_fpu_end_rt(ctx);
1931  
1932         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1933                 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
1934 @@ -249,10 +266,12 @@
1935         }
1936  
1937         while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
1938 +               kernel_fpu_resched();
1939                 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
1940                 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
1941                 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
1942         }
1943 +       camellia_fpu_end_rt(ctx);
1944  
1945         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1946                 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
1947 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c linux-4.14/arch/x86/crypto/cast5_avx_glue.c
1948 --- linux-4.14.orig/arch/x86/crypto/cast5_avx_glue.c    2018-09-05 11:03:20.000000000 +0200
1949 +++ linux-4.14/arch/x86/crypto/cast5_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
1950 @@ -59,7 +59,7 @@
1951  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1952                      bool enc)
1953  {
1954 -       bool fpu_enabled = false;
1955 +       bool fpu_enabled;
1956         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1957         const unsigned int bsize = CAST5_BLOCK_SIZE;
1958         unsigned int nbytes;
1959 @@ -73,7 +73,7 @@
1960                 u8 *wsrc = walk->src.virt.addr;
1961                 u8 *wdst = walk->dst.virt.addr;
1962  
1963 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1964 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1965  
1966                 /* Process multi-block batch */
1967                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1968 @@ -102,10 +102,9 @@
1969                 } while (nbytes >= bsize);
1970  
1971  done:
1972 +               cast5_fpu_end(fpu_enabled);
1973                 err = blkcipher_walk_done(desc, walk, nbytes);
1974         }
1975 -
1976 -       cast5_fpu_end(fpu_enabled);
1977         return err;
1978  }
1979  
1980 @@ -226,7 +225,7 @@
1981  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1982                        struct scatterlist *src, unsigned int nbytes)
1983  {
1984 -       bool fpu_enabled = false;
1985 +       bool fpu_enabled;
1986         struct blkcipher_walk walk;
1987         int err;
1988  
1989 @@ -235,12 +234,11 @@
1990         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1991  
1992         while ((nbytes = walk.nbytes)) {
1993 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1994 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1995                 nbytes = __cbc_decrypt(desc, &walk);
1996 +               cast5_fpu_end(fpu_enabled);
1997                 err = blkcipher_walk_done(desc, &walk, nbytes);
1998         }
1999 -
2000 -       cast5_fpu_end(fpu_enabled);
2001         return err;
2002  }
2003  
2004 @@ -309,7 +307,7 @@
2005  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2006                      struct scatterlist *src, unsigned int nbytes)
2007  {
2008 -       bool fpu_enabled = false;
2009 +       bool fpu_enabled;
2010         struct blkcipher_walk walk;
2011         int err;
2012  
2013 @@ -318,13 +316,12 @@
2014         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2015  
2016         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2017 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2018 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2019                 nbytes = __ctr_crypt(desc, &walk);
2020 +               cast5_fpu_end(fpu_enabled);
2021                 err = blkcipher_walk_done(desc, &walk, nbytes);
2022         }
2023  
2024 -       cast5_fpu_end(fpu_enabled);
2025 -
2026         if (walk.nbytes) {
2027                 ctr_crypt_final(desc, &walk);
2028                 err = blkcipher_walk_done(desc, &walk, 0);
2029 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c linux-4.14/arch/x86/crypto/cast6_avx_glue.c
2030 --- linux-4.14.orig/arch/x86/crypto/cast6_avx_glue.c    2017-11-12 19:46:13.000000000 +0100
2031 +++ linux-4.14/arch/x86/crypto/cast6_avx_glue.c 2018-09-05 11:05:07.000000000 +0200
2032 @@ -205,19 +205,33 @@
2033         bool fpu_enabled;
2034  };
2035  
2036 +#ifdef CONFIG_PREEMPT_RT_FULL
2037 +static void cast6_fpu_end_rt(struct crypt_priv *ctx)
2038 +{
2039 +       bool fpu_enabled = ctx->fpu_enabled;
2040 +
2041 +       if (!fpu_enabled)
2042 +               return;
2043 +       cast6_fpu_end(fpu_enabled);
2044 +       ctx->fpu_enabled = false;
2045 +}
2046 +
2047 +#else
2048 +static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
2049 +#endif
2050 +
2051  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2052  {
2053         const unsigned int bsize = CAST6_BLOCK_SIZE;
2054         struct crypt_priv *ctx = priv;
2055         int i;
2056  
2057 -       ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2058 -
2059         if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2060 +               ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2061                 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2062 +               cast6_fpu_end_rt(ctx);
2063                 return;
2064         }
2065 -
2066         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2067                 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
2068  }
2069 @@ -228,10 +242,10 @@
2070         struct crypt_priv *ctx = priv;
2071         int i;
2072  
2073 -       ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2074 -
2075         if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
2076 +               ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
2077                 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2078 +               cast6_fpu_end_rt(ctx);
2079                 return;
2080         }
2081  
2082 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/chacha20_glue.c linux-4.14/arch/x86/crypto/chacha20_glue.c
2083 --- linux-4.14.orig/arch/x86/crypto/chacha20_glue.c     2017-11-12 19:46:13.000000000 +0100
2084 +++ linux-4.14/arch/x86/crypto/chacha20_glue.c  2018-09-05 11:05:07.000000000 +0200
2085 @@ -81,23 +81,24 @@
2086  
2087         crypto_chacha20_init(state, ctx, walk.iv);
2088  
2089 -       kernel_fpu_begin();
2090 -
2091         while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
2092 +               kernel_fpu_begin();
2093 +
2094                 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2095                                 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
2096 +               kernel_fpu_end();
2097                 err = skcipher_walk_done(&walk,
2098                                          walk.nbytes % CHACHA20_BLOCK_SIZE);
2099         }
2100  
2101         if (walk.nbytes) {
2102 +               kernel_fpu_begin();
2103                 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
2104                                 walk.nbytes);
2105 +               kernel_fpu_end();
2106                 err = skcipher_walk_done(&walk, 0);
2107         }
2108  
2109 -       kernel_fpu_end();
2110 -
2111         return err;
2112  }
2113  
2114 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/glue_helper.c linux-4.14/arch/x86/crypto/glue_helper.c
2115 --- linux-4.14.orig/arch/x86/crypto/glue_helper.c       2017-11-12 19:46:13.000000000 +0100
2116 +++ linux-4.14/arch/x86/crypto/glue_helper.c    2018-09-05 11:05:07.000000000 +0200
2117 @@ -40,7 +40,7 @@
2118         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2119         const unsigned int bsize = 128 / 8;
2120         unsigned int nbytes, i, func_bytes;
2121 -       bool fpu_enabled = false;
2122 +       bool fpu_enabled;
2123         int err;
2124  
2125         err = blkcipher_walk_virt(desc, walk);
2126 @@ -50,7 +50,7 @@
2127                 u8 *wdst = walk->dst.virt.addr;
2128  
2129                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2130 -                                            desc, fpu_enabled, nbytes);
2131 +                                            desc, false, nbytes);
2132  
2133                 for (i = 0; i < gctx->num_funcs; i++) {
2134                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2135 @@ -72,10 +72,10 @@
2136                 }
2137  
2138  done:
2139 +               glue_fpu_end(fpu_enabled);
2140                 err = blkcipher_walk_done(desc, walk, nbytes);
2141         }
2142  
2143 -       glue_fpu_end(fpu_enabled);
2144         return err;
2145  }
2146  
2147 @@ -192,7 +192,7 @@
2148                             struct scatterlist *src, unsigned int nbytes)
2149  {
2150         const unsigned int bsize = 128 / 8;
2151 -       bool fpu_enabled = false;
2152 +       bool fpu_enabled;
2153         struct blkcipher_walk walk;
2154         int err;
2155  
2156 @@ -201,12 +201,12 @@
2157  
2158         while ((nbytes = walk.nbytes)) {
2159                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2160 -                                            desc, fpu_enabled, nbytes);
2161 +                                            desc, false, nbytes);
2162                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2163 +               glue_fpu_end(fpu_enabled);
2164                 err = blkcipher_walk_done(desc, &walk, nbytes);
2165         }
2166  
2167 -       glue_fpu_end(fpu_enabled);
2168         return err;
2169  }
2170  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2171 @@ -275,7 +275,7 @@
2172                           struct scatterlist *src, unsigned int nbytes)
2173  {
2174         const unsigned int bsize = 128 / 8;
2175 -       bool fpu_enabled = false;
2176 +       bool fpu_enabled;
2177         struct blkcipher_walk walk;
2178         int err;
2179  
2180 @@ -284,13 +284,12 @@
2181  
2182         while ((nbytes = walk.nbytes) >= bsize) {
2183                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2184 -                                            desc, fpu_enabled, nbytes);
2185 +                                            desc, false, nbytes);
2186                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2187 +               glue_fpu_end(fpu_enabled);
2188                 err = blkcipher_walk_done(desc, &walk, nbytes);
2189         }
2190  
2191 -       glue_fpu_end(fpu_enabled);
2192 -
2193         if (walk.nbytes) {
2194                 glue_ctr_crypt_final_128bit(
2195                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2196 @@ -380,7 +379,7 @@
2197                           void *tweak_ctx, void *crypt_ctx)
2198  {
2199         const unsigned int bsize = 128 / 8;
2200 -       bool fpu_enabled = false;
2201 +       bool fpu_enabled;
2202         struct blkcipher_walk walk;
2203         int err;
2204  
2205 @@ -393,21 +392,21 @@
2206  
2207         /* set minimum length to bsize, for tweak_fn */
2208         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2209 -                                    desc, fpu_enabled,
2210 +                                    desc, false,
2211                                      nbytes < bsize ? bsize : nbytes);
2212 -
2213         /* calculate first value of T */
2214         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2215 +       glue_fpu_end(fpu_enabled);
2216  
2217         while (nbytes) {
2218 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2219 +                               desc, false, nbytes);
2220                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2221  
2222 +               glue_fpu_end(fpu_enabled);
2223                 err = blkcipher_walk_done(desc, &walk, nbytes);
2224                 nbytes = walk.nbytes;
2225         }
2226 -
2227 -       glue_fpu_end(fpu_enabled);
2228 -
2229         return err;
2230  }
2231  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2232 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c linux-4.14/arch/x86/crypto/serpent_avx2_glue.c
2233 --- linux-4.14.orig/arch/x86/crypto/serpent_avx2_glue.c 2017-11-12 19:46:13.000000000 +0100
2234 +++ linux-4.14/arch/x86/crypto/serpent_avx2_glue.c      2018-09-05 11:05:07.000000000 +0200
2235 @@ -184,6 +184,21 @@
2236         bool fpu_enabled;
2237  };
2238  
2239 +#ifdef CONFIG_PREEMPT_RT_FULL
2240 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2241 +{
2242 +       bool fpu_enabled = ctx->fpu_enabled;
2243 +
2244 +       if (!fpu_enabled)
2245 +               return;
2246 +       serpent_fpu_end(fpu_enabled);
2247 +       ctx->fpu_enabled = false;
2248 +}
2249 +
2250 +#else
2251 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2252 +#endif
2253 +
2254  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2255  {
2256         const unsigned int bsize = SERPENT_BLOCK_SIZE;
2257 @@ -199,10 +214,12 @@
2258         }
2259  
2260         while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2261 +               kernel_fpu_resched();
2262                 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2263                 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2264                 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2265         }
2266 +       serpent_fpu_end_rt(ctx);
2267  
2268         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2269                 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
2270 @@ -223,10 +240,12 @@
2271         }
2272  
2273         while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
2274 +               kernel_fpu_resched();
2275                 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2276                 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
2277                 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
2278         }
2279 +       serpent_fpu_end_rt(ctx);
2280  
2281         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2282                 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
2283 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c linux-4.14/arch/x86/crypto/serpent_avx_glue.c
2284 --- linux-4.14.orig/arch/x86/crypto/serpent_avx_glue.c  2017-11-12 19:46:13.000000000 +0100
2285 +++ linux-4.14/arch/x86/crypto/serpent_avx_glue.c       2018-09-05 11:05:07.000000000 +0200
2286 @@ -218,16 +218,31 @@
2287         bool fpu_enabled;
2288  };
2289  
2290 +#ifdef CONFIG_PREEMPT_RT_FULL
2291 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2292 +{
2293 +       bool fpu_enabled = ctx->fpu_enabled;
2294 +
2295 +       if (!fpu_enabled)
2296 +               return;
2297 +       serpent_fpu_end(fpu_enabled);
2298 +       ctx->fpu_enabled = false;
2299 +}
2300 +
2301 +#else
2302 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2303 +#endif
2304 +
2305  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2306  {
2307         const unsigned int bsize = SERPENT_BLOCK_SIZE;
2308         struct crypt_priv *ctx = priv;
2309         int i;
2310  
2311 -       ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2312 -
2313         if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2314 +               ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2315                 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
2316 +               serpent_fpu_end_rt(ctx);
2317                 return;
2318         }
2319  
2320 @@ -241,10 +256,10 @@
2321         struct crypt_priv *ctx = priv;
2322         int i;
2323  
2324 -       ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2325 -
2326         if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2327 +               ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2328                 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
2329 +               serpent_fpu_end_rt(ctx);
2330                 return;
2331         }
2332  
2333 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c linux-4.14/arch/x86/crypto/serpent_sse2_glue.c
2334 --- linux-4.14.orig/arch/x86/crypto/serpent_sse2_glue.c 2017-11-12 19:46:13.000000000 +0100
2335 +++ linux-4.14/arch/x86/crypto/serpent_sse2_glue.c      2018-09-05 11:05:07.000000000 +0200
2336 @@ -187,16 +187,31 @@
2337         bool fpu_enabled;
2338  };
2339  
2340 +#ifdef CONFIG_PREEMPT_RT_FULL
2341 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
2342 +{
2343 +       bool fpu_enabled = ctx->fpu_enabled;
2344 +
2345 +       if (!fpu_enabled)
2346 +               return;
2347 +       serpent_fpu_end(fpu_enabled);
2348 +       ctx->fpu_enabled = false;
2349 +}
2350 +
2351 +#else
2352 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
2353 +#endif
2354 +
2355  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2356  {
2357         const unsigned int bsize = SERPENT_BLOCK_SIZE;
2358         struct crypt_priv *ctx = priv;
2359         int i;
2360  
2361 -       ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2362 -
2363         if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2364 +               ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2365                 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
2366 +               serpent_fpu_end_rt(ctx);
2367                 return;
2368         }
2369  
2370 @@ -210,10 +225,10 @@
2371         struct crypt_priv *ctx = priv;
2372         int i;
2373  
2374 -       ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2375 -
2376         if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
2377 +               ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
2378                 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
2379 +               serpent_fpu_end_rt(ctx);
2380                 return;
2381         }
2382  
2383 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c linux-4.14/arch/x86/crypto/twofish_avx_glue.c
2384 --- linux-4.14.orig/arch/x86/crypto/twofish_avx_glue.c  2017-11-12 19:46:13.000000000 +0100
2385 +++ linux-4.14/arch/x86/crypto/twofish_avx_glue.c       2018-09-05 11:05:07.000000000 +0200
2386 @@ -218,6 +218,21 @@
2387         bool fpu_enabled;
2388  };
2389  
2390 +#ifdef CONFIG_PREEMPT_RT_FULL
2391 +static void twofish_fpu_end_rt(struct crypt_priv *ctx)
2392 +{
2393 +       bool fpu_enabled = ctx->fpu_enabled;
2394 +
2395 +       if (!fpu_enabled)
2396 +               return;
2397 +       twofish_fpu_end(fpu_enabled);
2398 +       ctx->fpu_enabled = false;
2399 +}
2400 +
2401 +#else
2402 +static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
2403 +#endif
2404 +
2405  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
2406  {
2407         const unsigned int bsize = TF_BLOCK_SIZE;
2408 @@ -228,12 +243,16 @@
2409  
2410         if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2411                 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
2412 +               twofish_fpu_end_rt(ctx);
2413                 return;
2414         }
2415  
2416 -       for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2417 +       for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2418 +               kernel_fpu_resched();
2419                 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
2420 +       }
2421  
2422 +       twofish_fpu_end_rt(ctx);
2423         nbytes %= bsize * 3;
2424  
2425         for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
2426 @@ -250,11 +269,15 @@
2427  
2428         if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
2429                 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
2430 +               twofish_fpu_end_rt(ctx);
2431                 return;
2432         }
2433  
2434 -       for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
2435 +       for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
2436 +               kernel_fpu_resched();
2437                 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
2438 +       }
2439 +       twofish_fpu_end_rt(ctx);
2440  
2441         nbytes %= bsize * 3;
2442  
2443 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/common.c linux-4.14/arch/x86/entry/common.c
2444 --- linux-4.14.orig/arch/x86/entry/common.c     2018-09-05 11:03:20.000000000 +0200
2445 +++ linux-4.14/arch/x86/entry/common.c  2018-09-05 11:05:07.000000000 +0200
2446 @@ -133,7 +133,7 @@
2447  
2448  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2449         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2450 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2451 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
2452  
2453  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2454  {
2455 @@ -148,9 +148,16 @@
2456                 /* We have work to do. */
2457                 local_irq_enable();
2458  
2459 -               if (cached_flags & _TIF_NEED_RESCHED)
2460 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2461                         schedule();
2462  
2463 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2464 +               if (unlikely(current->forced_info.si_signo)) {
2465 +                       struct task_struct *t = current;
2466 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2467 +                       t->forced_info.si_signo = 0;
2468 +               }
2469 +#endif
2470                 if (cached_flags & _TIF_UPROBE)
2471                         uprobe_notify_resume(regs);
2472  
2473 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_32.S linux-4.14/arch/x86/entry/entry_32.S
2474 --- linux-4.14.orig/arch/x86/entry/entry_32.S   2018-09-05 11:03:20.000000000 +0200
2475 +++ linux-4.14/arch/x86/entry/entry_32.S        2018-09-05 11:05:07.000000000 +0200
2476 @@ -350,8 +350,25 @@
2477  ENTRY(resume_kernel)
2478         DISABLE_INTERRUPTS(CLBR_ANY)
2479  .Lneed_resched:
2480 +       # preempt count == 0 + NEED_RS set?
2481         cmpl    $0, PER_CPU_VAR(__preempt_count)
2482 +#ifndef CONFIG_PREEMPT_LAZY
2483         jnz     restore_all
2484 +#else
2485 +       jz test_int_off
2486 +
2487 +       # at least preempt count == 0 ?
2488 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2489 +       jne restore_all
2490 +
2491 +       movl    PER_CPU_VAR(current_task), %ebp
2492 +       cmpl    $0,TASK_TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2493 +       jnz     restore_all
2494 +
2495 +       testl   $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2496 +       jz      restore_all
2497 +test_int_off:
2498 +#endif
2499         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2500         jz      restore_all
2501         call    preempt_schedule_irq
2502 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/entry/entry_64.S linux-4.14/arch/x86/entry/entry_64.S
2503 --- linux-4.14.orig/arch/x86/entry/entry_64.S   2018-09-05 11:03:20.000000000 +0200
2504 +++ linux-4.14/arch/x86/entry/entry_64.S        2018-09-05 11:05:07.000000000 +0200
2505 @@ -633,7 +633,23 @@
2506         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2507         jnc     1f
2508  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2509 +#ifndef CONFIG_PREEMPT_LAZY
2510 +       jnz     1f
2511 +#else
2512 +       jz      do_preempt_schedule_irq
2513 +
2514 +       # at least preempt count == 0 ?
2515 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2516 +       jnz     1f
2517 +
2518 +       movq    PER_CPU_VAR(current_task), %rcx
2519 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2520         jnz     1f
2521 +
2522 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2523 +       jnc     1f
2524 +do_preempt_schedule_irq:
2525 +#endif
2526         call    preempt_schedule_irq
2527         jmp     0b
2528  1:
2529 @@ -988,6 +1004,7 @@
2530         jmp     2b
2531         .previous
2532  
2533 +#ifndef CONFIG_PREEMPT_RT_FULL
2534  /* Call softirq on interrupt stack. Interrupts are off. */
2535  ENTRY(do_softirq_own_stack)
2536         pushq   %rbp
2537 @@ -998,6 +1015,7 @@
2538         leaveq
2539         ret
2540  ENDPROC(do_softirq_own_stack)
2541 +#endif
2542  
2543  #ifdef CONFIG_XEN
2544  idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2545 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/fpu/api.h linux-4.14/arch/x86/include/asm/fpu/api.h
2546 --- linux-4.14.orig/arch/x86/include/asm/fpu/api.h      2017-11-12 19:46:13.000000000 +0100
2547 +++ linux-4.14/arch/x86/include/asm/fpu/api.h   2018-09-05 11:05:07.000000000 +0200
2548 @@ -25,6 +25,7 @@
2549  extern void __kernel_fpu_end(void);
2550  extern void kernel_fpu_begin(void);
2551  extern void kernel_fpu_end(void);
2552 +extern void kernel_fpu_resched(void);
2553  extern bool irq_fpu_usable(void);
2554  
2555  /*
2556 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/preempt.h linux-4.14/arch/x86/include/asm/preempt.h
2557 --- linux-4.14.orig/arch/x86/include/asm/preempt.h      2017-11-12 19:46:13.000000000 +0100
2558 +++ linux-4.14/arch/x86/include/asm/preempt.h   2018-09-05 11:05:07.000000000 +0200
2559 @@ -86,17 +86,46 @@
2560   * a decrement which hits zero means we have no preempt_count and should
2561   * reschedule.
2562   */
2563 -static __always_inline bool __preempt_count_dec_and_test(void)
2564 +static __always_inline bool ____preempt_count_dec_and_test(void)
2565  {
2566         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2567  }
2568  
2569 +static __always_inline bool __preempt_count_dec_and_test(void)
2570 +{
2571 +       if (____preempt_count_dec_and_test())
2572 +               return true;
2573 +#ifdef CONFIG_PREEMPT_LAZY
2574 +       if (current_thread_info()->preempt_lazy_count)
2575 +               return false;
2576 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2577 +#else
2578 +       return false;
2579 +#endif
2580 +}
2581 +
2582  /*
2583   * Returns true when we need to resched and can (barring IRQ state).
2584   */
2585  static __always_inline bool should_resched(int preempt_offset)
2586  {
2587 +#ifdef CONFIG_PREEMPT_LAZY
2588 +       u32 tmp;
2589 +
2590 +       tmp = raw_cpu_read_4(__preempt_count);
2591 +       if (tmp == preempt_offset)
2592 +               return true;
2593 +
2594 +       /* preempt count == 0 ? */
2595 +       tmp &= ~PREEMPT_NEED_RESCHED;
2596 +       if (tmp)
2597 +               return false;
2598 +       if (current_thread_info()->preempt_lazy_count)
2599 +               return false;
2600 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2601 +#else
2602         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2603 +#endif
2604  }
2605  
2606  #ifdef CONFIG_PREEMPT
2607 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/signal.h linux-4.14/arch/x86/include/asm/signal.h
2608 --- linux-4.14.orig/arch/x86/include/asm/signal.h       2017-11-12 19:46:13.000000000 +0100
2609 +++ linux-4.14/arch/x86/include/asm/signal.h    2018-09-05 11:05:07.000000000 +0200
2610 @@ -28,6 +28,19 @@
2611  #define SA_IA32_ABI    0x02000000u
2612  #define SA_X32_ABI     0x01000000u
2613  
2614 +/*
2615 + * Because some traps use the IST stack, we must keep preemption
2616 + * disabled while calling do_trap(), but do_trap() may call
2617 + * force_sig_info() which will grab the signal spin_locks for the
2618 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2619 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2620 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2621 + * trap.
2622 + */
2623 +#if defined(CONFIG_PREEMPT_RT_FULL)
2624 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2625 +#endif
2626 +
2627  #ifndef CONFIG_COMPAT
2628  typedef sigset_t compat_sigset_t;
2629  #endif
2630 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/stackprotector.h linux-4.14/arch/x86/include/asm/stackprotector.h
2631 --- linux-4.14.orig/arch/x86/include/asm/stackprotector.h       2017-11-12 19:46:13.000000000 +0100
2632 +++ linux-4.14/arch/x86/include/asm/stackprotector.h    2018-09-05 11:05:07.000000000 +0200
2633 @@ -60,7 +60,7 @@
2634   */
2635  static __always_inline void boot_init_stack_canary(void)
2636  {
2637 -       u64 canary;
2638 +       u64 uninitialized_var(canary);
2639         u64 tsc;
2640  
2641  #ifdef CONFIG_X86_64
2642 @@ -71,8 +71,14 @@
2643          * of randomness. The TSC only matters for very early init,
2644          * there it already has some randomness on most systems. Later
2645          * on during the bootup the random pool has true entropy too.
2646 +        * For preempt-rt we need to weaken the randomness a bit, as
2647 +        * we can't call into the random generator from atomic context
2648 +        * due to locking constraints. We just leave canary
2649 +        * uninitialized and use the TSC based randomness on top of it.
2650          */
2651 +#ifndef CONFIG_PREEMPT_RT_FULL
2652         get_random_bytes(&canary, sizeof(canary));
2653 +#endif
2654         tsc = rdtsc();
2655         canary += tsc + (tsc << 32UL);
2656         canary &= CANARY_MASK;
2657 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/thread_info.h linux-4.14/arch/x86/include/asm/thread_info.h
2658 --- linux-4.14.orig/arch/x86/include/asm/thread_info.h  2018-09-05 11:03:20.000000000 +0200
2659 +++ linux-4.14/arch/x86/include/asm/thread_info.h       2018-09-05 11:05:07.000000000 +0200
2660 @@ -56,11 +56,14 @@
2661  struct thread_info {
2662         unsigned long           flags;          /* low level flags */
2663         u32                     status;         /* thread synchronous flags */
2664 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2665 +                                                         <0 => BUG */
2666  };
2667  
2668  #define INIT_THREAD_INFO(tsk)                  \
2669  {                                              \
2670         .flags          = 0,                    \
2671 +       .preempt_lazy_count = 0,                \
2672  }
2673  
2674  #define init_stack             (init_thread_union.stack)
2675 @@ -69,6 +72,10 @@
2676  
2677  #include <asm/asm-offsets.h>
2678  
2679 +#define GET_THREAD_INFO(reg) \
2680 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2681 +       _ASM_SUB $(THREAD_SIZE),reg ;
2682 +
2683  #endif
2684  
2685  /*
2686 @@ -85,6 +92,7 @@
2687  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2688  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2689  #define TIF_SECCOMP            8       /* secure computing */
2690 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2691  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2692  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2693  #define TIF_PATCH_PENDING      13      /* pending live patching update */
2694 @@ -112,6 +120,7 @@
2695  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2696  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2697  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2698 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2699  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2700  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2701  #define _TIF_PATCH_PENDING     (1 << TIF_PATCH_PENDING)
2702 @@ -153,6 +162,8 @@
2703  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2704  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2705  
2706 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2707 +
2708  #define STACK_WARN             (THREAD_SIZE/8)
2709  
2710  /*
2711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h linux-4.14/arch/x86/include/asm/uv/uv_bau.h
2712 --- linux-4.14.orig/arch/x86/include/asm/uv/uv_bau.h    2017-11-12 19:46:13.000000000 +0100
2713 +++ linux-4.14/arch/x86/include/asm/uv/uv_bau.h 2018-09-05 11:05:07.000000000 +0200
2714 @@ -643,9 +643,9 @@
2715         cycles_t                send_message;
2716         cycles_t                period_end;
2717         cycles_t                period_time;
2718 -       spinlock_t              uvhub_lock;
2719 -       spinlock_t              queue_lock;
2720 -       spinlock_t              disable_lock;
2721 +       raw_spinlock_t          uvhub_lock;
2722 +       raw_spinlock_t          queue_lock;
2723 +       raw_spinlock_t          disable_lock;
2724         /* tunables */
2725         int                     max_concurr;
2726         int                     max_concurr_const;
2727 @@ -847,15 +847,15 @@
2728   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2729   * on equal.
2730   */
2731 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2732 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2733  {
2734 -       spin_lock(lock);
2735 +       raw_spin_lock(lock);
2736         if (atomic_read(v) >= u) {
2737 -               spin_unlock(lock);
2738 +               raw_spin_unlock(lock);
2739                 return 0;
2740         }
2741         atomic_inc(v);
2742 -       spin_unlock(lock);
2743 +       raw_spin_unlock(lock);
2744         return 1;
2745  }
2746  
2747 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/Kconfig linux-4.14/arch/x86/Kconfig
2748 --- linux-4.14.orig/arch/x86/Kconfig    2018-09-05 11:03:20.000000000 +0200
2749 +++ linux-4.14/arch/x86/Kconfig 2018-09-05 11:05:07.000000000 +0200
2750 @@ -169,6 +169,7 @@
2751         select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
2752         select HAVE_PERF_REGS
2753         select HAVE_PERF_USER_STACK_DUMP
2754 +       select HAVE_PREEMPT_LAZY
2755         select HAVE_RCU_TABLE_FREE
2756         select HAVE_REGS_AND_STACK_ACCESS_API
2757         select HAVE_RELIABLE_STACKTRACE         if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
2758 @@ -256,8 +257,11 @@
2759         def_bool y
2760         depends on ISA_DMA_API
2761  
2762 +config RWSEM_GENERIC_SPINLOCK
2763 +       def_bool PREEMPT_RT_FULL
2764 +
2765  config RWSEM_XCHGADD_ALGORITHM
2766 -       def_bool y
2767 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2768  
2769  config GENERIC_CALIBRATE_DELAY
2770         def_bool y
2771 @@ -932,7 +936,7 @@
2772  config MAXSMP
2773         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2774         depends on X86_64 && SMP && DEBUG_KERNEL
2775 -       select CPUMASK_OFFSTACK
2776 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2777         ---help---
2778           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2779           If unsure, say N.
2780 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/apic/io_apic.c linux-4.14/arch/x86/kernel/apic/io_apic.c
2781 --- linux-4.14.orig/arch/x86/kernel/apic/io_apic.c      2018-09-05 11:03:20.000000000 +0200
2782 +++ linux-4.14/arch/x86/kernel/apic/io_apic.c   2018-09-05 11:05:07.000000000 +0200
2783 @@ -1691,7 +1691,8 @@
2784  static inline bool ioapic_irqd_mask(struct irq_data *data)
2785  {
2786         /* If we are moving the irq we need to mask it */
2787 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2788 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2789 +                    !irqd_irq_inprogress(data))) {
2790                 mask_ioapic_irq(data);
2791                 return true;
2792         }
2793 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/asm-offsets.c linux-4.14/arch/x86/kernel/asm-offsets.c
2794 --- linux-4.14.orig/arch/x86/kernel/asm-offsets.c       2018-09-05 11:03:20.000000000 +0200
2795 +++ linux-4.14/arch/x86/kernel/asm-offsets.c    2018-09-05 11:05:07.000000000 +0200
2796 @@ -38,6 +38,7 @@
2797  
2798         BLANK();
2799         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2800 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2801         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2802  
2803         BLANK();
2804 @@ -94,6 +95,7 @@
2805  
2806         BLANK();
2807         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2808 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2809  
2810         /* TLB state for the entry code */
2811         OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
2812 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
2813 --- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c     2017-11-12 19:46:13.000000000 +0100
2814 +++ linux-4.14/arch/x86/kernel/cpu/mcheck/dev-mcelog.c  2018-09-05 11:05:07.000000000 +0200
2815 @@ -14,6 +14,7 @@
2816  #include <linux/slab.h>
2817  #include <linux/kmod.h>
2818  #include <linux/poll.h>
2819 +#include <linux/swork.h>
2820  
2821  #include "mce-internal.h"
2822  
2823 @@ -86,13 +87,43 @@
2824  
2825  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2826  
2827 -
2828 -void mce_work_trigger(void)
2829 +static void __mce_work_trigger(struct swork_event *event)
2830  {
2831         if (mce_helper[0])
2832                 schedule_work(&mce_trigger_work);
2833  }
2834  
2835 +#ifdef CONFIG_PREEMPT_RT_FULL
2836 +static bool notify_work_ready __read_mostly;
2837 +static struct swork_event notify_work;
2838 +
2839 +static int mce_notify_work_init(void)
2840 +{
2841 +       int err;
2842 +
2843 +       err = swork_get();
2844 +       if (err)
2845 +               return err;
2846 +
2847 +       INIT_SWORK(&notify_work, __mce_work_trigger);
2848 +       notify_work_ready = true;
2849 +       return 0;
2850 +}
2851 +
2852 +void mce_work_trigger(void)
2853 +{
2854 +       if (notify_work_ready)
2855 +               swork_queue(&notify_work);
2856 +}
2857 +
2858 +#else
2859 +void mce_work_trigger(void)
2860 +{
2861 +       __mce_work_trigger(NULL);
2862 +}
2863 +static inline int mce_notify_work_init(void) { return 0; }
2864 +#endif
2865 +
2866  static ssize_t
2867  show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2868  {
2869 @@ -356,7 +387,7 @@
2870  
2871                 return err;
2872         }
2873 -
2874 +       mce_notify_work_init();
2875         mce_register_decode_chain(&dev_mcelog_nb);
2876         return 0;
2877  }
2878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c
2879 --- linux-4.14.orig/arch/x86/kernel/cpu/mcheck/mce.c    2018-09-05 11:03:20.000000000 +0200
2880 +++ linux-4.14/arch/x86/kernel/cpu/mcheck/mce.c 2018-09-05 11:05:07.000000000 +0200
2881 @@ -42,6 +42,7 @@
2882  #include <linux/debugfs.h>
2883  #include <linux/irq_work.h>
2884  #include <linux/export.h>
2885 +#include <linux/jiffies.h>
2886  #include <linux/jump_label.h>
2887  
2888  #include <asm/intel-family.h>
2889 @@ -1365,7 +1366,7 @@
2890  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2891  
2892  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2893 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2894 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2895  
2896  static unsigned long mce_adjust_timer_default(unsigned long interval)
2897  {
2898 @@ -1374,27 +1375,19 @@
2899  
2900  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2901  
2902 -static void __start_timer(struct timer_list *t, unsigned long interval)
2903 +static void __start_timer(struct hrtimer *t, unsigned long iv)
2904  {
2905 -       unsigned long when = jiffies + interval;
2906 -       unsigned long flags;
2907 -
2908 -       local_irq_save(flags);
2909 -
2910 -       if (!timer_pending(t) || time_before(when, t->expires))
2911 -               mod_timer(t, round_jiffies(when));
2912 +       if (!iv)
2913 +               return;
2914  
2915 -       local_irq_restore(flags);
2916 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2917 +                              0, HRTIMER_MODE_REL_PINNED);
2918  }
2919  
2920 -static void mce_timer_fn(unsigned long data)
2921 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2922  {
2923 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2924 -       int cpu = smp_processor_id();
2925         unsigned long iv;
2926  
2927 -       WARN_ON(cpu != data);
2928 -
2929         iv = __this_cpu_read(mce_next_interval);
2930  
2931         if (mce_available(this_cpu_ptr(&cpu_info))) {
2932 @@ -1417,7 +1410,11 @@
2933  
2934  done:
2935         __this_cpu_write(mce_next_interval, iv);
2936 -       __start_timer(t, iv);
2937 +       if (!iv)
2938 +               return HRTIMER_NORESTART;
2939 +
2940 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
2941 +       return HRTIMER_RESTART;
2942  }
2943  
2944  /*
2945 @@ -1425,7 +1422,7 @@
2946   */
2947  void mce_timer_kick(unsigned long interval)
2948  {
2949 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2950 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2951         unsigned long iv = __this_cpu_read(mce_next_interval);
2952  
2953         __start_timer(t, interval);
2954 @@ -1440,7 +1437,7 @@
2955         int cpu;
2956  
2957         for_each_online_cpu(cpu)
2958 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2959 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2960  }
2961  
2962  /*
2963 @@ -1769,7 +1766,7 @@
2964         }
2965  }
2966  
2967 -static void mce_start_timer(struct timer_list *t)
2968 +static void mce_start_timer(struct hrtimer *t)
2969  {
2970         unsigned long iv = check_interval * HZ;
2971  
2972 @@ -1782,18 +1779,19 @@
2973  
2974  static void __mcheck_cpu_setup_timer(void)
2975  {
2976 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2977 -       unsigned int cpu = smp_processor_id();
2978 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2979  
2980 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2981 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2982 +       t->function = mce_timer_fn;
2983  }
2984  
2985  static void __mcheck_cpu_init_timer(void)
2986  {
2987 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2988 -       unsigned int cpu = smp_processor_id();
2989 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2990 +
2991 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2992 +       t->function = mce_timer_fn;
2993  
2994 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2995         mce_start_timer(t);
2996  }
2997  
2998 @@ -2309,7 +2307,7 @@
2999  
3000  static int mce_cpu_online(unsigned int cpu)
3001  {
3002 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3003 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3004         int ret;
3005  
3006         mce_device_create(cpu);
3007 @@ -2326,10 +2324,10 @@
3008  
3009  static int mce_cpu_pre_down(unsigned int cpu)
3010  {
3011 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3012 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3013  
3014         mce_disable_cpu();
3015 -       del_timer_sync(t);
3016 +       hrtimer_cancel(t);
3017         mce_threshold_remove_device(cpu);
3018         mce_device_remove(cpu);
3019         return 0;
3020 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/fpu/core.c linux-4.14/arch/x86/kernel/fpu/core.c
3021 --- linux-4.14.orig/arch/x86/kernel/fpu/core.c  2018-09-05 11:03:20.000000000 +0200
3022 +++ linux-4.14/arch/x86/kernel/fpu/core.c       2018-09-05 11:05:07.000000000 +0200
3023 @@ -138,6 +138,18 @@
3024  }
3025  EXPORT_SYMBOL_GPL(kernel_fpu_end);
3026  
3027 +void kernel_fpu_resched(void)
3028 +{
3029 +       WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
3030 +
3031 +       if (should_resched(PREEMPT_OFFSET)) {
3032 +               kernel_fpu_end();
3033 +               cond_resched();
3034 +               kernel_fpu_begin();
3035 +       }
3036 +}
3037 +EXPORT_SYMBOL_GPL(kernel_fpu_resched);
3038 +
3039  /*
3040   * Save the FPU state (mark it for reload if necessary):
3041   *
3042 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/irq_32.c linux-4.14/arch/x86/kernel/irq_32.c
3043 --- linux-4.14.orig/arch/x86/kernel/irq_32.c    2018-09-05 11:03:20.000000000 +0200
3044 +++ linux-4.14/arch/x86/kernel/irq_32.c 2018-09-05 11:05:07.000000000 +0200
3045 @@ -130,6 +130,7 @@
3046                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3047  }
3048  
3049 +#ifndef CONFIG_PREEMPT_RT_FULL
3050  void do_softirq_own_stack(void)
3051  {
3052         struct irq_stack *irqstk;
3053 @@ -146,6 +147,7 @@
3054  
3055         call_on_stack(__do_softirq, isp);
3056  }
3057 +#endif
3058  
3059  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3060  {
3061 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kernel/process_32.c linux-4.14/arch/x86/kernel/process_32.c
3062 --- linux-4.14.orig/arch/x86/kernel/process_32.c        2018-09-05 11:03:20.000000000 +0200
3063 +++ linux-4.14/arch/x86/kernel/process_32.c     2018-09-05 11:05:07.000000000 +0200
3064 @@ -38,6 +38,7 @@
3065  #include <linux/io.h>
3066  #include <linux/kdebug.h>
3067  #include <linux/syscalls.h>
3068 +#include <linux/highmem.h>
3069  
3070  #include <asm/pgtable.h>
3071  #include <asm/ldt.h>
3072 @@ -198,6 +199,35 @@
3073  }
3074  EXPORT_SYMBOL_GPL(start_thread);
3075  
3076 +#ifdef CONFIG_PREEMPT_RT_FULL
3077 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3078 +{
3079 +       int i;
3080 +
3081 +       /*
3082 +        * Clear @prev_p's kmap_atomic mappings
3083 +        */
3084 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3085 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3086 +               pte_t *ptep = kmap_pte - idx;
3087 +
3088 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3089 +       }
3090 +       /*
3091 +        * Restore @next_p's kmap_atomic mappings
3092 +        */
3093 +       for (i = 0; i < next_p->kmap_idx; i++) {
3094 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3095 +
3096 +               if (!pte_none(next_p->kmap_pte[i]))
3097 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3098 +       }
3099 +}
3100 +#else
3101 +static inline void
3102 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3103 +#endif
3104 +
3105  
3106  /*
3107   *     switch_to(x,y) should switch tasks from x to y.
3108 @@ -273,6 +303,8 @@
3109                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3110                 __switch_to_xtra(prev_p, next_p, tss);
3111  
3112 +       switch_kmaps(prev_p, next_p);
3113 +
3114         /*
3115          * Leave lazy mode, flushing any hypercalls made here.
3116          * This must be done before restoring TLS segments so
3117 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/lapic.c linux-4.14/arch/x86/kvm/lapic.c
3118 --- linux-4.14.orig/arch/x86/kvm/lapic.c        2018-09-05 11:03:20.000000000 +0200
3119 +++ linux-4.14/arch/x86/kvm/lapic.c     2018-09-05 11:05:07.000000000 +0200
3120 @@ -2120,7 +2120,7 @@
3121         apic->vcpu = vcpu;
3122  
3123         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3124 -                    HRTIMER_MODE_ABS_PINNED);
3125 +                    HRTIMER_MODE_ABS_PINNED_HARD);
3126         apic->lapic_timer.timer.function = apic_timer_fn;
3127  
3128         /*
3129 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/kvm/x86.c linux-4.14/arch/x86/kvm/x86.c
3130 --- linux-4.14.orig/arch/x86/kvm/x86.c  2018-09-05 11:03:20.000000000 +0200
3131 +++ linux-4.14/arch/x86/kvm/x86.c       2018-09-05 11:05:07.000000000 +0200
3132 @@ -6285,6 +6285,13 @@
3133                 goto out;
3134         }
3135  
3136 +#ifdef CONFIG_PREEMPT_RT_FULL
3137 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3138 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3139 +               return -EOPNOTSUPP;
3140 +       }
3141 +#endif
3142 +
3143         r = kvm_mmu_module_init();
3144         if (r)
3145                 goto out_free_percpu;
3146 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/highmem_32.c linux-4.14/arch/x86/mm/highmem_32.c
3147 --- linux-4.14.orig/arch/x86/mm/highmem_32.c    2017-11-12 19:46:13.000000000 +0100
3148 +++ linux-4.14/arch/x86/mm/highmem_32.c 2018-09-05 11:05:07.000000000 +0200
3149 @@ -32,10 +32,11 @@
3150   */
3151  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3152  {
3153 +       pte_t pte = mk_pte(page, prot);
3154         unsigned long vaddr;
3155         int idx, type;
3156  
3157 -       preempt_disable();
3158 +       preempt_disable_nort();
3159         pagefault_disable();
3160  
3161         if (!PageHighMem(page))
3162 @@ -45,7 +46,10 @@
3163         idx = type + KM_TYPE_NR*smp_processor_id();
3164         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3165         BUG_ON(!pte_none(*(kmap_pte-idx)));
3166 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3167 +#ifdef CONFIG_PREEMPT_RT_FULL
3168 +       current->kmap_pte[type] = pte;
3169 +#endif
3170 +       set_pte(kmap_pte-idx, pte);
3171         arch_flush_lazy_mmu_mode();
3172  
3173         return (void *)vaddr;
3174 @@ -88,6 +92,9 @@
3175                  * is a bad idea also, in case the page changes cacheability
3176                  * attributes or becomes a protected page in a hypervisor.
3177                  */
3178 +#ifdef CONFIG_PREEMPT_RT_FULL
3179 +               current->kmap_pte[type] = __pte(0);
3180 +#endif
3181                 kpte_clear_flush(kmap_pte-idx, vaddr);
3182                 kmap_atomic_idx_pop();
3183                 arch_flush_lazy_mmu_mode();
3184 @@ -100,7 +107,7 @@
3185  #endif
3186  
3187         pagefault_enable();
3188 -       preempt_enable();
3189 +       preempt_enable_nort();
3190  }
3191  EXPORT_SYMBOL(__kunmap_atomic);
3192  
3193 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/mm/iomap_32.c linux-4.14/arch/x86/mm/iomap_32.c
3194 --- linux-4.14.orig/arch/x86/mm/iomap_32.c      2017-11-12 19:46:13.000000000 +0100
3195 +++ linux-4.14/arch/x86/mm/iomap_32.c   2018-09-05 11:05:07.000000000 +0200
3196 @@ -56,6 +56,7 @@
3197  
3198  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3199  {
3200 +       pte_t pte = pfn_pte(pfn, prot);
3201         unsigned long vaddr;
3202         int idx, type;
3203  
3204 @@ -65,7 +66,12 @@
3205         type = kmap_atomic_idx_push();
3206         idx = type + KM_TYPE_NR * smp_processor_id();
3207         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3208 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3209 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3210 +
3211 +#ifdef CONFIG_PREEMPT_RT_FULL
3212 +       current->kmap_pte[type] = pte;
3213 +#endif
3214 +       set_pte(kmap_pte - idx, pte);
3215         arch_flush_lazy_mmu_mode();
3216  
3217         return (void *)vaddr;
3218 @@ -113,6 +119,9 @@
3219                  * is a bad idea also, in case the page changes cacheability
3220                  * attributes or becomes a protected page in a hypervisor.
3221                  */
3222 +#ifdef CONFIG_PREEMPT_RT_FULL
3223 +               current->kmap_pte[type] = __pte(0);
3224 +#endif
3225                 kpte_clear_flush(kmap_pte-idx, vaddr);
3226                 kmap_atomic_idx_pop();
3227         }
3228 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c linux-4.14/arch/x86/platform/uv/tlb_uv.c
3229 --- linux-4.14.orig/arch/x86/platform/uv/tlb_uv.c       2018-09-05 11:03:20.000000000 +0200
3230 +++ linux-4.14/arch/x86/platform/uv/tlb_uv.c    2018-09-05 11:05:07.000000000 +0200
3231 @@ -740,9 +740,9 @@
3232  
3233                 quiesce_local_uvhub(hmaster);
3234  
3235 -               spin_lock(&hmaster->queue_lock);
3236 +               raw_spin_lock(&hmaster->queue_lock);
3237                 reset_with_ipi(&bau_desc->distribution, bcp);
3238 -               spin_unlock(&hmaster->queue_lock);
3239 +               raw_spin_unlock(&hmaster->queue_lock);
3240  
3241                 end_uvhub_quiesce(hmaster);
3242  
3243 @@ -762,9 +762,9 @@
3244  
3245                 quiesce_local_uvhub(hmaster);
3246  
3247 -               spin_lock(&hmaster->queue_lock);
3248 +               raw_spin_lock(&hmaster->queue_lock);
3249                 reset_with_ipi(&bau_desc->distribution, bcp);
3250 -               spin_unlock(&hmaster->queue_lock);
3251 +               raw_spin_unlock(&hmaster->queue_lock);
3252  
3253                 end_uvhub_quiesce(hmaster);
3254  
3255 @@ -785,7 +785,7 @@
3256         cycles_t tm1;
3257  
3258         hmaster = bcp->uvhub_master;
3259 -       spin_lock(&hmaster->disable_lock);
3260 +       raw_spin_lock(&hmaster->disable_lock);
3261         if (!bcp->baudisabled) {
3262                 stat->s_bau_disabled++;
3263                 tm1 = get_cycles();
3264 @@ -798,7 +798,7 @@
3265                         }
3266                 }
3267         }
3268 -       spin_unlock(&hmaster->disable_lock);
3269 +       raw_spin_unlock(&hmaster->disable_lock);
3270  }
3271  
3272  static void count_max_concurr(int stat, struct bau_control *bcp,
3273 @@ -861,7 +861,7 @@
3274   */
3275  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3276  {
3277 -       spinlock_t *lock = &hmaster->uvhub_lock;
3278 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3279         atomic_t *v;
3280  
3281         v = &hmaster->active_descriptor_count;
3282 @@ -995,7 +995,7 @@
3283         struct bau_control *hmaster;
3284  
3285         hmaster = bcp->uvhub_master;
3286 -       spin_lock(&hmaster->disable_lock);
3287 +       raw_spin_lock(&hmaster->disable_lock);
3288         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3289                 stat->s_bau_reenabled++;
3290                 for_each_present_cpu(tcpu) {
3291 @@ -1007,10 +1007,10 @@
3292                                 tbcp->period_giveups = 0;
3293                         }
3294                 }
3295 -               spin_unlock(&hmaster->disable_lock);
3296 +               raw_spin_unlock(&hmaster->disable_lock);
3297                 return 0;
3298         }
3299 -       spin_unlock(&hmaster->disable_lock);
3300 +       raw_spin_unlock(&hmaster->disable_lock);
3301         return -1;
3302  }
3303  
3304 @@ -1942,9 +1942,9 @@
3305                 bcp->cong_reps                  = congested_reps;
3306                 bcp->disabled_period            = sec_2_cycles(disabled_period);
3307                 bcp->giveup_limit               = giveup_limit;
3308 -               spin_lock_init(&bcp->queue_lock);
3309 -               spin_lock_init(&bcp->uvhub_lock);
3310 -               spin_lock_init(&bcp->disable_lock);
3311 +               raw_spin_lock_init(&bcp->queue_lock);
3312 +               raw_spin_lock_init(&bcp->uvhub_lock);
3313 +               raw_spin_lock_init(&bcp->disable_lock);
3314         }
3315  }
3316  
3317 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/x86/platform/uv/uv_time.c linux-4.14/arch/x86/platform/uv/uv_time.c
3318 --- linux-4.14.orig/arch/x86/platform/uv/uv_time.c      2017-11-12 19:46:13.000000000 +0100
3319 +++ linux-4.14/arch/x86/platform/uv/uv_time.c   2018-09-05 11:05:07.000000000 +0200
3320 @@ -57,7 +57,7 @@
3321  
3322  /* There is one of these allocated per node */
3323  struct uv_rtc_timer_head {
3324 -       spinlock_t      lock;
3325 +       raw_spinlock_t  lock;
3326         /* next cpu waiting for timer, local node relative: */
3327         int             next_cpu;
3328         /* number of cpus on this node: */
3329 @@ -177,7 +177,7 @@
3330                                 uv_rtc_deallocate_timers();
3331                                 return -ENOMEM;
3332                         }
3333 -                       spin_lock_init(&head->lock);
3334 +                       raw_spin_lock_init(&head->lock);
3335                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3336                         head->next_cpu = -1;
3337                         blade_info[bid] = head;
3338 @@ -231,7 +231,7 @@
3339         unsigned long flags;
3340         int next_cpu;
3341  
3342 -       spin_lock_irqsave(&head->lock, flags);
3343 +       raw_spin_lock_irqsave(&head->lock, flags);
3344  
3345         next_cpu = head->next_cpu;
3346         *t = expires;
3347 @@ -243,12 +243,12 @@
3348                 if (uv_setup_intr(cpu, expires)) {
3349                         *t = ULLONG_MAX;
3350                         uv_rtc_find_next_timer(head, pnode);
3351 -                       spin_unlock_irqrestore(&head->lock, flags);
3352 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3353                         return -ETIME;
3354                 }
3355         }
3356  
3357 -       spin_unlock_irqrestore(&head->lock, flags);
3358 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3359         return 0;
3360  }
3361  
3362 @@ -267,7 +267,7 @@
3363         unsigned long flags;
3364         int rc = 0;
3365  
3366 -       spin_lock_irqsave(&head->lock, flags);
3367 +       raw_spin_lock_irqsave(&head->lock, flags);
3368  
3369         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3370                 rc = 1;
3371 @@ -279,7 +279,7 @@
3372                         uv_rtc_find_next_timer(head, pnode);
3373         }
3374  
3375 -       spin_unlock_irqrestore(&head->lock, flags);
3376 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3377  
3378         return rc;
3379  }
3380 @@ -299,13 +299,17 @@
3381  static u64 uv_read_rtc(struct clocksource *cs)
3382  {
3383         unsigned long offset;
3384 +       u64 cycles;
3385  
3386 +       preempt_disable();
3387         if (uv_get_min_hub_revision_id() == 1)
3388                 offset = 0;
3389         else
3390                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3391  
3392 -       return (u64)uv_read_local_mmr(UVH_RTC | offset);
3393 +       cycles = (u64)uv_read_local_mmr(UVH_RTC | offset);
3394 +       preempt_enable();
3395 +       return cycles;
3396  }
3397  
3398  /*
3399 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h linux-4.14/arch/xtensa/include/asm/spinlock_types.h
3400 --- linux-4.14.orig/arch/xtensa/include/asm/spinlock_types.h    2017-11-12 19:46:13.000000000 +0100
3401 +++ linux-4.14/arch/xtensa/include/asm/spinlock_types.h 2018-09-05 11:05:07.000000000 +0200
3402 @@ -2,10 +2,6 @@
3403  #ifndef __ASM_SPINLOCK_TYPES_H
3404  #define __ASM_SPINLOCK_TYPES_H
3405  
3406 -#ifndef __LINUX_SPINLOCK_TYPES_H
3407 -# error "please don't include this file directly"
3408 -#endif
3409 -
3410  typedef struct {
3411         volatile unsigned int slock;
3412  } arch_spinlock_t;
3413 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-core.c linux-4.14/block/blk-core.c
3414 --- linux-4.14.orig/block/blk-core.c    2018-09-05 11:03:20.000000000 +0200
3415 +++ linux-4.14/block/blk-core.c 2018-09-05 11:05:07.000000000 +0200
3416 @@ -116,6 +116,9 @@
3417  
3418         INIT_LIST_HEAD(&rq->queuelist);
3419         INIT_LIST_HEAD(&rq->timeout_list);
3420 +#ifdef CONFIG_PREEMPT_RT_FULL
3421 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3422 +#endif
3423         rq->cpu = -1;
3424         rq->q = q;
3425         rq->__sector = (sector_t) -1;
3426 @@ -280,7 +283,7 @@
3427  void blk_start_queue(struct request_queue *q)
3428  {
3429         lockdep_assert_held(q->queue_lock);
3430 -       WARN_ON(!in_interrupt() && !irqs_disabled());
3431 +       WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
3432         WARN_ON_ONCE(q->mq_ops);
3433  
3434         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3435 @@ -808,12 +811,21 @@
3436         percpu_ref_put(&q->q_usage_counter);
3437  }
3438  
3439 +static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
3440 +{
3441 +       struct request_queue *q =
3442 +               container_of(sev, struct request_queue, mq_pcpu_wake);
3443 +
3444 +       wake_up_all(&q->mq_freeze_wq);
3445 +}
3446 +
3447  static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3448  {
3449         struct request_queue *q =
3450                 container_of(ref, struct request_queue, q_usage_counter);
3451  
3452 -       wake_up_all(&q->mq_freeze_wq);
3453 +       if (wq_has_sleeper(&q->mq_freeze_wq))
3454 +               swork_queue(&q->mq_pcpu_wake);
3455  }
3456  
3457  static void blk_rq_timed_out_timer(unsigned long data)
3458 @@ -890,6 +902,7 @@
3459         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3460  
3461         init_waitqueue_head(&q->mq_freeze_wq);
3462 +       INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
3463  
3464         /*
3465          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3466 @@ -3308,7 +3321,7 @@
3467                 blk_run_queue_async(q);
3468         else
3469                 __blk_run_queue(q);
3470 -       spin_unlock(q->queue_lock);
3471 +       spin_unlock_irq(q->queue_lock);
3472  }
3473  
3474  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3475 @@ -3356,7 +3369,6 @@
3476  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3477  {
3478         struct request_queue *q;
3479 -       unsigned long flags;
3480         struct request *rq;
3481         LIST_HEAD(list);
3482         unsigned int depth;
3483 @@ -3376,11 +3388,6 @@
3484         q = NULL;
3485         depth = 0;
3486  
3487 -       /*
3488 -        * Save and disable interrupts here, to avoid doing it for every
3489 -        * queue lock we have to take.
3490 -        */
3491 -       local_irq_save(flags);
3492         while (!list_empty(&list)) {
3493                 rq = list_entry_rq(list.next);
3494                 list_del_init(&rq->queuelist);
3495 @@ -3393,7 +3400,7 @@
3496                                 queue_unplugged(q, depth, from_schedule);
3497                         q = rq->q;
3498                         depth = 0;
3499 -                       spin_lock(q->queue_lock);
3500 +                       spin_lock_irq(q->queue_lock);
3501                 }
3502  
3503                 /*
3504 @@ -3420,8 +3427,6 @@
3505          */
3506         if (q)
3507                 queue_unplugged(q, depth, from_schedule);
3508 -
3509 -       local_irq_restore(flags);
3510  }
3511  
3512  void blk_finish_plug(struct blk_plug *plug)
3513 @@ -3631,6 +3636,8 @@
3514         if (!kblockd_workqueue)
3515                 panic("Failed to create kblockd\n");
3516  
3517 +       BUG_ON(swork_get());
3518 +
3519         request_cachep = kmem_cache_create("blkdev_requests",
3520                         sizeof(struct request), 0, SLAB_PANIC, NULL);
3521  
3522 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-ioc.c linux-4.14/block/blk-ioc.c
3523 --- linux-4.14.orig/block/blk-ioc.c     2017-11-12 19:46:13.000000000 +0100
3524 +++ linux-4.14/block/blk-ioc.c  2018-09-05 11:05:07.000000000 +0200
3525 @@ -9,6 +9,7 @@
3526  #include <linux/blkdev.h>
3527  #include <linux/slab.h>
3528  #include <linux/sched/task.h>
3529 +#include <linux/delay.h>
3530  
3531  #include "blk.h"
3532  
3533 @@ -118,7 +119,7 @@
3534                         spin_unlock(q->queue_lock);
3535                 } else {
3536                         spin_unlock_irqrestore(&ioc->lock, flags);
3537 -                       cpu_relax();
3538 +                       cpu_chill();
3539                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3540                 }
3541         }
3542 @@ -202,7 +203,7 @@
3543                                 spin_unlock(icq->q->queue_lock);
3544                         } else {
3545                                 spin_unlock_irqrestore(&ioc->lock, flags);
3546 -                               cpu_relax();
3547 +                               cpu_chill();
3548                                 goto retry;
3549                         }
3550                 }
3551 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.c linux-4.14/block/blk-mq.c
3552 --- linux-4.14.orig/block/blk-mq.c      2018-09-05 11:03:20.000000000 +0200
3553 +++ linux-4.14/block/blk-mq.c   2018-09-05 11:05:07.000000000 +0200
3554 @@ -339,6 +339,9 @@
3555         /* tag was already set */
3556         rq->extra_len = 0;
3557  
3558 +#ifdef CONFIG_PREEMPT_RT_FULL
3559 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3560 +#endif
3561         INIT_LIST_HEAD(&rq->timeout_list);
3562         rq->timeout = 0;
3563  
3564 @@ -533,12 +536,24 @@
3565  }
3566  EXPORT_SYMBOL(blk_mq_end_request);
3567  
3568 +#ifdef CONFIG_PREEMPT_RT_FULL
3569 +
3570 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3571 +{
3572 +       struct request *rq = container_of(work, struct request, work);
3573 +
3574 +       rq->q->softirq_done_fn(rq);
3575 +}
3576 +
3577 +#else
3578 +
3579  static void __blk_mq_complete_request_remote(void *data)
3580  {
3581         struct request *rq = data;
3582  
3583         rq->q->softirq_done_fn(rq);
3584  }
3585 +#endif
3586  
3587  static void __blk_mq_complete_request(struct request *rq)
3588  {
3589 @@ -558,19 +573,27 @@
3590                 return;
3591         }
3592  
3593 -       cpu = get_cpu();
3594 +       cpu = get_cpu_light();
3595         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3596                 shared = cpus_share_cache(cpu, ctx->cpu);
3597  
3598         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3599 +#ifdef CONFIG_PREEMPT_RT_FULL
3600 +               /*
3601 +                * We could force QUEUE_FLAG_SAME_FORCE, then we would not get
3602 +                * here. But we could try to invoke it on the CPU like this.
3603 +                */
3604 +               schedule_work_on(ctx->cpu, &rq->work);
3605 +#else
3606                 rq->csd.func = __blk_mq_complete_request_remote;
3607                 rq->csd.info = rq;
3608                 rq->csd.flags = 0;
3609                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3610 +#endif
3611         } else {
3612                 rq->q->softirq_done_fn(rq);
3613         }
3614 -       put_cpu();
3615 +       put_cpu_light();
3616  }
3617  
3618  /**
3619 @@ -1238,14 +1261,14 @@
3620                 return;
3621  
3622         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3623 -               int cpu = get_cpu();
3624 +               int cpu = get_cpu_light();
3625                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3626                         __blk_mq_run_hw_queue(hctx);
3627 -                       put_cpu();
3628 +                       put_cpu_light();
3629                         return;
3630                 }
3631  
3632 -               put_cpu();
3633 +               put_cpu_light();
3634         }
3635  
3636         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
3637 @@ -2863,10 +2886,9 @@
3638         kt = nsecs;
3639  
3640         mode = HRTIMER_MODE_REL;
3641 -       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3642 +       hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3643         hrtimer_set_expires(&hs.timer, kt);
3644  
3645 -       hrtimer_init_sleeper(&hs, current);
3646         do {
3647                 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
3648                         break;
3649 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-mq.h linux-4.14/block/blk-mq.h
3650 --- linux-4.14.orig/block/blk-mq.h      2018-09-05 11:03:20.000000000 +0200
3651 +++ linux-4.14/block/blk-mq.h   2018-09-05 11:05:07.000000000 +0200
3652 @@ -98,12 +98,12 @@
3653   */
3654  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3655  {
3656 -       return __blk_mq_get_ctx(q, get_cpu());
3657 +       return __blk_mq_get_ctx(q, get_cpu_light());
3658  }
3659  
3660  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3661  {
3662 -       put_cpu();
3663 +       put_cpu_light();
3664  }
3665  
3666  struct blk_mq_alloc_data {
3667 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/blk-softirq.c linux-4.14/block/blk-softirq.c
3668 --- linux-4.14.orig/block/blk-softirq.c 2017-11-12 19:46:13.000000000 +0100
3669 +++ linux-4.14/block/blk-softirq.c      2018-09-05 11:05:07.000000000 +0200
3670 @@ -53,6 +53,7 @@
3671                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3672  
3673         local_irq_restore(flags);
3674 +       preempt_check_resched_rt();
3675  }
3676  
3677  /*
3678 @@ -91,6 +92,7 @@
3679                          this_cpu_ptr(&blk_cpu_done));
3680         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3681         local_irq_enable();
3682 +       preempt_check_resched_rt();
3683  
3684         return 0;
3685  }
3686 @@ -143,6 +145,7 @@
3687                 goto do_local;
3688  
3689         local_irq_restore(flags);
3690 +       preempt_check_resched_rt();
3691  }
3692  
3693  /**
3694 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/block/bounce.c linux-4.14/block/bounce.c
3695 --- linux-4.14.orig/block/bounce.c      2018-09-05 11:03:20.000000000 +0200
3696 +++ linux-4.14/block/bounce.c   2018-09-05 11:05:07.000000000 +0200
3697 @@ -66,11 +66,11 @@
3698         unsigned long flags;
3699         unsigned char *vto;
3700  
3701 -       local_irq_save(flags);
3702 +       local_irq_save_nort(flags);
3703         vto = kmap_atomic(to->bv_page);
3704         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3705         kunmap_atomic(vto);
3706 -       local_irq_restore(flags);
3707 +       local_irq_restore_nort(flags);
3708  }
3709  
3710  #else /* CONFIG_HIGHMEM */
3711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/algapi.c linux-4.14/crypto/algapi.c
3712 --- linux-4.14.orig/crypto/algapi.c     2018-09-05 11:03:20.000000000 +0200
3713 +++ linux-4.14/crypto/algapi.c  2018-09-05 11:05:07.000000000 +0200
3714 @@ -731,13 +731,13 @@
3715  
3716  int crypto_register_notifier(struct notifier_block *nb)
3717  {
3718 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3719 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3720  }
3721  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3722  
3723  int crypto_unregister_notifier(struct notifier_block *nb)
3724  {
3725 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3726 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3727  }
3728  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3729  
3730 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/api.c linux-4.14/crypto/api.c
3731 --- linux-4.14.orig/crypto/api.c        2017-11-12 19:46:13.000000000 +0100
3732 +++ linux-4.14/crypto/api.c     2018-09-05 11:05:07.000000000 +0200
3733 @@ -31,7 +31,7 @@
3734  DECLARE_RWSEM(crypto_alg_sem);
3735  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3736  
3737 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3738 +SRCU_NOTIFIER_HEAD(crypto_chain);
3739  EXPORT_SYMBOL_GPL(crypto_chain);
3740  
3741  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3742 @@ -236,10 +236,10 @@
3743  {
3744         int ok;
3745  
3746 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3747 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3748         if (ok == NOTIFY_DONE) {
3749                 request_module("cryptomgr");
3750 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3751 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3752         }
3753  
3754         return ok;
3755 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/crypto/internal.h linux-4.14/crypto/internal.h
3756 --- linux-4.14.orig/crypto/internal.h   2017-11-12 19:46:13.000000000 +0100
3757 +++ linux-4.14/crypto/internal.h        2018-09-05 11:05:07.000000000 +0200
3758 @@ -47,7 +47,7 @@
3759  
3760  extern struct list_head crypto_alg_list;
3761  extern struct rw_semaphore crypto_alg_sem;
3762 -extern struct blocking_notifier_head crypto_chain;
3763 +extern struct srcu_notifier_head crypto_chain;
3764  
3765  #ifdef CONFIG_PROC_FS
3766  void __init crypto_init_proc(void);
3767 @@ -143,7 +143,7 @@
3768  
3769  static inline void crypto_notify(unsigned long val, void *v)
3770  {
3771 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3772 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3773  }
3774  
3775  #endif /* _CRYPTO_INTERNAL_H */
3776 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/events.txt linux-4.14/Documentation/trace/events.txt
3777 --- linux-4.14.orig/Documentation/trace/events.txt      2017-11-12 19:46:13.000000000 +0100
3778 +++ linux-4.14/Documentation/trace/events.txt   2018-09-05 11:05:07.000000000 +0200
3779 @@ -517,1550 +517,4 @@
3780    totals derived from one or more trace event format fields and/or
3781    event counts (hitcount).
3782  
3783 -  The format of a hist trigger is as follows:
3784 -
3785 -        hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
3786 -          [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
3787 -          [:clear][:name=histname1] [if <filter>]
3788 -
3789 -  When a matching event is hit, an entry is added to a hash table
3790 -  using the key(s) and value(s) named.  Keys and values correspond to
3791 -  fields in the event's format description.  Values must correspond to
3792 -  numeric fields - on an event hit, the value(s) will be added to a
3793 -  sum kept for that field.  The special string 'hitcount' can be used
3794 -  in place of an explicit value field - this is simply a count of
3795 -  event hits.  If 'values' isn't specified, an implicit 'hitcount'
3796 -  value will be automatically created and used as the only value.
3797 -  Keys can be any field, or the special string 'stacktrace', which
3798 -  will use the event's kernel stacktrace as the key.  The keywords
3799 -  'keys' or 'key' can be used to specify keys, and the keywords
3800 -  'values', 'vals', or 'val' can be used to specify values.  Compound
3801 -  keys consisting of up to two fields can be specified by the 'keys'
3802 -  keyword.  Hashing a compound key produces a unique entry in the
3803 -  table for each unique combination of component keys, and can be
3804 -  useful for providing more fine-grained summaries of event data.
3805 -  Additionally, sort keys consisting of up to two fields can be
3806 -  specified by the 'sort' keyword.  If more than one field is
3807 -  specified, the result will be a 'sort within a sort': the first key
3808 -  is taken to be the primary sort key and the second the secondary
3809 -  key.  If a hist trigger is given a name using the 'name' parameter,
3810 -  its histogram data will be shared with other triggers of the same
3811 -  name, and trigger hits will update this common data.  Only triggers
3812 -  with 'compatible' fields can be combined in this way; triggers are
3813 -  'compatible' if the fields named in the trigger share the same
3814 -  number and type of fields and those fields also have the same names.
3815 -  Note that any two events always share the compatible 'hitcount' and
3816 -  'stacktrace' fields and can therefore be combined using those
3817 -  fields, however pointless that may be.
3818 -
3819 -  'hist' triggers add a 'hist' file to each event's subdirectory.
3820 -  Reading the 'hist' file for the event will dump the hash table in
3821 -  its entirety to stdout.  If there are multiple hist triggers
3822 -  attached to an event, there will be a table for each trigger in the
3823 -  output.  The table displayed for a named trigger will be the same as
3824 -  any other instance having the same name. Each printed hash table
3825 -  entry is a simple list of the keys and values comprising the entry;
3826 -  keys are printed first and are delineated by curly braces, and are
3827 -  followed by the set of value fields for the entry.  By default,
3828 -  numeric fields are displayed as base-10 integers.  This can be
3829 -  modified by appending any of the following modifiers to the field
3830 -  name:
3831 -
3832 -        .hex        display a number as a hex value
3833 -       .sym        display an address as a symbol
3834 -       .sym-offset display an address as a symbol and offset
3835 -       .syscall    display a syscall id as a system call name
3836 -       .execname   display a common_pid as a program name
3837 -
3838 -  Note that in general the semantics of a given field aren't
3839 -  interpreted when applying a modifier to it, but there are some
3840 -  restrictions to be aware of in this regard:
3841 -
3842 -    - only the 'hex' modifier can be used for values (because values
3843 -      are essentially sums, and the other modifiers don't make sense
3844 -      in that context).
3845 -    - the 'execname' modifier can only be used on a 'common_pid'.  The
3846 -      reason for this is that the execname is simply the 'comm' value
3847 -      saved for the 'current' process when an event was triggered,
3848 -      which is the same as the common_pid value saved by the event
3849 -      tracing code.  Trying to apply that comm value to other pid
3850 -      values wouldn't be correct, and typically events that care save
3851 -      pid-specific comm fields in the event itself.
3852 -
3853 -  A typical usage scenario would be the following to enable a hist
3854 -  trigger, read its current contents, and then turn it off:
3855 -
3856 -  # echo 'hist:keys=skbaddr.hex:vals=len' > \
3857 -    /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3858 -
3859 -  # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3860 -
3861 -  # echo '!hist:keys=skbaddr.hex:vals=len' > \
3862 -    /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3863 -
3864 -  The trigger file itself can be read to show the details of the
3865 -  currently attached hist trigger.  This information is also displayed
3866 -  at the top of the 'hist' file when read.
3867 -
3868 -  By default, the size of the hash table is 2048 entries.  The 'size'
3869 -  parameter can be used to specify more or fewer than that.  The units
3870 -  are in terms of hashtable entries - if a run uses more entries than
3871 -  specified, the results will show the number of 'drops', the number
3872 -  of hits that were ignored.  The size should be a power of 2 between
3873 -  128 and 131072 (any non- power-of-2 number specified will be rounded
3874 -  up).
3875 -
3876 -  The 'sort' parameter can be used to specify a value field to sort
3877 -  on.  The default if unspecified is 'hitcount' and the default sort
3878 -  order is 'ascending'.  To sort in the opposite direction, append
3879 -  .descending' to the sort key.
3880 -
3881 -  The 'pause' parameter can be used to pause an existing hist trigger
3882 -  or to start a hist trigger but not log any events until told to do
3883 -  so.  'continue' or 'cont' can be used to start or restart a paused
3884 -  hist trigger.
3885 -
3886 -  The 'clear' parameter will clear the contents of a running hist
3887 -  trigger and leave its current paused/active state.
3888 -
3889 -  Note that the 'pause', 'cont', and 'clear' parameters should be
3890 -  applied using 'append' shell operator ('>>') if applied to an
3891 -  existing trigger, rather than via the '>' operator, which will cause
3892 -  the trigger to be removed through truncation.
3893 -
3894 -- enable_hist/disable_hist
3895 -
3896 -  The enable_hist and disable_hist triggers can be used to have one
3897 -  event conditionally start and stop another event's already-attached
3898 -  hist trigger.  Any number of enable_hist and disable_hist triggers
3899 -  can be attached to a given event, allowing that event to kick off
3900 -  and stop aggregations on a host of other events.
3901 -
3902 -  The format is very similar to the enable/disable_event triggers:
3903 -
3904 -      enable_hist:<system>:<event>[:count]
3905 -      disable_hist:<system>:<event>[:count]
3906 -
3907 -  Instead of enabling or disabling the tracing of the target event
3908 -  into the trace buffer as the enable/disable_event triggers do, the
3909 -  enable/disable_hist triggers enable or disable the aggregation of
3910 -  the target event into a hash table.
3911 -
3912 -  A typical usage scenario for the enable_hist/disable_hist triggers
3913 -  would be to first set up a paused hist trigger on some event,
3914 -  followed by an enable_hist/disable_hist pair that turns the hist
3915 -  aggregation on and off when conditions of interest are hit:
3916 -
3917 -  # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
3918 -    /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
3919 -
3920 -  # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
3921 -    /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
3922 -
3923 -  # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
3924 -    /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
3925 -
3926 -  The above sets up an initially paused hist trigger which is unpaused
3927 -  and starts aggregating events when a given program is executed, and
3928 -  which stops aggregating when the process exits and the hist trigger
3929 -  is paused again.
3930 -
3931 -  The examples below provide a more concrete illustration of the
3932 -  concepts and typical usage patterns discussed above.
3933 -
3934 -
3935 -6.2 'hist' trigger examples
3936 ----------------------------
3937 -
3938 -  The first set of examples creates aggregations using the kmalloc
3939 -  event.  The fields that can be used for the hist trigger are listed
3940 -  in the kmalloc event's format file:
3941 -
3942 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
3943 -    name: kmalloc
3944 -    ID: 374
3945 -    format:
3946 -       field:unsigned short common_type;       offset:0;       size:2; signed:0;
3947 -       field:unsigned char common_flags;       offset:2;       size:1; signed:0;
3948 -       field:unsigned char common_preempt_count;               offset:3;       size:1; signed:0;
3949 -       field:int common_pid;                                   offset:4;       size:4; signed:1;
3950 -
3951 -       field:unsigned long call_site;                          offset:8;       size:8; signed:0;
3952 -       field:const void * ptr;                                 offset:16;      size:8; signed:0;
3953 -       field:size_t bytes_req;                                 offset:24;      size:8; signed:0;
3954 -       field:size_t bytes_alloc;                               offset:32;      size:8; signed:0;
3955 -       field:gfp_t gfp_flags;                                  offset:40;      size:4; signed:0;
3956 -
3957 -  We'll start by creating a hist trigger that generates a simple table
3958 -  that lists the total number of bytes requested for each function in
3959 -  the kernel that made one or more calls to kmalloc:
3960 -
3961 -    # echo 'hist:key=call_site:val=bytes_req' > \
3962 -            /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
3963 -
3964 -  This tells the tracing system to create a 'hist' trigger using the
3965 -  call_site field of the kmalloc event as the key for the table, which
3966 -  just means that each unique call_site address will have an entry
3967 -  created for it in the table.  The 'val=bytes_req' parameter tells
3968 -  the hist trigger that for each unique entry (call_site) in the
3969 -  table, it should keep a running total of the number of bytes
3970 -  requested by that call_site.
3971 -
3972 -  We'll let it run for awhile and then dump the contents of the 'hist'
3973 -  file in the kmalloc event's subdirectory (for readability, a number
3974 -  of entries have been omitted):
3975 -
3976 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
3977 -    # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
3978 -
3979 -    { call_site: 18446744072106379007 } hitcount:          1  bytes_req:        176
3980 -    { call_site: 18446744071579557049 } hitcount:          1  bytes_req:       1024
3981 -    { call_site: 18446744071580608289 } hitcount:          1  bytes_req:      16384
3982 -    { call_site: 18446744071581827654 } hitcount:          1  bytes_req:         24
3983 -    { call_site: 18446744071580700980 } hitcount:          1  bytes_req:          8
3984 -    { call_site: 18446744071579359876 } hitcount:          1  bytes_req:        152
3985 -    { call_site: 18446744071580795365 } hitcount:          3  bytes_req:        144
3986 -    { call_site: 18446744071581303129 } hitcount:          3  bytes_req:        144
3987 -    { call_site: 18446744071580713234 } hitcount:          4  bytes_req:       2560
3988 -    { call_site: 18446744071580933750 } hitcount:          4  bytes_req:        736
3989 -    .
3990 -    .
3991 -    .
3992 -    { call_site: 18446744072106047046 } hitcount:         69  bytes_req:       5576
3993 -    { call_site: 18446744071582116407 } hitcount:         73  bytes_req:       2336
3994 -    { call_site: 18446744072106054684 } hitcount:        136  bytes_req:     140504
3995 -    { call_site: 18446744072106224230 } hitcount:        136  bytes_req:      19584
3996 -    { call_site: 18446744072106078074 } hitcount:        153  bytes_req:       2448
3997 -    { call_site: 18446744072106062406 } hitcount:        153  bytes_req:      36720
3998 -    { call_site: 18446744071582507929 } hitcount:        153  bytes_req:      37088
3999 -    { call_site: 18446744072102520590 } hitcount:        273  bytes_req:      10920
4000 -    { call_site: 18446744071582143559 } hitcount:        358  bytes_req:        716
4001 -    { call_site: 18446744072106465852 } hitcount:        417  bytes_req:      56712
4002 -    { call_site: 18446744072102523378 } hitcount:        485  bytes_req:      27160
4003 -    { call_site: 18446744072099568646 } hitcount:       1676  bytes_req:      33520
4004 -
4005 -    Totals:
4006 -        Hits: 4610
4007 -        Entries: 45
4008 -        Dropped: 0
4009 -
4010 -  The output displays a line for each entry, beginning with the key
4011 -  specified in the trigger, followed by the value(s) also specified in
4012 -  the trigger.  At the beginning of the output is a line that displays
4013 -  the trigger info, which can also be displayed by reading the
4014 -  'trigger' file:
4015 -
4016 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4017 -    hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
4018 -
4019 -  At the end of the output are a few lines that display the overall
4020 -  totals for the run.  The 'Hits' field shows the total number of
4021 -  times the event trigger was hit, the 'Entries' field shows the total
4022 -  number of used entries in the hash table, and the 'Dropped' field
4023 -  shows the number of hits that were dropped because the number of
4024 -  used entries for the run exceeded the maximum number of entries
4025 -  allowed for the table (normally 0; if not, it's a hint that you may
4026 -  want to increase the size of the table using the 'size' parameter).
4027 -
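     -  If you do see a non-zero 'Dropped' count, the table can be enlarged
     -  by recreating the trigger with an explicit 'size' parameter.  As a
     -  minimal sketch (the value 8192 is just an illustration, and any
     -  existing hist trigger on the event would first need to be removed
     -  using the '!' form described below):
     -
     -    # echo 'hist:key=call_site:val=bytes_req:size=8192' > \
     -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
     -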
4028 -  Notice in the above output that there's an extra field, 'hitcount',
4029 -  which wasn't specified in the trigger.  Also notice that in the
4030 -  trigger info output, there's a parameter, 'sort=hitcount', which
4031 -  wasn't specified in the trigger either.  The reason for that is that
4032 -  every trigger implicitly keeps a count of the total number of hits
4033 -  attributed to a given entry, called the 'hitcount'.  That hitcount
4034 -  information is explicitly displayed in the output, and in the
4035 -  absence of a user-specified sort parameter, is used as the default
4036 -  sort field.
4037 -
4038 -  The value 'hitcount' can be used in place of an explicit value in
4039 -  the 'values' parameter if you don't really need to have any
4040 -  particular field summed and are mainly interested in hit
4041 -  frequencies.
4042 -
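     -  For instance, a trigger that only counts how many times each
     -  call_site was hit, without summing any field, could be written as
     -  shown below (an illustrative variation on the trigger above, not an
     -  additional required step):
     -
     -    # echo 'hist:key=call_site:val=hitcount' > \
     -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
     -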
4043 -  To turn the hist trigger off, simply call up the trigger in the
4044 -  command history and re-execute it with a '!' prepended:
4045 -
4046 -    # echo '!hist:key=call_site:val=bytes_req' > \
4047 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4048 -
4049 -  Finally, notice that the call_site as displayed in the output above
4050 -  isn't really very useful.  It's an address, but normally addresses
4051 -  are displayed in hex.  To have a numeric field displayed as a hex
4052 -  value, simply append '.hex' to the field name in the trigger:
4053 -
4054 -    # echo 'hist:key=call_site.hex:val=bytes_req' > \
4055 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4056 -
4057 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4058 -    # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
4059 -
4060 -    { call_site: ffffffffa026b291 } hitcount:          1  bytes_req:        433
4061 -    { call_site: ffffffffa07186ff } hitcount:          1  bytes_req:        176
4062 -    { call_site: ffffffff811ae721 } hitcount:          1  bytes_req:      16384
4063 -    { call_site: ffffffff811c5134 } hitcount:          1  bytes_req:          8
4064 -    { call_site: ffffffffa04a9ebb } hitcount:          1  bytes_req:        511
4065 -    { call_site: ffffffff8122e0a6 } hitcount:          1  bytes_req:         12
4066 -    { call_site: ffffffff8107da84 } hitcount:          1  bytes_req:        152
4067 -    { call_site: ffffffff812d8246 } hitcount:          1  bytes_req:         24
4068 -    { call_site: ffffffff811dc1e5 } hitcount:          3  bytes_req:        144
4069 -    { call_site: ffffffffa02515e8 } hitcount:          3  bytes_req:        648
4070 -    { call_site: ffffffff81258159 } hitcount:          3  bytes_req:        144
4071 -    { call_site: ffffffff811c80f4 } hitcount:          4  bytes_req:        544
4072 -    .
4073 -    .
4074 -    .
4075 -    { call_site: ffffffffa06c7646 } hitcount:        106  bytes_req:       8024
4076 -    { call_site: ffffffffa06cb246 } hitcount:        132  bytes_req:      31680
4077 -    { call_site: ffffffffa06cef7a } hitcount:        132  bytes_req:       2112
4078 -    { call_site: ffffffff8137e399 } hitcount:        132  bytes_req:      23232
4079 -    { call_site: ffffffffa06c941c } hitcount:        185  bytes_req:     171360
4080 -    { call_site: ffffffffa06f2a66 } hitcount:        185  bytes_req:      26640
4081 -    { call_site: ffffffffa036a70e } hitcount:        265  bytes_req:      10600
4082 -    { call_site: ffffffff81325447 } hitcount:        292  bytes_req:        584
4083 -    { call_site: ffffffffa072da3c } hitcount:        446  bytes_req:      60656
4084 -    { call_site: ffffffffa036b1f2 } hitcount:        526  bytes_req:      29456
4085 -    { call_site: ffffffffa0099c06 } hitcount:       1780  bytes_req:      35600
4086 -
4087 -    Totals:
4088 -        Hits: 4775
4089 -        Entries: 46
4090 -        Dropped: 0
4091 -
4092 -  Even that's only marginally more useful - while hex values do look
4093 -  more like addresses, what users are typically more interested in
4094 -  when looking at text addresses are the corresponding symbols
4095 -  instead.  To have an address displayed as a symbolic value,
4096 -  simply append '.sym' or '.sym-offset' to the field name in the
4097 -  trigger:
4098 -
4099 -    # echo 'hist:key=call_site.sym:val=bytes_req' > \
4100 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4101 -
4102 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4103 -    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
4104 -
4105 -    { call_site: [ffffffff810adcb9] syslog_print_all                              } hitcount:          1  bytes_req:       1024
4106 -    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8
4107 -    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7
4108 -    { call_site: [ffffffff8154acbe] usb_alloc_urb                                 } hitcount:          1  bytes_req:        192
4109 -    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7
4110 -    { call_site: [ffffffff811e3a25] __seq_open_private                            } hitcount:          1  bytes_req:         40
4111 -    { call_site: [ffffffff8109524a] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128
4112 -    { call_site: [ffffffff811febd5] fsnotify_alloc_group                          } hitcount:          2  bytes_req:        528
4113 -    { call_site: [ffffffff81440f58] __tty_buffer_request_room                     } hitcount:          2  bytes_req:       2624
4114 -    { call_site: [ffffffff81200ba6] inotify_new_group                             } hitcount:          2  bytes_req:         96
4115 -    { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211]      } hitcount:          2  bytes_req:        464
4116 -    { call_site: [ffffffff81672406] tcp_get_metrics                               } hitcount:          2  bytes_req:        304
4117 -    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128
4118 -    { call_site: [ffffffff81089b05] sched_create_group                            } hitcount:          2  bytes_req:       1424
4119 -    .
4120 -    .
4121 -    .
4122 -    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:       1185  bytes_req:     123240
4123 -    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm]                } hitcount:       1185  bytes_req:     104280
4124 -    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:       1402  bytes_req:     190672
4125 -    { call_site: [ffffffff812891ca] ext4_find_extent                              } hitcount:       1518  bytes_req:     146208
4126 -    { call_site: [ffffffffa029070e] drm_vma_node_allow [drm]                      } hitcount:       1746  bytes_req:      69840
4127 -    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       2021  bytes_req:     792312
4128 -    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       2592  bytes_req:     145152
4129 -    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       2629  bytes_req:     378576
4130 -    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       2629  bytes_req:    3783248
4131 -    { call_site: [ffffffff81325607] apparmor_file_alloc_security                  } hitcount:       5192  bytes_req:      10384
4132 -    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       5529  bytes_req:     110584
4133 -    { call_site: [ffffffff8131ebf7] aa_alloc_task_context                         } hitcount:      21943  bytes_req:     702176
4134 -    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:      55759  bytes_req:    5074265
4135 -
4136 -    Totals:
4137 -        Hits: 109928
4138 -        Entries: 71
4139 -        Dropped: 0
4140 -
4141 -  Because the default sort key above is 'hitcount', the above shows
4142 -  the list of call_sites by increasing hitcount, so that at the bottom
4143 -  we see the functions that made the most kmalloc calls during the
4144 -  run.  If instead we wanted to see the top kmalloc callers in terms
4145 -  of the number of bytes requested rather than the number of calls,
4146 -  and we wanted the top caller to appear at the top, we can use the
4147 -  'sort' parameter, along with the 'descending' modifier:
4148 -
4149 -    # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
4150 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4151 -
4152 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4153 -    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4154 -
4155 -    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       2186  bytes_req:    3397464
4156 -    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       1790  bytes_req:     712176
4157 -    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:       8132  bytes_req:     513135
4158 -    { call_site: [ffffffff811e2a1b] seq_buf_alloc                                 } hitcount:        106  bytes_req:     440128
4159 -    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       2186  bytes_req:     314784
4160 -    { call_site: [ffffffff812891ca] ext4_find_extent                              } hitcount:       2174  bytes_req:     208992
4161 -    { call_site: [ffffffff811ae8e1] __kmalloc                                     } hitcount:          8  bytes_req:     131072
4162 -    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:        859  bytes_req:     116824
4163 -    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       1834  bytes_req:     102704
4164 -    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:        972  bytes_req:     101088
4165 -    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm]                } hitcount:        972  bytes_req:      85536
4166 -    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       3333  bytes_req:      66664
4167 -    { call_site: [ffffffff8137e559] sg_kmalloc                                    } hitcount:        209  bytes_req:      61632
4168 -    .
4169 -    .
4170 -    .
4171 -    { call_site: [ffffffff81095225] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128
4172 -    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128
4173 -    { call_site: [ffffffff812d8406] copy_semundo                                  } hitcount:          2  bytes_req:         48
4174 -    { call_site: [ffffffff81200ba6] inotify_new_group                             } hitcount:          1  bytes_req:         48
4175 -    { call_site: [ffffffffa027121a] drm_getmagic [drm]                            } hitcount:          1  bytes_req:         48
4176 -    { call_site: [ffffffff811e3a25] __seq_open_private                            } hitcount:          1  bytes_req:         40
4177 -    { call_site: [ffffffff811c52f4] bprm_change_interp                            } hitcount:          2  bytes_req:         16
4178 -    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8
4179 -    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7
4180 -    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7
4181 -
4182 -    Totals:
4183 -        Hits: 32133
4184 -        Entries: 81
4185 -        Dropped: 0
4186 -
4187 -  To display the offset and size information in addition to the symbol
4188 -  name, just use 'sym-offset' instead:
4189 -
4190 -    # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
4191 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4192 -
4193 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4194 -    # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
4195 -
4196 -    { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915]                  } hitcount:       4569  bytes_req:    3163720
4197 -    { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915]                      } hitcount:       4569  bytes_req:     657936
4198 -    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915]      } hitcount:       1519  bytes_req:     472936
4199 -    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915]      } hitcount:       3050  bytes_req:     211832
4200 -    { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50                                 } hitcount:         34  bytes_req:     148384
4201 -    { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915]                  } hitcount:       1385  bytes_req:     144040
4202 -    { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0                                   } hitcount:          8  bytes_req:     131072
4203 -    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm]              } hitcount:       1385  bytes_req:     121880
4204 -    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm]                  } hitcount:       1848  bytes_req:     103488
4205 -    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915]            } hitcount:        461  bytes_req:      62696
4206 -    { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm]                      } hitcount:       1541  bytes_req:      61640
4207 -    { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0                                } hitcount:         57  bytes_req:      57456
4208 -    .
4209 -    .
4210 -    .
4211 -    { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0                       } hitcount:          2  bytes_req:        128
4212 -    { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm]                      } hitcount:          3  bytes_req:         96
4213 -    { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0                         } hitcount:          8  bytes_req:         96
4214 -    { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650                            } hitcount:          3  bytes_req:         84
4215 -    { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110                              } hitcount:          1  bytes_req:          8
4216 -    { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid]                     } hitcount:          1  bytes_req:          7
4217 -    { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid]                    } hitcount:          1  bytes_req:          7
4218 -
4219 -    Totals:
4220 -        Hits: 26098
4221 -        Entries: 64
4222 -        Dropped: 0
4223 -
4224 -  We can also add multiple fields to the 'values' parameter.  For
4225 -  example, we might want to see the total number of bytes allocated
4226 -  alongside bytes requested, and display the result sorted by bytes
4227 -  allocated in descending order:
4228 -
4229 -    # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
4230 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4231 -
4232 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4233 -    # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
4234 -
4235 -    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       7403  bytes_req:    4084360  bytes_alloc:    5958016
4236 -    { call_site: [ffffffff811e2a1b] seq_buf_alloc                                 } hitcount:        541  bytes_req:    2213968  bytes_alloc:    2228224
4237 -    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       7404  bytes_req:    1066176  bytes_alloc:    1421568
4238 -    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       1565  bytes_req:     557368  bytes_alloc:    1037760
4239 -    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:       9557  bytes_req:     595778  bytes_alloc:     695744
4240 -    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       5839  bytes_req:     430680  bytes_alloc:     470400
4241 -    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:       2388  bytes_req:     324768  bytes_alloc:     458496
4242 -    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       3911  bytes_req:     219016  bytes_alloc:     250304
4243 -    { call_site: [ffffffff815f8d7b] sk_prot_alloc                                 } hitcount:        235  bytes_req:     236880  bytes_alloc:     240640
4244 -    { call_site: [ffffffff8137e559] sg_kmalloc                                    } hitcount:        557  bytes_req:     169024  bytes_alloc:     221760
4245 -    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       9378  bytes_req:     187548  bytes_alloc:     206312
4246 -    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:       1519  bytes_req:     157976  bytes_alloc:     194432
4247 -    .
4248 -    .
4249 -    .
4250 -    { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach                 } hitcount:          2  bytes_req:        144  bytes_alloc:        192
4251 -    { call_site: [ffffffff81097ee8] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128  bytes_alloc:        128
4252 -    { call_site: [ffffffff8109524a] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128  bytes_alloc:        128
4253 -    { call_site: [ffffffff81095225] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128  bytes_alloc:        128
4254 -    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128  bytes_alloc:        128
4255 -    { call_site: [ffffffff81213e80] load_elf_binary                               } hitcount:          3  bytes_req:         84  bytes_alloc:         96
4256 -    { call_site: [ffffffff81079a2e] kthread_create_on_node                        } hitcount:          1  bytes_req:         56  bytes_alloc:         64
4257 -    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7  bytes_alloc:          8
4258 -    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8  bytes_alloc:          8
4259 -    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7  bytes_alloc:          8
4260 -
4261 -    Totals:
4262 -        Hits: 66598
4263 -        Entries: 65
4264 -        Dropped: 0
4265 -
4266 -  Finally, to finish off our kmalloc example, instead of simply having
4267 -  the hist trigger display symbolic call_sites, we can have the hist
4268 -  trigger additionally display the complete set of kernel stack traces
4269 -  that led to each call_site.  To do that, we simply use the special
4270 -  value 'stacktrace' for the key parameter:
4271 -
4272 -    # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
4273 -           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
4274 -
4275 -  The above trigger will use the kernel stack trace in effect when an
4276 -  event is triggered as the key for the hash table.  This allows the
4277 -  enumeration of every kernel callpath that led up to a particular
4278 -  event, along with a running total of any of the event fields for
4279 -  that event.  Here we tally bytes requested and bytes allocated for
4280 -  every callpath in the system that led up to a kmalloc (in this case
4281 -  every callpath to a kmalloc for a kernel compile):
4282 -
4283 -    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
4284 -    # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
4285 -
4286 -    { stacktrace:
4287 -         __kmalloc_track_caller+0x10b/0x1a0
4288 -         kmemdup+0x20/0x50
4289 -         hidraw_report_event+0x8a/0x120 [hid]
4290 -         hid_report_raw_event+0x3ea/0x440 [hid]
4291 -         hid_input_report+0x112/0x190 [hid]
4292 -         hid_irq_in+0xc2/0x260 [usbhid]
4293 -         __usb_hcd_giveback_urb+0x72/0x120
4294 -         usb_giveback_urb_bh+0x9e/0xe0
4295 -         tasklet_hi_action+0xf8/0x100
4296 -         __do_softirq+0x114/0x2c0
4297 -         irq_exit+0xa5/0xb0
4298 -         do_IRQ+0x5a/0xf0
4299 -         ret_from_intr+0x0/0x30
4300 -         cpuidle_enter+0x17/0x20
4301 -         cpu_startup_entry+0x315/0x3e0
4302 -         rest_init+0x7c/0x80
4303 -    } hitcount:          3  bytes_req:         21  bytes_alloc:         24
4304 -    { stacktrace:
4305 -         __kmalloc_track_caller+0x10b/0x1a0
4306 -         kmemdup+0x20/0x50
4307 -         hidraw_report_event+0x8a/0x120 [hid]
4308 -         hid_report_raw_event+0x3ea/0x440 [hid]
4309 -         hid_input_report+0x112/0x190 [hid]
4310 -         hid_irq_in+0xc2/0x260 [usbhid]
4311 -         __usb_hcd_giveback_urb+0x72/0x120
4312 -         usb_giveback_urb_bh+0x9e/0xe0
4313 -         tasklet_hi_action+0xf8/0x100
4314 -         __do_softirq+0x114/0x2c0
4315 -         irq_exit+0xa5/0xb0
4316 -         do_IRQ+0x5a/0xf0
4317 -         ret_from_intr+0x0/0x30
4318 -    } hitcount:          3  bytes_req:         21  bytes_alloc:         24
4319 -    { stacktrace:
4320 -         kmem_cache_alloc_trace+0xeb/0x150
4321 -         aa_alloc_task_context+0x27/0x40
4322 -         apparmor_cred_prepare+0x1f/0x50
4323 -         security_prepare_creds+0x16/0x20
4324 -         prepare_creds+0xdf/0x1a0
4325 -         SyS_capset+0xb5/0x200
4326 -         system_call_fastpath+0x12/0x6a
4327 -    } hitcount:          1  bytes_req:         32  bytes_alloc:         32
4328 -    .
4329 -    .
4330 -    .
4331 -    { stacktrace:
4332 -         __kmalloc+0x11b/0x1b0
4333 -         i915_gem_execbuffer2+0x6c/0x2c0 [i915]
4334 -         drm_ioctl+0x349/0x670 [drm]
4335 -         do_vfs_ioctl+0x2f0/0x4f0
4336 -         SyS_ioctl+0x81/0xa0
4337 -         system_call_fastpath+0x12/0x6a
4338 -    } hitcount:      17726  bytes_req:   13944120  bytes_alloc:   19593808
4339 -    { stacktrace:
4340 -         __kmalloc+0x11b/0x1b0
4341 -         load_elf_phdrs+0x76/0xa0
4342 -         load_elf_binary+0x102/0x1650
4343 -         search_binary_handler+0x97/0x1d0
4344 -         do_execveat_common.isra.34+0x551/0x6e0
4345 -         SyS_execve+0x3a/0x50
4346 -         return_from_execve+0x0/0x23
4347 -    } hitcount:      33348  bytes_req:   17152128  bytes_alloc:   20226048
4348 -    { stacktrace:
4349 -         kmem_cache_alloc_trace+0xeb/0x150
4350 -         apparmor_file_alloc_security+0x27/0x40
4351 -         security_file_alloc+0x16/0x20
4352 -         get_empty_filp+0x93/0x1c0
4353 -         path_openat+0x31/0x5f0
4354 -         do_filp_open+0x3a/0x90
4355 -         do_sys_open+0x128/0x220
4356 -         SyS_open+0x1e/0x20
4357 -         system_call_fastpath+0x12/0x6a
4358 -    } hitcount:    4766422  bytes_req:    9532844  bytes_alloc:   38131376
4359 -    { stacktrace:
4360 -         __kmalloc+0x11b/0x1b0
4361 -         seq_buf_alloc+0x1b/0x50
4362 -         seq_read+0x2cc/0x370
4363 -         proc_reg_read+0x3d/0x80
4364 -         __vfs_read+0x28/0xe0
4365 -         vfs_read+0x86/0x140
4366 -         SyS_read+0x46/0xb0
4367 -         system_call_fastpath+0x12/0x6a
4368 -    } hitcount:      19133  bytes_req:   78368768  bytes_alloc:   78368768
4369 -
4370 -    Totals:
4371 -        Hits: 6085872
4372 -        Entries: 253
4373 -        Dropped: 0
4374 -
4375 -  If you key a hist trigger on common_pid, for example in order to
4376 -  gather and display sorted totals for each process, you can use the
4377 -  special .execname modifier to display the executable names for the
4378 -  processes in the table rather than raw pids.  The example below
4379 -  keeps a per-process sum of total bytes read:
4380 -
4381 -    # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
4382 -           /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
4383 -
4384 -    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
4385 -    # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
4386 -
4387 -    { common_pid: gnome-terminal  [      3196] } hitcount:        280  count:    1093512
4388 -    { common_pid: Xorg            [      1309] } hitcount:        525  count:     256640
4389 -    { common_pid: compiz          [      2889] } hitcount:         59  count:     254400
4390 -    { common_pid: bash            [      8710] } hitcount:          3  count:      66369
4391 -    { common_pid: dbus-daemon-lau [      8703] } hitcount:         49  count:      47739
4392 -    { common_pid: irqbalance      [      1252] } hitcount:         27  count:      27648
4393 -    { common_pid: 01ifupdown      [      8705] } hitcount:          3  count:      17216
4394 -    { common_pid: dbus-daemon     [       772] } hitcount:         10  count:      12396
4395 -    { common_pid: Socket Thread   [      8342] } hitcount:         11  count:      11264
4396 -    { common_pid: nm-dhcp-client. [      8701] } hitcount:          6  count:       7424
4397 -    { common_pid: gmain           [      1315] } hitcount:         18  count:       6336
4398 -    .
4399 -    .
4400 -    .
4401 -    { common_pid: postgres        [      1892] } hitcount:          2  count:         32
4402 -    { common_pid: postgres        [      1891] } hitcount:          2  count:         32
4403 -    { common_pid: gmain           [      8704] } hitcount:          2  count:         32
4404 -    { common_pid: upstart-dbus-br [      2740] } hitcount:         21  count:         21
4405 -    { common_pid: nm-dispatcher.a [      8696] } hitcount:          1  count:         16
4406 -    { common_pid: indicator-datet [      2904] } hitcount:          1  count:         16
4407 -    { common_pid: gdbus           [      2998] } hitcount:          1  count:         16
4408 -    { common_pid: rtkit-daemon    [      2052] } hitcount:          1  count:          8
4409 -    { common_pid: init            [         1] } hitcount:          2  count:          2
4410 -
4411 -    Totals:
4412 -        Hits: 2116
4413 -        Entries: 51
4414 -        Dropped: 0
4415 -
4416 -  Similarly, if you key a hist trigger on syscall id, for example to
4417 -  gather and display a list of systemwide syscall hits, you can use
4418 -  the special .syscall modifier to display the syscall names rather
4419 -  than raw ids.  The example below keeps a running total of syscall
4420 -  counts for the system during the run:
4421 -
4422 -    # echo 'hist:key=id.syscall:val=hitcount' > \
4423 -           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4424 -
4425 -    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4426 -    # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
4427 -
4428 -    { id: sys_fsync                     [ 74] } hitcount:          1
4429 -    { id: sys_newuname                  [ 63] } hitcount:          1
4430 -    { id: sys_prctl                     [157] } hitcount:          1
4431 -    { id: sys_statfs                    [137] } hitcount:          1
4432 -    { id: sys_symlink                   [ 88] } hitcount:          1
4433 -    { id: sys_sendmmsg                  [307] } hitcount:          1
4434 -    { id: sys_semctl                    [ 66] } hitcount:          1
4435 -    { id: sys_readlink                  [ 89] } hitcount:          3
4436 -    { id: sys_bind                      [ 49] } hitcount:          3
4437 -    { id: sys_getsockname               [ 51] } hitcount:          3
4438 -    { id: sys_unlink                    [ 87] } hitcount:          3
4439 -    { id: sys_rename                    [ 82] } hitcount:          4
4440 -    { id: unknown_syscall               [ 58] } hitcount:          4
4441 -    { id: sys_connect                   [ 42] } hitcount:          4
4442 -    { id: sys_getpid                    [ 39] } hitcount:          4
4443 -    .
4444 -    .
4445 -    .
4446 -    { id: sys_rt_sigprocmask            [ 14] } hitcount:        952
4447 -    { id: sys_futex                     [202] } hitcount:       1534
4448 -    { id: sys_write                     [  1] } hitcount:       2689
4449 -    { id: sys_setitimer                 [ 38] } hitcount:       2797
4450 -    { id: sys_read                      [  0] } hitcount:       3202
4451 -    { id: sys_select                    [ 23] } hitcount:       3773
4452 -    { id: sys_writev                    [ 20] } hitcount:       4531
4453 -    { id: sys_poll                      [  7] } hitcount:       8314
4454 -    { id: sys_recvmsg                   [ 47] } hitcount:      13738
4455 -    { id: sys_ioctl                     [ 16] } hitcount:      21843
4456 -
4457 -    Totals:
4458 -        Hits: 67612
4459 -        Entries: 72
4460 -        Dropped: 0
4461 -
4462 -    The syscall counts above provide a rough overall picture of system
4463 -    call activity on the system; we can see for example that the most
4464 -    popular system call on this system was the 'sys_ioctl' system call.
4465 -
4466 -    We can use 'compound' keys to refine that number and provide some
4467 -    further insight as to which processes exactly contribute to the
4468 -    overall ioctl count.
4469 -
4470 -    The command below keeps a hitcount for every unique combination of
4471 -    system call id and pid - the end result is essentially a table
4472 -    that keeps a per-pid sum of system call hits.  The results are
4473 -    sorted using the system call id as the primary key, and the
4474 -    hitcount sum as the secondary key:
4475 -
4476 -    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
4477 -           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4478 -
4479 -    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4480 -    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
4481 -
4482 -    { id: sys_read                      [  0], common_pid: rtkit-daemon    [      1877] } hitcount:          1
4483 -    { id: sys_read                      [  0], common_pid: gdbus           [      2976] } hitcount:          1
4484 -    { id: sys_read                      [  0], common_pid: console-kit-dae [      3400] } hitcount:          1
4485 -    { id: sys_read                      [  0], common_pid: postgres        [      1865] } hitcount:          1
4486 -    { id: sys_read                      [  0], common_pid: deja-dup-monito [      3543] } hitcount:          2
4487 -    { id: sys_read                      [  0], common_pid: NetworkManager  [       890] } hitcount:          2
4488 -    { id: sys_read                      [  0], common_pid: evolution-calen [      3048] } hitcount:          2
4489 -    { id: sys_read                      [  0], common_pid: postgres        [      1864] } hitcount:          2
4490 -    { id: sys_read                      [  0], common_pid: nm-applet       [      3022] } hitcount:          2
4491 -    { id: sys_read                      [  0], common_pid: whoopsie        [      1212] } hitcount:          2
4492 -    .
4493 -    .
4494 -    .
4495 -    { id: sys_ioctl                     [ 16], common_pid: bash            [      8479] } hitcount:          1
4496 -    { id: sys_ioctl                     [ 16], common_pid: bash            [      3472] } hitcount:         12
4497 -    { id: sys_ioctl                     [ 16], common_pid: gnome-terminal  [      3199] } hitcount:         16
4498 -    { id: sys_ioctl                     [ 16], common_pid: Xorg            [      1267] } hitcount:       1808
4499 -    { id: sys_ioctl                     [ 16], common_pid: compiz          [      2994] } hitcount:       5580
4500 -    .
4501 -    .
4502 -    .
4503 -    { id: sys_waitid                    [247], common_pid: upstart-dbus-br [      2690] } hitcount:          3
4504 -    { id: sys_waitid                    [247], common_pid: upstart-dbus-br [      2688] } hitcount:         16
4505 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [       975] } hitcount:          2
4506 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3204] } hitcount:          4
4507 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      2888] } hitcount:          4
4508 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3003] } hitcount:          4
4509 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      2873] } hitcount:          4
4510 -    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3196] } hitcount:          6
4511 -    { id: sys_openat                    [257], common_pid: java            [      2623] } hitcount:          2
4512 -    { id: sys_eventfd2                  [290], common_pid: ibus-ui-gtk3    [      2760] } hitcount:          4
4513 -    { id: sys_eventfd2                  [290], common_pid: compiz          [      2994] } hitcount:          6
4514 -
4515 -    Totals:
4516 -        Hits: 31536
4517 -        Entries: 323
4518 -        Dropped: 0
4519 -
4520 -    The above list does give us a breakdown of the ioctl syscall by
4521 -    pid, but it also gives us quite a bit more than that, which we
4522 -    don't really care about at the moment.  Since we know the syscall
4523 -    id for sys_ioctl (16, displayed next to the sys_ioctl name), we
4524 -    can use that to filter out all the other syscalls:
4525 -
4526 -    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
4527 -           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
4528 -
4529 -    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
4530 -    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
4531 -
4532 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2769] } hitcount:          1
4533 -    { id: sys_ioctl                     [ 16], common_pid: evolution-addre [      8571] } hitcount:          1
4534 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      3003] } hitcount:          1
4535 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2781] } hitcount:          1
4536 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2829] } hitcount:          1
4537 -    { id: sys_ioctl                     [ 16], common_pid: bash            [      8726] } hitcount:          1
4538 -    { id: sys_ioctl                     [ 16], common_pid: bash            [      8508] } hitcount:          1
4539 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2970] } hitcount:          1
4540 -    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2768] } hitcount:          1
4541 -    .
4542 -    .
4543 -    .
4544 -    { id: sys_ioctl                     [ 16], common_pid: pool            [      8559] } hitcount:         45
4545 -    { id: sys_ioctl                     [ 16], common_pid: pool            [      8555] } hitcount:         48
4546 -    { id: sys_ioctl                     [ 16], common_pid: pool            [      8551] } hitcount:         48
4547 -    { id: sys_ioctl                     [ 16], common_pid: avahi-daemon    [       896] } hitcount:         66
4548 -    { id: sys_ioctl                     [ 16], common_pid: Xorg            [      1267] } hitcount:      26674
4549 -    { id: sys_ioctl                     [ 16], common_pid: compiz          [      2994] } hitcount:      73443
4550 -
4551 -    Totals:
4552 -        Hits: 101162
4553 -        Entries: 103
4554 -        Dropped: 0
4555 -
4556 -    The above output shows that 'compiz' and 'Xorg' are far and away
4557 -    the heaviest ioctl callers (which might lead to questions about
4558 -    whether they really need to be making all those calls and to
4559 -    possible avenues for further investigation).
4560 -
4561 -    The compound key examples used a key and a sum value (hitcount) to
4562 -    sort the output, but we can just as easily use two keys instead.
4563 -    Here's an example where we use a compound key composed of the
4564 -    common_pid and size event fields.  Sorting with pid as the primary
4565 -    key and 'size' as the secondary key allows us to display an
4566 -    ordered summary of the recvfrom sizes, with counts, received by
4567 -    each process:
4568 -
4569 -    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
4570 -           /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
4571 -
4572 -    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
4573 -    # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
4574 -
4575 -    { common_pid: smbd            [       784], size:          4 } hitcount:          1
4576 -    { common_pid: dnsmasq         [      1412], size:       4096 } hitcount:        672
4577 -    { common_pid: postgres        [      1796], size:       1000 } hitcount:          6
4578 -    { common_pid: postgres        [      1867], size:       1000 } hitcount:         10
4579 -    { common_pid: bamfdaemon      [      2787], size:         28 } hitcount:          2
4580 -    { common_pid: bamfdaemon      [      2787], size:      14360 } hitcount:          1
4581 -    { common_pid: compiz          [      2994], size:          8 } hitcount:          1
4582 -    { common_pid: compiz          [      2994], size:         20 } hitcount:         11
4583 -    { common_pid: gnome-terminal  [      3199], size:          4 } hitcount:          2
4584 -    { common_pid: firefox         [      8817], size:          4 } hitcount:          1
4585 -    { common_pid: firefox         [      8817], size:          8 } hitcount:          5
4586 -    { common_pid: firefox         [      8817], size:        588 } hitcount:          2
4587 -    { common_pid: firefox         [      8817], size:        628 } hitcount:          1
4588 -    { common_pid: firefox         [      8817], size:       6944 } hitcount:          1
4589 -    { common_pid: firefox         [      8817], size:     408880 } hitcount:          2
4590 -    { common_pid: firefox         [      8822], size:          8 } hitcount:          2
4591 -    { common_pid: firefox         [      8822], size:        160 } hitcount:          2
4592 -    { common_pid: firefox         [      8822], size:        320 } hitcount:          2
4593 -    { common_pid: firefox         [      8822], size:        352 } hitcount:          1
4594 -    .
4595 -    .
4596 -    .
4597 -    { common_pid: pool            [      8923], size:       1960 } hitcount:         10
4598 -    { common_pid: pool            [      8923], size:       2048 } hitcount:         10
4599 -    { common_pid: pool            [      8924], size:       1960 } hitcount:         10
4600 -    { common_pid: pool            [      8924], size:       2048 } hitcount:         10
4601 -    { common_pid: pool            [      8928], size:       1964 } hitcount:          4
4602 -    { common_pid: pool            [      8928], size:       1965 } hitcount:          2
4603 -    { common_pid: pool            [      8928], size:       2048 } hitcount:          6
4604 -    { common_pid: pool            [      8929], size:       1982 } hitcount:          1
4605 -    { common_pid: pool            [      8929], size:       2048 } hitcount:          1
4606 -
4607 -    Totals:
4608 -        Hits: 2016
4609 -        Entries: 224
4610 -        Dropped: 0
4611 -
4612 -  The above example also illustrates the fact that although a compound
4613 -  key is treated as a single entity for hashing purposes, the sub-keys
4614 -  it's composed of can be accessed independently.
4615 -
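     -  As a sketch of what that allows (the choice of sub-key and the
     -  'descending' modifier here are just illustrative), the same
     -  compound-key table could instead be sorted on the 'size' sub-key
     -  alone, listing the entries by received size regardless of pid:
     -
     -    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size.descending' > \
     -           /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
     -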
4616 -  The next example uses a string field as the hash key and
4617 -  demonstrates how you can manually pause and continue a hist trigger.
4618 -  In this example, we'll aggregate fork counts; since we don't expect
4619 -  a large number of entries in the hash table, we'll drop its size to
4620 -  a much smaller number, say 256:
4621 -
4622 -    # echo 'hist:key=child_comm:val=hitcount:size=256' > \
4623 -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4624 -
4625 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4626 -    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4627 -
4628 -    { child_comm: dconf worker                        } hitcount:          1
4629 -    { child_comm: ibus-daemon                         } hitcount:          1
4630 -    { child_comm: whoopsie                            } hitcount:          1
4631 -    { child_comm: smbd                                } hitcount:          1
4632 -    { child_comm: gdbus                               } hitcount:          1
4633 -    { child_comm: kthreadd                            } hitcount:          1
4634 -    { child_comm: dconf worker                        } hitcount:          1
4635 -    { child_comm: evolution-alarm                     } hitcount:          2
4636 -    { child_comm: Socket Thread                       } hitcount:          2
4637 -    { child_comm: postgres                            } hitcount:          2
4638 -    { child_comm: bash                                } hitcount:          3
4639 -    { child_comm: compiz                              } hitcount:          3
4640 -    { child_comm: evolution-sourc                     } hitcount:          4
4641 -    { child_comm: dhclient                            } hitcount:          4
4642 -    { child_comm: pool                                } hitcount:          5
4643 -    { child_comm: nm-dispatcher.a                     } hitcount:          8
4644 -    { child_comm: firefox                             } hitcount:          8
4645 -    { child_comm: dbus-daemon                         } hitcount:          8
4646 -    { child_comm: glib-pacrunner                      } hitcount:         10
4647 -    { child_comm: evolution                           } hitcount:         23
4648 -
4649 -    Totals:
4650 -        Hits: 89
4651 -        Entries: 20
4652 -        Dropped: 0
4653 -
4654 -  If we want to pause the hist trigger, we can simply append :pause to
4655 -  the command that started the trigger.  Notice that the trigger info
4656 -  displays as [paused]:
4657 -
4658 -    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
4659 -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4660 -
4661 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4662 -    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
4663 -
4664 -    { child_comm: dconf worker                        } hitcount:          1
4665 -    { child_comm: kthreadd                            } hitcount:          1
4666 -    { child_comm: dconf worker                        } hitcount:          1
4667 -    { child_comm: gdbus                               } hitcount:          1
4668 -    { child_comm: ibus-daemon                         } hitcount:          1
4669 -    { child_comm: Socket Thread                       } hitcount:          2
4670 -    { child_comm: evolution-alarm                     } hitcount:          2
4671 -    { child_comm: smbd                                } hitcount:          2
4672 -    { child_comm: bash                                } hitcount:          3
4673 -    { child_comm: whoopsie                            } hitcount:          3
4674 -    { child_comm: compiz                              } hitcount:          3
4675 -    { child_comm: evolution-sourc                     } hitcount:          4
4676 -    { child_comm: pool                                } hitcount:          5
4677 -    { child_comm: postgres                            } hitcount:          6
4678 -    { child_comm: firefox                             } hitcount:          8
4679 -    { child_comm: dhclient                            } hitcount:         10
4680 -    { child_comm: emacs                               } hitcount:         12
4681 -    { child_comm: dbus-daemon                         } hitcount:         20
4682 -    { child_comm: nm-dispatcher.a                     } hitcount:         20
4683 -    { child_comm: evolution                           } hitcount:         35
4684 -    { child_comm: glib-pacrunner                      } hitcount:         59
4685 -
4686 -    Totals:
4687 -        Hits: 199
4688 -        Entries: 21
4689 -        Dropped: 0
4690 -
4691 -  To manually continue having the trigger aggregate events, append
4692 -  :cont instead.  Notice that the trigger info displays as [active]
4693 -  again, and the data has changed:
4694 -
4695 -    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
4696 -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
4697 -
4698 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
4699 -    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
4700 -
4701 -    { child_comm: dconf worker                        } hitcount:          1
4702 -    { child_comm: dconf worker                        } hitcount:          1
4703 -    { child_comm: kthreadd                            } hitcount:          1
4704 -    { child_comm: gdbus                               } hitcount:          1
4705 -    { child_comm: ibus-daemon                         } hitcount:          1
4706 -    { child_comm: Socket Thread                       } hitcount:          2
4707 -    { child_comm: evolution-alarm                     } hitcount:          2
4708 -    { child_comm: smbd                                } hitcount:          2
4709 -    { child_comm: whoopsie                            } hitcount:          3
4710 -    { child_comm: compiz                              } hitcount:          3
4711 -    { child_comm: evolution-sourc                     } hitcount:          4
4712 -    { child_comm: bash                                } hitcount:          5
4713 -    { child_comm: pool                                } hitcount:          5
4714 -    { child_comm: postgres                            } hitcount:          6
4715 -    { child_comm: firefox                             } hitcount:          8
4716 -    { child_comm: dhclient                            } hitcount:         11
4717 -    { child_comm: emacs                               } hitcount:         12
4718 -    { child_comm: dbus-daemon                         } hitcount:         22
4719 -    { child_comm: nm-dispatcher.a                     } hitcount:         22
4720 -    { child_comm: evolution                           } hitcount:         35
4721 -    { child_comm: glib-pacrunner                      } hitcount:         59
4722 -
4723 -    Totals:
4724 -        Hits: 206
4725 -        Entries: 21
4726 -        Dropped: 0
4727 -
4728 -  The previous example showed how to start and stop a hist trigger by
4729 -  appending ':pause' and ':cont' to the hist trigger command.  A
4730 -  hist trigger can also be created in a paused state by appending
4731 -  ':pause' to the initial trigger command.  This allows you to
4732 -  start the trigger only when you're ready to start collecting data
4733 -  and not before.  For example, you could start the trigger in a
4734 -  paused state, then unpause it and do something you want to measure,
4735 -  then pause the trigger again when done.
4736 -
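     -  Sketched with the fork-count trigger from the previous example (and
     -  assuming no identical trigger is already set on the event), that
     -  manual sequence would look something like the following, where the
     -  first command creates the trigger already paused and the later ones
     -  append ':cont' and ':pause' to it:
     -
     -    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
     -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
     -
     -    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
     -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
     -
     -    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
     -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
     -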
4737 -  Of course, doing this manually can be difficult and error-prone, but
4738 -  it is possible to automatically start and stop a hist trigger based
4739 -  on some condition, via the enable_hist and disable_hist triggers.
4740 -
4741 -  For example, suppose we wanted to take a look at the relative
4742 -  weights in terms of skb length for each callpath that leads to a
4743 -  netif_receive_skb event when downloading a decent-sized file using
4744 -  wget.
4745 -
4746 -  First we set up an initially paused stacktrace trigger on the
4747 -  netif_receive_skb event:
4748 -
4749 -    # echo 'hist:key=stacktrace:vals=len:pause' > \
4750 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4751 -
4752 -  Next, we set up an 'enable_hist' trigger on the sched_process_exec
4753 -  event, with an 'if filename==/usr/bin/wget' filter.  The effect of
4754 -  this new trigger is that it will 'unpause' the hist trigger we just
4755 -  set up on netif_receive_skb if and only if it sees a
4756 -  sched_process_exec event with a filename of '/usr/bin/wget'.  When
4757 -  that happens, all netif_receive_skb events are aggregated into a
4758 -  hash table keyed on stacktrace:
4759 -
4760 -    # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
4761 -           /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4762 -
4763 -  The aggregation continues until the netif_receive_skb hist trigger is
4764 -  paused again, which is what the following disable_hist event does by
4765 -  creating a similar setup on the sched_process_exit event, using the
4766 -  filter 'comm==wget':
4767 -
4768 -    # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
4769 -           /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4770 -
4771 -  Whenever a process exits and the comm field of the disable_hist
4772 -  trigger filter matches 'comm==wget', the netif_receive_skb hist
4773 -  trigger is disabled.
4774 -
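     -  Whether the enable_hist trigger has fired yet can be checked at any
     -  time by reading the netif_receive_skb 'hist' file; the trigger info
     -  line switches between '[paused]' and '[active]' accordingly, showing
     -  something along these lines before the wget starts:
     -
     -    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
     -    # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
     -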
4775 -  The overall effect is that netif_receive_skb events are aggregated
4776 -  into the hash table for only the duration of the wget.  Executing a
4777 -  wget command and then listing the 'hist' file will display the
4778 -  output generated by the wget command:
4779 -
4780 -    $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4781 -
4782 -    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4783 -    # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4784 -
4785 -    { stacktrace:
4786 -         __netif_receive_skb_core+0x46d/0x990
4787 -         __netif_receive_skb+0x18/0x60
4788 -         netif_receive_skb_internal+0x23/0x90
4789 -         napi_gro_receive+0xc8/0x100
4790 -         ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4791 -         ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4792 -         ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4793 -         ieee80211_rx+0x31d/0x900 [mac80211]
4794 -         iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4795 -         iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4796 -         iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4797 -         irq_thread_fn+0x20/0x50
4798 -         irq_thread+0x11f/0x150
4799 -         kthread+0xd2/0xf0
4800 -         ret_from_fork+0x42/0x70
4801 -    } hitcount:         85  len:      28884
4802 -    { stacktrace:
4803 -         __netif_receive_skb_core+0x46d/0x990
4804 -         __netif_receive_skb+0x18/0x60
4805 -         netif_receive_skb_internal+0x23/0x90
4806 -         napi_gro_complete+0xa4/0xe0
4807 -         dev_gro_receive+0x23a/0x360
4808 -         napi_gro_receive+0x30/0x100
4809 -         ieee80211_deliver_skb+0xd6/0x270 [mac80211]
4810 -         ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
4811 -         ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
4812 -         ieee80211_rx+0x31d/0x900 [mac80211]
4813 -         iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
4814 -         iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
4815 -         iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
4816 -         irq_thread_fn+0x20/0x50
4817 -         irq_thread+0x11f/0x150
4818 -         kthread+0xd2/0xf0
4819 -    } hitcount:         98  len:     664329
4820 -    { stacktrace:
4821 -         __netif_receive_skb_core+0x46d/0x990
4822 -         __netif_receive_skb+0x18/0x60
4823 -         process_backlog+0xa8/0x150
4824 -         net_rx_action+0x15d/0x340
4825 -         __do_softirq+0x114/0x2c0
4826 -         do_softirq_own_stack+0x1c/0x30
4827 -         do_softirq+0x65/0x70
4828 -         __local_bh_enable_ip+0xb5/0xc0
4829 -         ip_finish_output+0x1f4/0x840
4830 -         ip_output+0x6b/0xc0
4831 -         ip_local_out_sk+0x31/0x40
4832 -         ip_send_skb+0x1a/0x50
4833 -         udp_send_skb+0x173/0x2a0
4834 -         udp_sendmsg+0x2bf/0x9f0
4835 -         inet_sendmsg+0x64/0xa0
4836 -         sock_sendmsg+0x3d/0x50
4837 -    } hitcount:        115  len:      13030
4838 -    { stacktrace:
4839 -         __netif_receive_skb_core+0x46d/0x990
4840 -         __netif_receive_skb+0x18/0x60
4841 -         netif_receive_skb_internal+0x23/0x90
4842 -         napi_gro_complete+0xa4/0xe0
4843 -         napi_gro_flush+0x6d/0x90
4844 -         iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
4845 -         irq_thread_fn+0x20/0x50
4846 -         irq_thread+0x11f/0x150
4847 -         kthread+0xd2/0xf0
4848 -         ret_from_fork+0x42/0x70
4849 -    } hitcount:        934  len:    5512212
4850 -
4851 -    Totals:
4852 -        Hits: 1232
4853 -        Entries: 4
4854 -        Dropped: 0
4855 -
4856 -  The above shows all the netif_receive_skb callpaths and their total
4857 -  lengths for the duration of the wget command.
4858 -
4859 -  The 'clear' hist trigger param can be used to clear the hash table.
4860 -  Suppose we wanted to try another run of the previous example but
4861 -  this time also wanted to see the complete list of events that went
4862 -  into the histogram.  In order to avoid having to set everything up
4863 -  again, we can just clear the histogram first:
4864 -
4865 -    # echo 'hist:key=stacktrace:vals=len:clear' >> \
4866 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4867 -
4868 -  Just to verify that it is in fact cleared, here's what we now see in
4869 -  the hist file:
4870 -
4871 -    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4872 -    # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
4873 -
4874 -    Totals:
4875 -        Hits: 0
4876 -        Entries: 0
4877 -        Dropped: 0
4878 -
4879 -  Since we want to see the detailed list of every netif_receive_skb
4880 -  event occurring during the new run, which are in fact the same
4881 -  events being aggregated into the hash table, we add some additional
4882 -  'enable_event' events to the triggering sched_process_exec and
4883 -  sched_process_exit events as such:
4884 -
4885 -    # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
4886 -           /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4887 -
4888 -    # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
4889 -           /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4890 -
4891 -  If you read the trigger files for the sched_process_exec and
4892 -  sched_process_exit triggers, you should see two triggers for each:
4893 -  one enabling/disabling the hist aggregation and the other
4894 -  enabling/disabling the logging of events:
4895 -
4896 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
4897 -    enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4898 -    enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
4899 -
4900 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
4901 -    enable_event:net:netif_receive_skb:unlimited if comm==wget
4902 -    disable_hist:net:netif_receive_skb:unlimited if comm==wget
4903 -
4904 -  In other words, whenever either of the sched_process_exec or
4905 -  sched_process_exit events is hit and matches 'wget', it enables or
4906 -  disables both the histogram and the event log, and what you end up
4907 -  with is a hash table and set of events just covering the specified
4908 -  duration.  Run the wget command again:
4909 -
4910 -    $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
4911 -
4912 -  Displaying the 'hist' file should show something similar to what you
4913 -  saw in the last run, but this time you should also see the
4914 -  individual events in the trace file:
4915 -
4916 -    # cat /sys/kernel/debug/tracing/trace
4917 -
4918 -    # tracer: nop
4919 -    #
4920 -    # entries-in-buffer/entries-written: 183/1426   #P:4
4921 -    #
4922 -    #                              _-----=> irqs-off
4923 -    #                             / _----=> need-resched
4924 -    #                            | / _---=> hardirq/softirq
4925 -    #                            || / _--=> preempt-depth
4926 -    #                            ||| /     delay
4927 -    #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
4928 -    #              | |       |   ||||       |         |
4929 -                wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
4930 -                wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
4931 -             dnsmasq-1382  [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
4932 -             dnsmasq-1382  [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
4933 -    ##### CPU 2 buffer started ####
4934 -      irq/29-iwlwifi-559   [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
4935 -      irq/29-iwlwifi-559   [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
4936 -      irq/29-iwlwifi-559   [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
4937 -      irq/29-iwlwifi-559   [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
4938 -      irq/29-iwlwifi-559   [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
4939 -    .
4940 -    .
4941 -    .
4942 -
4943 -  The following example demonstrates how multiple hist triggers can be
4944 -  attached to a given event.  This capability can be useful for
4945 -  creating a set of different summaries derived from the same set of
4946 -  events, or for comparing the effects of different filters, among
4947 -  other things.
4948 -
4949 -    # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
4950 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4951 -    # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
4952 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4953 -    # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
4954 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4955 -    # echo 'hist:keys=skbaddr.hex:vals=len' >> \
4956 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4957 -    # echo 'hist:keys=len:vals=common_preempt_count' >> \
4958 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
4959 -
4960 -  The above set of commands create four triggers differing only in
4961 -  their filters, along with a completely different though fairly
4962 -  nonsensical trigger.  Note that in order to append multiple hist
4963 -  triggers to the same file, you should use the '>>' operator to
4964 -  append them ('>' will also add the new hist trigger, but will remove
4965 -  any existing hist triggers beforehand).
4966 -
4967 -  Displaying the contents of the 'hist' file for the event shows the
4968 -  contents of all five histograms:
4969 -
4970 -    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
4971 -
4972 -    # event histogram
4973 -    #
4974 -    # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
4975 -    #
4976 -
4977 -    { len:        176 } hitcount:          1  common_preempt_count:          0
4978 -    { len:        223 } hitcount:          1  common_preempt_count:          0
4979 -    { len:       4854 } hitcount:          1  common_preempt_count:          0
4980 -    { len:        395 } hitcount:          1  common_preempt_count:          0
4981 -    { len:        177 } hitcount:          1  common_preempt_count:          0
4982 -    { len:        446 } hitcount:          1  common_preempt_count:          0
4983 -    { len:       1601 } hitcount:          1  common_preempt_count:          0
4984 -    .
4985 -    .
4986 -    .
4987 -    { len:       1280 } hitcount:         66  common_preempt_count:          0
4988 -    { len:        116 } hitcount:         81  common_preempt_count:         40
4989 -    { len:        708 } hitcount:        112  common_preempt_count:          0
4990 -    { len:         46 } hitcount:        221  common_preempt_count:          0
4991 -    { len:       1264 } hitcount:        458  common_preempt_count:          0
4992 -
4993 -    Totals:
4994 -        Hits: 1428
4995 -        Entries: 147
4996 -        Dropped: 0
4997 -
4998 -
4999 -    # event histogram
5000 -    #
5001 -    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5002 -    #
5003 -
5004 -    { skbaddr: ffff8800baee5e00 } hitcount:          1  len:        130
5005 -    { skbaddr: ffff88005f3d5600 } hitcount:          1  len:       1280
5006 -    { skbaddr: ffff88005f3d4900 } hitcount:          1  len:       1280
5007 -    { skbaddr: ffff88009fed6300 } hitcount:          1  len:        115
5008 -    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:        115
5009 -    { skbaddr: ffff88008cdb1900 } hitcount:          1  len:         46
5010 -    { skbaddr: ffff880064b5ef00 } hitcount:          1  len:        118
5011 -    { skbaddr: ffff880044e3c700 } hitcount:          1  len:         60
5012 -    { skbaddr: ffff880100065900 } hitcount:          1  len:         46
5013 -    { skbaddr: ffff8800d46bd500 } hitcount:          1  len:        116
5014 -    { skbaddr: ffff88005f3d5f00 } hitcount:          1  len:       1280
5015 -    { skbaddr: ffff880100064700 } hitcount:          1  len:        365
5016 -    { skbaddr: ffff8800badb6f00 } hitcount:          1  len:         60
5017 -    .
5018 -    .
5019 -    .
5020 -    { skbaddr: ffff88009fe0be00 } hitcount:         27  len:      24677
5021 -    { skbaddr: ffff88009fe0a400 } hitcount:         27  len:      23052
5022 -    { skbaddr: ffff88009fe0b700 } hitcount:         31  len:      25589
5023 -    { skbaddr: ffff88009fe0b600 } hitcount:         32  len:      27326
5024 -    { skbaddr: ffff88006a462800 } hitcount:         68  len:      71678
5025 -    { skbaddr: ffff88006a463700 } hitcount:         70  len:      72678
5026 -    { skbaddr: ffff88006a462b00 } hitcount:         71  len:      77589
5027 -    { skbaddr: ffff88006a463600 } hitcount:         73  len:      71307
5028 -    { skbaddr: ffff88006a462200 } hitcount:         81  len:      81032
5029 -
5030 -    Totals:
5031 -        Hits: 1451
5032 -        Entries: 318
5033 -        Dropped: 0
5034 -
5035 -
5036 -    # event histogram
5037 -    #
5038 -    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
5039 -    #
5040 -
5041 -
5042 -    Totals:
5043 -        Hits: 0
5044 -        Entries: 0
5045 -        Dropped: 0
5046 -
5047 -
5048 -    # event histogram
5049 -    #
5050 -    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
5051 -    #
5052 -
5053 -    { skbaddr: ffff88009fd2c300 } hitcount:          1  len:       7212
5054 -    { skbaddr: ffff8800d2bcce00 } hitcount:          1  len:       7212
5055 -    { skbaddr: ffff8800d2bcd700 } hitcount:          1  len:       7212
5056 -    { skbaddr: ffff8800d2bcda00 } hitcount:          1  len:      21492
5057 -    { skbaddr: ffff8800ae2e2d00 } hitcount:          1  len:       7212
5058 -    { skbaddr: ffff8800d2bcdb00 } hitcount:          1  len:       7212
5059 -    { skbaddr: ffff88006a4df500 } hitcount:          1  len:       4854
5060 -    { skbaddr: ffff88008ce47b00 } hitcount:          1  len:      18636
5061 -    { skbaddr: ffff8800ae2e2200 } hitcount:          1  len:      12924
5062 -    { skbaddr: ffff88005f3e1000 } hitcount:          1  len:       4356
5063 -    { skbaddr: ffff8800d2bcdc00 } hitcount:          2  len:      24420
5064 -    { skbaddr: ffff8800d2bcc200 } hitcount:          2  len:      12996
5065 -
5066 -    Totals:
5067 -        Hits: 14
5068 -        Entries: 12
5069 -        Dropped: 0
5070 -
5071 -
5072 -    # event histogram
5073 -    #
5074 -    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
5075 -    #
5076 -
5077 -
5078 -    Totals:
5079 -        Hits: 0
5080 -        Entries: 0
5081 -        Dropped: 0
5082 -
5083 -  Named triggers can be used to have triggers share a common set of
5084 -  histogram data.  This capability is mostly useful for combining the
5085 -  output of events generated by tracepoints contained inside inline
5086 -  functions, but names can be used in a hist trigger on any event.
5087 -  For example, these two triggers when hit will update the same 'len'
5088 -  field in the shared 'foo' histogram data:
5089 -
5090 -    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5091 -           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5092 -    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
5093 -           /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5094 -
5095 -  You can see that they're updating common histogram data by reading
5096 -  each event's hist files at the same time:
5097 -
5098 -    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
5099 -      cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5100 -
5101 -    # event histogram
5102 -    #
5103 -    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5104 -    #
5105 -
5106 -    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
5107 -    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
5108 -    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
5109 -    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
5110 -    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
5111 -    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
5112 -    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
5113 -    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
5114 -    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
5115 -    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
5116 -    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
5117 -    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
5118 -    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
5119 -    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
5120 -    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
5121 -    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
5122 -    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
5123 -    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
5124 -    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
5125 -    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
5126 -    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
5127 -    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
5128 -    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
5129 -    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
5130 -    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
5131 -    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
5132 -    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
5133 -    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
5134 -    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
5135 -    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
5136 -    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
5137 -    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
5138 -    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
5139 -    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
5140 -    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
5141 -    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
5142 -    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
5143 -    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
5144 -    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
5145 -    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
5146 -    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
5147 -    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
5148 -
5149 -    Totals:
5150 -        Hits: 81
5151 -        Entries: 42
5152 -        Dropped: 0
5153 -    # event histogram
5154 -    #
5155 -    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
5156 -    #
5157 -
5158 -    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
5159 -    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
5160 -    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
5161 -    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
5162 -    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
5163 -    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
5164 -    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
5165 -    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
5166 -    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
5167 -    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
5168 -    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
5169 -    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
5170 -    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
5171 -    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
5172 -    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
5173 -    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
5174 -    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
5175 -    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
5176 -    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
5177 -    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
5178 -    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
5179 -    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
5180 -    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
5181 -    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
5182 -    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
5183 -    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
5184 -    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
5185 -    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
5186 -    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
5187 -    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
5188 -    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
5189 -    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
5190 -    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
5191 -    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
5192 -    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
5193 -    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
5194 -    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
5195 -    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
5196 -    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
5197 -    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
5198 -    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
5199 -    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
5200 -
5201 -    Totals:
5202 -        Hits: 81
5203 -        Entries: 42
5204 -        Dropped: 0
5205 -
5206 -  And here's an example that shows how to combine histogram data from
5207 -  any two events even if they don't share any 'compatible' fields
5208 -  other than 'hitcount' and 'stacktrace'.  These commands create a
5209 -  couple of triggers named 'bar' using those fields:
5210 -
5211 -    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5212 -           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
5213 -    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
5214 -          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5215 -
5216 -  And displaying the output of either shows some interesting if
5217 -  somewhat confusing output:
5218 -
5219 -    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
5220 -    # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5221 -
5222 -    # event histogram
5223 -    #
5224 -    # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
5225 -    #
5226 -
5227 -    { stacktrace:
5228 -             _do_fork+0x18e/0x330
5229 -             kernel_thread+0x29/0x30
5230 -             kthreadd+0x154/0x1b0
5231 -             ret_from_fork+0x3f/0x70
5232 -    } hitcount:          1
5233 -    { stacktrace:
5234 -             netif_rx_internal+0xb2/0xd0
5235 -             netif_rx_ni+0x20/0x70
5236 -             dev_loopback_xmit+0xaa/0xd0
5237 -             ip_mc_output+0x126/0x240
5238 -             ip_local_out_sk+0x31/0x40
5239 -             igmp_send_report+0x1e9/0x230
5240 -             igmp_timer_expire+0xe9/0x120
5241 -             call_timer_fn+0x39/0xf0
5242 -             run_timer_softirq+0x1e1/0x290
5243 -             __do_softirq+0xfd/0x290
5244 -             irq_exit+0x98/0xb0
5245 -             smp_apic_timer_interrupt+0x4a/0x60
5246 -             apic_timer_interrupt+0x6d/0x80
5247 -             cpuidle_enter+0x17/0x20
5248 -             call_cpuidle+0x3b/0x60
5249 -             cpu_startup_entry+0x22d/0x310
5250 -    } hitcount:          1
5251 -    { stacktrace:
5252 -             netif_rx_internal+0xb2/0xd0
5253 -             netif_rx_ni+0x20/0x70
5254 -             dev_loopback_xmit+0xaa/0xd0
5255 -             ip_mc_output+0x17f/0x240
5256 -             ip_local_out_sk+0x31/0x40
5257 -             ip_send_skb+0x1a/0x50
5258 -             udp_send_skb+0x13e/0x270
5259 -             udp_sendmsg+0x2bf/0x980
5260 -             inet_sendmsg+0x67/0xa0
5261 -             sock_sendmsg+0x38/0x50
5262 -             SYSC_sendto+0xef/0x170
5263 -             SyS_sendto+0xe/0x10
5264 -             entry_SYSCALL_64_fastpath+0x12/0x6a
5265 -    } hitcount:          2
5266 -    { stacktrace:
5267 -             netif_rx_internal+0xb2/0xd0
5268 -             netif_rx+0x1c/0x60
5269 -             loopback_xmit+0x6c/0xb0
5270 -             dev_hard_start_xmit+0x219/0x3a0
5271 -             __dev_queue_xmit+0x415/0x4f0
5272 -             dev_queue_xmit_sk+0x13/0x20
5273 -             ip_finish_output2+0x237/0x340
5274 -             ip_finish_output+0x113/0x1d0
5275 -             ip_output+0x66/0xc0
5276 -             ip_local_out_sk+0x31/0x40
5277 -             ip_send_skb+0x1a/0x50
5278 -             udp_send_skb+0x16d/0x270
5279 -             udp_sendmsg+0x2bf/0x980
5280 -             inet_sendmsg+0x67/0xa0
5281 -             sock_sendmsg+0x38/0x50
5282 -             ___sys_sendmsg+0x14e/0x270
5283 -    } hitcount:         76
5284 -    { stacktrace:
5285 -             netif_rx_internal+0xb2/0xd0
5286 -             netif_rx+0x1c/0x60
5287 -             loopback_xmit+0x6c/0xb0
5288 -             dev_hard_start_xmit+0x219/0x3a0
5289 -             __dev_queue_xmit+0x415/0x4f0
5290 -             dev_queue_xmit_sk+0x13/0x20
5291 -             ip_finish_output2+0x237/0x340
5292 -             ip_finish_output+0x113/0x1d0
5293 -             ip_output+0x66/0xc0
5294 -             ip_local_out_sk+0x31/0x40
5295 -             ip_send_skb+0x1a/0x50
5296 -             udp_send_skb+0x16d/0x270
5297 -             udp_sendmsg+0x2bf/0x980
5298 -             inet_sendmsg+0x67/0xa0
5299 -             sock_sendmsg+0x38/0x50
5300 -             ___sys_sendmsg+0x269/0x270
5301 -    } hitcount:         77
5302 -    { stacktrace:
5303 -             netif_rx_internal+0xb2/0xd0
5304 -             netif_rx+0x1c/0x60
5305 -             loopback_xmit+0x6c/0xb0
5306 -             dev_hard_start_xmit+0x219/0x3a0
5307 -             __dev_queue_xmit+0x415/0x4f0
5308 -             dev_queue_xmit_sk+0x13/0x20
5309 -             ip_finish_output2+0x237/0x340
5310 -             ip_finish_output+0x113/0x1d0
5311 -             ip_output+0x66/0xc0
5312 -             ip_local_out_sk+0x31/0x40
5313 -             ip_send_skb+0x1a/0x50
5314 -             udp_send_skb+0x16d/0x270
5315 -             udp_sendmsg+0x2bf/0x980
5316 -             inet_sendmsg+0x67/0xa0
5317 -             sock_sendmsg+0x38/0x50
5318 -             SYSC_sendto+0xef/0x170
5319 -    } hitcount:         88
5320 -    { stacktrace:
5321 -             _do_fork+0x18e/0x330
5322 -             SyS_clone+0x19/0x20
5323 -             entry_SYSCALL_64_fastpath+0x12/0x6a
5324 -    } hitcount:        244
5325 -
5326 -    Totals:
5327 -        Hits: 489
5328 -        Entries: 7
5329 -        Dropped: 0
5330 +  See Documentation/trace/histogram.txt for details and examples.
5331 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/ftrace.txt linux-4.14/Documentation/trace/ftrace.txt
5332 --- linux-4.14.orig/Documentation/trace/ftrace.txt      2017-11-12 19:46:13.000000000 +0100
5333 +++ linux-4.14/Documentation/trace/ftrace.txt   2018-09-05 11:05:07.000000000 +0200
5334 @@ -539,6 +539,30 @@
5335  
5336         See events.txt for more information.
5337  
5338 +  timestamp_mode:
5339 +
5340 +       Certain tracers may change the timestamp mode used when
5341 +       logging trace events into the event buffer.  Events with
5342 +       different modes can coexist within a buffer but the mode in
5343 +       effect when an event is logged determines which timestamp mode
5344 +       is used for that event.  The default timestamp mode is
5345 +       'delta'.
5346 +
5347 +       Usual timestamp modes for tracing:
5348 +
5349 +         # cat timestamp_mode
5350 +         [delta] absolute
5351 +
5352 +         The timestamp mode with the square brackets around it is the
5353 +         one in effect.
5354 +
5355 +         delta: Default timestamp mode - timestamp is a delta against
5356 +                a per-buffer timestamp.
5357 +
5358 +         absolute: The timestamp is a full timestamp, not a delta
5359 +                 against some other value.  As such it takes up more
5360 +                 space and is less efficient.
5361 +
5362    hwlat_detector:
5363  
5364         Directory for the Hardware Latency Detector.
5365 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/Documentation/trace/histogram.txt linux-4.14/Documentation/trace/histogram.txt
5366 --- linux-4.14.orig/Documentation/trace/histogram.txt   1970-01-01 01:00:00.000000000 +0100
5367 +++ linux-4.14/Documentation/trace/histogram.txt        2018-09-05 11:05:07.000000000 +0200
5368 @@ -0,0 +1,1995 @@
5369 +                            Event Histograms
5370 +
5371 +                   Documentation written by Tom Zanussi
5372 +
5373 +1. Introduction
5374 +===============
5375 +
5376 +  Histogram triggers are special event triggers that can be used to
5377 +  aggregate trace event data into histograms.  For information on
5378 +  trace events and event triggers, see Documentation/trace/events.txt.
5379 +
5380 +
5381 +2. Histogram Trigger Command
5382 +============================
5383 +
5384 +  A histogram trigger command is an event trigger command that
5385 +  aggregates event hits into a hash table keyed on one or more trace
5386 +  event format fields (or stacktrace) and a set of running totals
5387 +  derived from one or more trace event format fields and/or event
5388 +  counts (hitcount).
5389 +
5390 +  The format of a hist trigger is as follows:
5391 +
5392 +        hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
5393 +          [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
5394 +          [:clear][:name=histname1] [if <filter>]
5395 +
5396 +  When a matching event is hit, an entry is added to a hash table
5397 +  using the key(s) and value(s) named.  Keys and values correspond to
5398 +  fields in the event's format description.  Values must correspond to
5399 +  numeric fields - on an event hit, the value(s) will be added to a
5400 +  sum kept for that field.  The special string 'hitcount' can be used
5401 +  in place of an explicit value field - this is simply a count of
5402 +  event hits.  If 'values' isn't specified, an implicit 'hitcount'
5403 +  value will be automatically created and used as the only value.
5404 +  Keys can be any field, or the special string 'stacktrace', which
5405 +  will use the event's kernel stacktrace as the key.  The keywords
5406 +  'keys' or 'key' can be used to specify keys, and the keywords
5407 +  'values', 'vals', or 'val' can be used to specify values.  Compound
5408 +  keys consisting of up to two fields can be specified by the 'keys'
5409 +  keyword.  Hashing a compound key produces a unique entry in the
5410 +  table for each unique combination of component keys, and can be
5411 +  useful for providing more fine-grained summaries of event data.
5412 +  Additionally, sort keys consisting of up to two fields can be
5413 +  specified by the 'sort' keyword.  If more than one field is
5414 +  specified, the result will be a 'sort within a sort': the first key
5415 +  is taken to be the primary sort key and the second the secondary
5416 +  key.  If a hist trigger is given a name using the 'name' parameter,
5417 +  its histogram data will be shared with other triggers of the same
5418 +  name, and trigger hits will update this common data.  Only triggers
5419 +  with 'compatible' fields can be combined in this way; triggers are
5420 +  'compatible' if the fields named in the trigger share the same
5421 +  number and type of fields and those fields also have the same names.
5422 +  Note that any two events always share the compatible 'hitcount' and
5423 +  'stacktrace' fields and can therefore be combined using those
5424 +  fields, however pointless that may be.
5425 +
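     +  For instance, a minimal sketch (using the kmem:kmalloc event that
     +  also appears in the examples below) combining a two-field compound
     +  key with primary and secondary sort keys might look like this:
     +
     +    # echo 'hist:keys=call_site,bytes_req:vals=bytes_alloc:sort=hitcount,bytes_alloc' > \
     +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
     +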
5426 +  'hist' triggers add a 'hist' file to each event's subdirectory.
5427 +  Reading the 'hist' file for the event will dump the hash table in
5428 +  its entirety to stdout.  If there are multiple hist triggers
5429 +  attached to an event, there will be a table for each trigger in the
5430 +  output.  The table displayed for a named trigger will be the same as
5431 +  any other instance having the same name. Each printed hash table
5432 +  entry is a simple list of the keys and values comprising the entry;
5433 +  keys are printed first and are delineated by curly braces, and are
5434 +  followed by the set of value fields for the entry.  By default,
5435 +  numeric fields are displayed as base-10 integers.  This can be
5436 +  modified by appending any of the following modifiers to the field
5437 +  name:
5438 +
5439 +        .hex         display a number as a hex value
5440 +        .sym         display an address as a symbol
5441 +        .sym-offset  display an address as a symbol and offset
5442 +        .syscall     display a syscall id as a system call name
5443 +        .execname    display a common_pid as a program name
5444 +        .log2        display log2 value rather than raw number
5445 +        .usecs       display a common_timestamp in microseconds
5446 +
5447 +  Note that in general the semantics of a given field aren't
5448 +  interpreted when applying a modifier to it, but there are some
5449 +  restrictions to be aware of in this regard:
5450 +
5451 +    - only the 'hex' modifier can be used for values (because values
5452 +      are essentially sums, and the other modifiers don't make sense
5453 +      in that context).
5454 +    - the 'execname' modifier can only be used on a 'common_pid'.  The
5455 +      reason for this is that the execname is simply the 'comm' value
5456 +      saved for the 'current' process when an event was triggered,
5457 +      which is the same as the common_pid value saved by the event
5458 +      tracing code.  Trying to apply that comm value to other pid
5459 +      values wouldn't be correct, and events that do care typically
5460 +      save pid-specific comm fields in the event itself.
5461 +
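     +  As a sketch of the modifiers in action, assuming the
     +  raw_syscalls:sys_enter event is available ('id' is that event's
     +  syscall-number field), the following displays per-program syscall
     +  counts with both fields decoded:
     +
     +    # echo 'hist:keys=common_pid.execname,id.syscall:vals=hitcount' > \
     +           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
     +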
5462 +  A typical usage scenario would be the following to enable a hist
5463 +  trigger, read its current contents, and then turn it off:
5464 +
5465 +  # echo 'hist:keys=skbaddr.hex:vals=len' > \
5466 +    /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5467 +
5468 +  # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
5469 +
5470 +  # echo '!hist:keys=skbaddr.hex:vals=len' > \
5471 +    /sys/kernel/debug/tracing/events/net/netif_rx/trigger
5472 +
5473 +  The trigger file itself can be read to show the details of the
5474 +  currently attached hist trigger.  This information is also displayed
5475 +  at the top of the 'hist' file when read.
5476 +
5477 +  By default, the size of the hash table is 2048 entries.  The 'size'
5478 +  parameter can be used to specify more or fewer than that.  The units
5479 +  are in terms of hashtable entries - if a run uses more entries than
5480 +  specified, the results will show the number of 'drops', the number
5481 +  of hits that were ignored.  The size should be a power of 2 between
5482 +  128 and 131072 (any non-power-of-2 number specified will be rounded
5483 +  up).
5484 +
5485 +  The 'sort' parameter can be used to specify a value field to sort
5486 +  on.  The default if unspecified is 'hitcount' and the default sort
5487 +  order is 'ascending'.  To sort in the opposite direction, append
5488 +  '.descending' to the sort key.
5489 +
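     +  As a sketch combining the two parameters (again using the kmalloc
     +  event from the examples below), the following requests a larger
     +  table sorted by bytes requested, largest first:
     +
     +    # echo 'hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=4096' > \
     +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
     +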
5490 +  The 'pause' parameter can be used to pause an existing hist trigger
5491 +  or to start a hist trigger but not log any events until told to do
5492 +  so.  'continue' or 'cont' can be used to start or restart a paused
5493 +  hist trigger.
5494 +
5495 +  The 'clear' parameter will clear the contents of a running hist
5496 +  trigger and leave its current paused/active state unchanged.
5497 +
5498 +  Note that the 'pause', 'cont', and 'clear' parameters should be
5499 +  applied to an existing trigger using the append shell operator
5500 +  ('>>') rather than the '>' operator, which would cause the
5501 +  trigger to be removed through truncation.
5502 +
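     +  For example, assuming a 'hist:keys=skbaddr.hex:vals=len' trigger is
     +  already attached to the netif_receive_skb event, a sketch of
     +  pausing, clearing, and resuming it would be:
     +
     +    # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
     +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
     +
     +    # echo 'hist:keys=skbaddr.hex:vals=len:clear' >> \
     +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
     +
     +    # echo 'hist:keys=skbaddr.hex:vals=len:cont' >> \
     +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
     +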
5503 +- enable_hist/disable_hist
5504 +
5505 +  The enable_hist and disable_hist triggers can be used to have one
5506 +  event conditionally start and stop another event's already-attached
5507 +  hist trigger.  Any number of enable_hist and disable_hist triggers
5508 +  can be attached to a given event, allowing that event to kick off
5509 +  and stop aggregations on a host of other events.
5510 +
5511 +  The format is very similar to the enable/disable_event triggers:
5512 +
5513 +      enable_hist:<system>:<event>[:count]
5514 +      disable_hist:<system>:<event>[:count]
5515 +
5516 +  Instead of enabling or disabling the tracing of the target event
5517 +  into the trace buffer as the enable/disable_event triggers do, the
5518 +  enable/disable_hist triggers enable or disable the aggregation of
5519 +  the target event into a hash table.
5520 +
5521 +  A typical usage scenario for the enable_hist/disable_hist triggers
5522 +  would be to first set up a paused hist trigger on some event,
5523 +  followed by an enable_hist/disable_hist pair that turns the hist
5524 +  aggregation on and off when conditions of interest are hit:
5525 +
5526 +  # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
5527 +    /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
5528 +
5529 +  # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
5530 +    /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
5531 +
5532 +  # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
5533 +    /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
5534 +
5535 +  The above sets up an initially paused hist trigger which is unpaused
5536 +  and starts aggregating events when a given program is executed, and
5537 +  which stops aggregating when the process exits and the hist trigger
5538 +  is paused again.
5539 +
5540 +  The examples below provide a more concrete illustration of the
5541 +  concepts and typical usage patterns discussed above.
5542 +
5543 +  'special' event fields
5544 +  ------------------------
5545 +
5546 +  There are a number of 'special event fields' available for use as
5547 +  keys or values in a hist trigger.  These look like and behave as if
5548 +  they were actual event fields, but aren't really part of the event's
5549 +  field definition or format file.  They are however available for any
5550 +  event, and can be used anywhere an actual event field could be.
5551 +  They are:
5552 +
5553 +    common_timestamp       u64 - timestamp (from ring buffer) associated
5554 +                                 with the event, in nanoseconds.  May be
5555 +                                 modified by .usecs to have timestamps
5556 +                                 interpreted as microseconds.
5557 +    cpu                    int - the cpu on which the event occurred.
5558 +
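     +  For example, a minimal sketch that counts sched_switch events per
     +  cpu using the 'cpu' special field (the choice of event is
     +  arbitrary):
     +
     +    # echo 'hist:keys=cpu:vals=hitcount' > \
     +           /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
     +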
5559 +  Extended error information
5560 +  --------------------------
5561 +
5562 +  For some error conditions encountered when invoking a hist trigger
5563 +  command, extended error information is available via the
5564 +  corresponding event's 'hist' file.  Reading the hist file after an
5565 +  error will display more detailed information about what went wrong,
5566 +  if information is available.  This extended error information will
5567 +  be available until the next hist trigger command for that event.
5568 +
5569 +  If available for a given error condition, the extended error
5570 +  information and usage takes the following form:
5571 +
5572 +    # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
5573 +    echo: write error: Invalid argument
5574 +
5575 +    # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
5576 +    ERROR: Couldn't yyy: zzz
5577 +      Last command: xxx
5578 +
5579 +6.2 'hist' trigger examples
5580 +---------------------------
5581 +
5582 +  The first set of examples creates aggregations using the kmalloc
5583 +  event.  The fields that can be used for the hist trigger are listed
5584 +  in the kmalloc event's format file:
5585 +
5586 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
5587 +    name: kmalloc
5588 +    ID: 374
5589 +    format:
5590 +       field:unsigned short common_type;       offset:0;       size:2; signed:0;
5591 +       field:unsigned char common_flags;       offset:2;       size:1; signed:0;
5592 +       field:unsigned char common_preempt_count;               offset:3;       size:1; signed:0;
5593 +       field:int common_pid;                                   offset:4;       size:4; signed:1;
5594 +
5595 +       field:unsigned long call_site;                          offset:8;       size:8; signed:0;
5596 +       field:const void * ptr;                                 offset:16;      size:8; signed:0;
5597 +       field:size_t bytes_req;                                 offset:24;      size:8; signed:0;
5598 +       field:size_t bytes_alloc;                               offset:32;      size:8; signed:0;
5599 +       field:gfp_t gfp_flags;                                  offset:40;      size:4; signed:0;
5600 +
5601 +  We'll start by creating a hist trigger that generates a simple table
5602 +  that lists the total number of bytes requested for each function in
5603 +  the kernel that made one or more calls to kmalloc:
5604 +
5605 +    # echo 'hist:key=call_site:val=bytes_req' > \
5606 +            /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5607 +
5608 +  This tells the tracing system to create a 'hist' trigger using the
5609 +  call_site field of the kmalloc event as the key for the table, which
5610 +  just means that each unique call_site address will have an entry
5611 +  created for it in the table.  The 'val=bytes_req' parameter tells
5612 +  the hist trigger that for each unique entry (call_site) in the
5613 +  table, it should keep a running total of the number of bytes
5614 +  requested by that call_site.
5615 +
5616 +  We'll let it run for a while and then dump the contents of the 'hist'
5617 +  file in the kmalloc event's subdirectory (for readability, a number
5618 +  of entries have been omitted):
5619 +
5620 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5621 +    # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5622 +
5623 +    { call_site: 18446744072106379007 } hitcount:          1  bytes_req:        176
5624 +    { call_site: 18446744071579557049 } hitcount:          1  bytes_req:       1024
5625 +    { call_site: 18446744071580608289 } hitcount:          1  bytes_req:      16384
5626 +    { call_site: 18446744071581827654 } hitcount:          1  bytes_req:         24
5627 +    { call_site: 18446744071580700980 } hitcount:          1  bytes_req:          8
5628 +    { call_site: 18446744071579359876 } hitcount:          1  bytes_req:        152
5629 +    { call_site: 18446744071580795365 } hitcount:          3  bytes_req:        144
5630 +    { call_site: 18446744071581303129 } hitcount:          3  bytes_req:        144
5631 +    { call_site: 18446744071580713234 } hitcount:          4  bytes_req:       2560
5632 +    { call_site: 18446744071580933750 } hitcount:          4  bytes_req:        736
5633 +    .
5634 +    .
5635 +    .
5636 +    { call_site: 18446744072106047046 } hitcount:         69  bytes_req:       5576
5637 +    { call_site: 18446744071582116407 } hitcount:         73  bytes_req:       2336
5638 +    { call_site: 18446744072106054684 } hitcount:        136  bytes_req:     140504
5639 +    { call_site: 18446744072106224230 } hitcount:        136  bytes_req:      19584
5640 +    { call_site: 18446744072106078074 } hitcount:        153  bytes_req:       2448
5641 +    { call_site: 18446744072106062406 } hitcount:        153  bytes_req:      36720
5642 +    { call_site: 18446744071582507929 } hitcount:        153  bytes_req:      37088
5643 +    { call_site: 18446744072102520590 } hitcount:        273  bytes_req:      10920
5644 +    { call_site: 18446744071582143559 } hitcount:        358  bytes_req:        716
5645 +    { call_site: 18446744072106465852 } hitcount:        417  bytes_req:      56712
5646 +    { call_site: 18446744072102523378 } hitcount:        485  bytes_req:      27160
5647 +    { call_site: 18446744072099568646 } hitcount:       1676  bytes_req:      33520
5648 +
5649 +    Totals:
5650 +        Hits: 4610
5651 +        Entries: 45
5652 +        Dropped: 0
5653 +
5654 +  The output displays a line for each entry, beginning with the key
5655 +  specified in the trigger, followed by the value(s) also specified in
5656 +  the trigger.  At the beginning of the output is a line that displays
5657 +  the trigger info, which can also be displayed by reading the
5658 +  'trigger' file:
5659 +
5660 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5661 +    hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
5662 +
5663 +  At the end of the output are a few lines that display the overall
5664 +  totals for the run.  The 'Hits' field shows the total number of
5665 +  times the event trigger was hit, the 'Entries' field shows the total
5666 +  number of used entries in the hash table, and the 'Dropped' field
5667 +  shows the number of hits that were dropped because the number of
5668 +  used entries for the run exceeded the maximum number of entries
5669 +  allowed for the table (normally 0; if not, that's a hint that you may
5670 +  want to increase the size of the table using the 'size' parameter).
5671 +
5672 +  Notice in the above output that there's an extra field, 'hitcount',
5673 +  which wasn't specified in the trigger.  Also notice that in the
5674 +  trigger info output, there's a parameter, 'sort=hitcount', which
5675 +  wasn't specified in the trigger either.  The reason for that is that
5676 +  every trigger implicitly keeps a count of the total number of hits
5677 +  attributed to a given entry, called the 'hitcount'.  That hitcount
5678 +  information is explicitly displayed in the output, and in the
5679 +  absence of a user-specified sort parameter, is used as the default
5680 +  sort field.
5681 +
5682 +  The value 'hitcount' can be used in place of an explicit value in
5683 +  the 'values' parameter if you don't really need to have any
5684 +  particular field summed and are mainly interested in hit
5685 +  frequencies.
5686 +
5687 +  To turn the hist trigger off, simply call up the trigger in the
5688 +  command history and re-execute it with a '!' prepended:
5689 +
5690 +    # echo '!hist:key=call_site:val=bytes_req' > \
5691 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5692 +
5693 +  Finally, notice that the call_site as displayed in the output above
5694 +  isn't really very useful.  It's an address, but normally addresses
5695 +  are displayed in hex.  To have a numeric field displayed as a hex
5696 +  value, simply append '.hex' to the field name in the trigger:
5697 +
5698 +    # echo 'hist:key=call_site.hex:val=bytes_req' > \
5699 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5700 +
5701 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5702 +    # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
5703 +
5704 +    { call_site: ffffffffa026b291 } hitcount:          1  bytes_req:        433
5705 +    { call_site: ffffffffa07186ff } hitcount:          1  bytes_req:        176
5706 +    { call_site: ffffffff811ae721 } hitcount:          1  bytes_req:      16384
5707 +    { call_site: ffffffff811c5134 } hitcount:          1  bytes_req:          8
5708 +    { call_site: ffffffffa04a9ebb } hitcount:          1  bytes_req:        511
5709 +    { call_site: ffffffff8122e0a6 } hitcount:          1  bytes_req:         12
5710 +    { call_site: ffffffff8107da84 } hitcount:          1  bytes_req:        152
5711 +    { call_site: ffffffff812d8246 } hitcount:          1  bytes_req:         24
5712 +    { call_site: ffffffff811dc1e5 } hitcount:          3  bytes_req:        144
5713 +    { call_site: ffffffffa02515e8 } hitcount:          3  bytes_req:        648
5714 +    { call_site: ffffffff81258159 } hitcount:          3  bytes_req:        144
5715 +    { call_site: ffffffff811c80f4 } hitcount:          4  bytes_req:        544
5716 +    .
5717 +    .
5718 +    .
5719 +    { call_site: ffffffffa06c7646 } hitcount:        106  bytes_req:       8024
5720 +    { call_site: ffffffffa06cb246 } hitcount:        132  bytes_req:      31680
5721 +    { call_site: ffffffffa06cef7a } hitcount:        132  bytes_req:       2112
5722 +    { call_site: ffffffff8137e399 } hitcount:        132  bytes_req:      23232
5723 +    { call_site: ffffffffa06c941c } hitcount:        185  bytes_req:     171360
5724 +    { call_site: ffffffffa06f2a66 } hitcount:        185  bytes_req:      26640
5725 +    { call_site: ffffffffa036a70e } hitcount:        265  bytes_req:      10600
5726 +    { call_site: ffffffff81325447 } hitcount:        292  bytes_req:        584
5727 +    { call_site: ffffffffa072da3c } hitcount:        446  bytes_req:      60656
5728 +    { call_site: ffffffffa036b1f2 } hitcount:        526  bytes_req:      29456
5729 +    { call_site: ffffffffa0099c06 } hitcount:       1780  bytes_req:      35600
5730 +
5731 +    Totals:
5732 +        Hits: 4775
5733 +        Entries: 46
5734 +        Dropped: 0
5735 +
5736 +  Even that's only marginally more useful - while hex values do look
5737 +  more like addresses, what users are typically more interested in
5738 +  when looking at text addresses are the corresponding symbols
5739 +  instead.  To have an address displayed as a symbolic value,
5740 +  simply append '.sym' or '.sym-offset' to the field name in the
5741 +  trigger:
5742 +
5743 +    # echo 'hist:key=call_site.sym:val=bytes_req' > \
5744 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5745 +
5746 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5747 +    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
5748 +
5749 +    { call_site: [ffffffff810adcb9] syslog_print_all                              } hitcount:          1  bytes_req:       1024
5750 +    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8
5751 +    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7
5752 +    { call_site: [ffffffff8154acbe] usb_alloc_urb                                 } hitcount:          1  bytes_req:        192
5753 +    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7
5754 +    { call_site: [ffffffff811e3a25] __seq_open_private                            } hitcount:          1  bytes_req:         40
5755 +    { call_site: [ffffffff8109524a] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128
5756 +    { call_site: [ffffffff811febd5] fsnotify_alloc_group                          } hitcount:          2  bytes_req:        528
5757 +    { call_site: [ffffffff81440f58] __tty_buffer_request_room                     } hitcount:          2  bytes_req:       2624
5758 +    { call_site: [ffffffff81200ba6] inotify_new_group                             } hitcount:          2  bytes_req:         96
5759 +    { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211]      } hitcount:          2  bytes_req:        464
5760 +    { call_site: [ffffffff81672406] tcp_get_metrics                               } hitcount:          2  bytes_req:        304
5761 +    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128
5762 +    { call_site: [ffffffff81089b05] sched_create_group                            } hitcount:          2  bytes_req:       1424
5763 +    .
5764 +    .
5765 +    .
5766 +    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:       1185  bytes_req:     123240
5767 +    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm]                } hitcount:       1185  bytes_req:     104280
5768 +    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:       1402  bytes_req:     190672
5769 +    { call_site: [ffffffff812891ca] ext4_find_extent                              } hitcount:       1518  bytes_req:     146208
5770 +    { call_site: [ffffffffa029070e] drm_vma_node_allow [drm]                      } hitcount:       1746  bytes_req:      69840
5771 +    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       2021  bytes_req:     792312
5772 +    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       2592  bytes_req:     145152
5773 +    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       2629  bytes_req:     378576
5774 +    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       2629  bytes_req:    3783248
5775 +    { call_site: [ffffffff81325607] apparmor_file_alloc_security                  } hitcount:       5192  bytes_req:      10384
5776 +    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       5529  bytes_req:     110584
5777 +    { call_site: [ffffffff8131ebf7] aa_alloc_task_context                         } hitcount:      21943  bytes_req:     702176
5778 +    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:      55759  bytes_req:    5074265
5779 +
5780 +    Totals:
5781 +        Hits: 109928
5782 +        Entries: 71
5783 +        Dropped: 0
5784 +
5785 +  Because the default sort key is 'hitcount', the output above shows
5786 +  the list of call_sites by increasing hitcount, so that at the bottom
5787 +  we see the functions that made the most kmalloc calls during the
5788 +  run.  If instead we wanted to see the top kmalloc callers in
5789 +  terms of the number of bytes requested rather than the number of
5790 +  calls, and we wanted the top caller to appear at the top, we can use
5791 +  the 'sort' parameter, along with the 'descending' modifier:
5792 +
5793 +    # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
5794 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5795 +
5796 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5797 +    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5798 +
5799 +    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       2186  bytes_req:    3397464
5800 +    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       1790  bytes_req:     712176
5801 +    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:       8132  bytes_req:     513135
5802 +    { call_site: [ffffffff811e2a1b] seq_buf_alloc                                 } hitcount:        106  bytes_req:     440128
5803 +    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       2186  bytes_req:     314784
5804 +    { call_site: [ffffffff812891ca] ext4_find_extent                              } hitcount:       2174  bytes_req:     208992
5805 +    { call_site: [ffffffff811ae8e1] __kmalloc                                     } hitcount:          8  bytes_req:     131072
5806 +    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:        859  bytes_req:     116824
5807 +    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       1834  bytes_req:     102704
5808 +    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:        972  bytes_req:     101088
5809 +    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm]                } hitcount:        972  bytes_req:      85536
5810 +    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       3333  bytes_req:      66664
5811 +    { call_site: [ffffffff8137e559] sg_kmalloc                                    } hitcount:        209  bytes_req:      61632
5812 +    .
5813 +    .
5814 +    .
5815 +    { call_site: [ffffffff81095225] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128
5816 +    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128
5817 +    { call_site: [ffffffff812d8406] copy_semundo                                  } hitcount:          2  bytes_req:         48
5818 +    { call_site: [ffffffff81200ba6] inotify_new_group                             } hitcount:          1  bytes_req:         48
5819 +    { call_site: [ffffffffa027121a] drm_getmagic [drm]                            } hitcount:          1  bytes_req:         48
5820 +    { call_site: [ffffffff811e3a25] __seq_open_private                            } hitcount:          1  bytes_req:         40
5821 +    { call_site: [ffffffff811c52f4] bprm_change_interp                            } hitcount:          2  bytes_req:         16
5822 +    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8
5823 +    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7
5824 +    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7
5825 +
5826 +    Totals:
5827 +        Hits: 32133
5828 +        Entries: 81
5829 +        Dropped: 0
5830 +
5831 +  To display the offset and size information in addition to the symbol
5832 +  name, just use 'sym-offset' instead:
5833 +
5834 +    # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
5835 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5836 +
5837 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5838 +    # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
5839 +
5840 +    { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915]                  } hitcount:       4569  bytes_req:    3163720
5841 +    { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915]                      } hitcount:       4569  bytes_req:     657936
5842 +    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915]      } hitcount:       1519  bytes_req:     472936
5843 +    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915]      } hitcount:       3050  bytes_req:     211832
5844 +    { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50                                 } hitcount:         34  bytes_req:     148384
5845 +    { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915]                  } hitcount:       1385  bytes_req:     144040
5846 +    { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0                                   } hitcount:          8  bytes_req:     131072
5847 +    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm]              } hitcount:       1385  bytes_req:     121880
5848 +    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm]                  } hitcount:       1848  bytes_req:     103488
5849 +    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915]            } hitcount:        461  bytes_req:      62696
5850 +    { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm]                      } hitcount:       1541  bytes_req:      61640
5851 +    { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0                                } hitcount:         57  bytes_req:      57456
5852 +    .
5853 +    .
5854 +    .
5855 +    { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0                       } hitcount:          2  bytes_req:        128
5856 +    { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm]                      } hitcount:          3  bytes_req:         96
5857 +    { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0                         } hitcount:          8  bytes_req:         96
5858 +    { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650                            } hitcount:          3  bytes_req:         84
5859 +    { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110                              } hitcount:          1  bytes_req:          8
5860 +    { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid]                     } hitcount:          1  bytes_req:          7
5861 +    { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid]                    } hitcount:          1  bytes_req:          7
5862 +
5863 +    Totals:
5864 +        Hits: 26098
5865 +        Entries: 64
5866 +        Dropped: 0
5867 +
5868 +  We can also add multiple fields to the 'values' parameter.  For
5869 +  example, we might want to see the total number of bytes allocated
5870 +  alongside bytes requested, and display the result sorted by bytes
5871 +  allocated in descending order:
5872 +
5873 +    # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
5874 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5875 +
5876 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5877 +    # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
5878 +
5879 +    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915]                   } hitcount:       7403  bytes_req:    4084360  bytes_alloc:    5958016
5880 +    { call_site: [ffffffff811e2a1b] seq_buf_alloc                                 } hitcount:        541  bytes_req:    2213968  bytes_alloc:    2228224
5881 +    { call_site: [ffffffffa0489a66] intel_ring_begin [i915]                       } hitcount:       7404  bytes_req:    1066176  bytes_alloc:    1421568
5882 +    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       1565  bytes_req:     557368  bytes_alloc:    1037760
5883 +    { call_site: [ffffffff8125847d] ext4_htree_store_dirent                       } hitcount:       9557  bytes_req:     595778  bytes_alloc:     695744
5884 +    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915]         } hitcount:       5839  bytes_req:     430680  bytes_alloc:     470400
5885 +    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915]            } hitcount:       2388  bytes_req:     324768  bytes_alloc:     458496
5886 +    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm]                   } hitcount:       3911  bytes_req:     219016  bytes_alloc:     250304
5887 +    { call_site: [ffffffff815f8d7b] sk_prot_alloc                                 } hitcount:        235  bytes_req:     236880  bytes_alloc:     240640
5888 +    { call_site: [ffffffff8137e559] sg_kmalloc                                    } hitcount:        557  bytes_req:     169024  bytes_alloc:     221760
5889 +    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid]                    } hitcount:       9378  bytes_req:     187548  bytes_alloc:     206312
5890 +    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915]                   } hitcount:       1519  bytes_req:     157976  bytes_alloc:     194432
5891 +    .
5892 +    .
5893 +    .
5894 +    { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach                 } hitcount:          2  bytes_req:        144  bytes_alloc:        192
5895 +    { call_site: [ffffffff81097ee8] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128  bytes_alloc:        128
5896 +    { call_site: [ffffffff8109524a] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128  bytes_alloc:        128
5897 +    { call_site: [ffffffff81095225] alloc_fair_sched_group                        } hitcount:          2  bytes_req:        128  bytes_alloc:        128
5898 +    { call_site: [ffffffff81097ec2] alloc_rt_sched_group                          } hitcount:          2  bytes_req:        128  bytes_alloc:        128
5899 +    { call_site: [ffffffff81213e80] load_elf_binary                               } hitcount:          3  bytes_req:         84  bytes_alloc:         96
5900 +    { call_site: [ffffffff81079a2e] kthread_create_on_node                        } hitcount:          1  bytes_req:         56  bytes_alloc:         64
5901 +    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid]                      } hitcount:          1  bytes_req:          7  bytes_alloc:          8
5902 +    { call_site: [ffffffff8154bc62] usb_control_msg                               } hitcount:          1  bytes_req:          8  bytes_alloc:          8
5903 +    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid]                     } hitcount:          1  bytes_req:          7  bytes_alloc:          8
5904 +
5905 +    Totals:
5906 +        Hits: 66598
5907 +        Entries: 65
5908 +        Dropped: 0
5909 +
5910 +  Finally, to finish off our kmalloc example, instead of simply having
5911 +  the hist trigger display symbolic call_sites, we can have the hist
5912 +  trigger additionally display the complete set of kernel stack traces
5913 +  that led to each call_site.  To do that, we simply use the special
5914 +  value 'stacktrace' for the key parameter:
5915 +
5916 +    # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
5917 +           /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
5918 +
5919 +  The above trigger will use the kernel stack trace in effect when an
5920 +  event is triggered as the key for the hash table.  This allows the
5921 +  enumeration of every kernel callpath that led up to a particular
5922 +  event, along with a running total of any of the event fields for
5923 +  that event.  Here we tally bytes requested and bytes allocated for
5924 +  every callpath in the system that led up to a kmalloc (in this case
5925 +  every callpath to a kmalloc for a kernel compile):
5926 +
5927 +    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
5928 +    # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
5929 +
5930 +    { stacktrace:
5931 +         __kmalloc_track_caller+0x10b/0x1a0
5932 +         kmemdup+0x20/0x50
5933 +         hidraw_report_event+0x8a/0x120 [hid]
5934 +         hid_report_raw_event+0x3ea/0x440 [hid]
5935 +         hid_input_report+0x112/0x190 [hid]
5936 +         hid_irq_in+0xc2/0x260 [usbhid]
5937 +         __usb_hcd_giveback_urb+0x72/0x120
5938 +         usb_giveback_urb_bh+0x9e/0xe0
5939 +         tasklet_hi_action+0xf8/0x100
5940 +         __do_softirq+0x114/0x2c0
5941 +         irq_exit+0xa5/0xb0
5942 +         do_IRQ+0x5a/0xf0
5943 +         ret_from_intr+0x0/0x30
5944 +         cpuidle_enter+0x17/0x20
5945 +         cpu_startup_entry+0x315/0x3e0
5946 +         rest_init+0x7c/0x80
5947 +    } hitcount:          3  bytes_req:         21  bytes_alloc:         24
5948 +    { stacktrace:
5949 +         __kmalloc_track_caller+0x10b/0x1a0
5950 +         kmemdup+0x20/0x50
5951 +         hidraw_report_event+0x8a/0x120 [hid]
5952 +         hid_report_raw_event+0x3ea/0x440 [hid]
5953 +         hid_input_report+0x112/0x190 [hid]
5954 +         hid_irq_in+0xc2/0x260 [usbhid]
5955 +         __usb_hcd_giveback_urb+0x72/0x120
5956 +         usb_giveback_urb_bh+0x9e/0xe0
5957 +         tasklet_hi_action+0xf8/0x100
5958 +         __do_softirq+0x114/0x2c0
5959 +         irq_exit+0xa5/0xb0
5960 +         do_IRQ+0x5a/0xf0
5961 +         ret_from_intr+0x0/0x30
5962 +    } hitcount:          3  bytes_req:         21  bytes_alloc:         24
5963 +    { stacktrace:
5964 +         kmem_cache_alloc_trace+0xeb/0x150
5965 +         aa_alloc_task_context+0x27/0x40
5966 +         apparmor_cred_prepare+0x1f/0x50
5967 +         security_prepare_creds+0x16/0x20
5968 +         prepare_creds+0xdf/0x1a0
5969 +         SyS_capset+0xb5/0x200
5970 +         system_call_fastpath+0x12/0x6a
5971 +    } hitcount:          1  bytes_req:         32  bytes_alloc:         32
5972 +    .
5973 +    .
5974 +    .
5975 +    { stacktrace:
5976 +         __kmalloc+0x11b/0x1b0
5977 +         i915_gem_execbuffer2+0x6c/0x2c0 [i915]
5978 +         drm_ioctl+0x349/0x670 [drm]
5979 +         do_vfs_ioctl+0x2f0/0x4f0
5980 +         SyS_ioctl+0x81/0xa0
5981 +         system_call_fastpath+0x12/0x6a
5982 +    } hitcount:      17726  bytes_req:   13944120  bytes_alloc:   19593808
5983 +    { stacktrace:
5984 +         __kmalloc+0x11b/0x1b0
5985 +         load_elf_phdrs+0x76/0xa0
5986 +         load_elf_binary+0x102/0x1650
5987 +         search_binary_handler+0x97/0x1d0
5988 +         do_execveat_common.isra.34+0x551/0x6e0
5989 +         SyS_execve+0x3a/0x50
5990 +         return_from_execve+0x0/0x23
5991 +    } hitcount:      33348  bytes_req:   17152128  bytes_alloc:   20226048
5992 +    { stacktrace:
5993 +         kmem_cache_alloc_trace+0xeb/0x150
5994 +         apparmor_file_alloc_security+0x27/0x40
5995 +         security_file_alloc+0x16/0x20
5996 +         get_empty_filp+0x93/0x1c0
5997 +         path_openat+0x31/0x5f0
5998 +         do_filp_open+0x3a/0x90
5999 +         do_sys_open+0x128/0x220
6000 +         SyS_open+0x1e/0x20
6001 +         system_call_fastpath+0x12/0x6a
6002 +    } hitcount:    4766422  bytes_req:    9532844  bytes_alloc:   38131376
6003 +    { stacktrace:
6004 +         __kmalloc+0x11b/0x1b0
6005 +         seq_buf_alloc+0x1b/0x50
6006 +         seq_read+0x2cc/0x370
6007 +         proc_reg_read+0x3d/0x80
6008 +         __vfs_read+0x28/0xe0
6009 +         vfs_read+0x86/0x140
6010 +         SyS_read+0x46/0xb0
6011 +         system_call_fastpath+0x12/0x6a
6012 +    } hitcount:      19133  bytes_req:   78368768  bytes_alloc:   78368768
6013 +
6014 +    Totals:
6015 +        Hits: 6085872
6016 +        Entries: 253
6017 +        Dropped: 0
6018 +
6019 +  If you key a hist trigger on common_pid, for example to gather and
6020 +  display sorted totals for each process, you can use the
6021 +  special .execname modifier to display the executable names for the
6022 +  processes in the table rather than raw pids.  The example below
6023 +  keeps a per-process sum of total bytes read:
6024 +
6025 +    # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
6026 +           /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
6027 +
6028 +    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
6029 +    # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
6030 +
6031 +    { common_pid: gnome-terminal  [      3196] } hitcount:        280  count:    1093512
6032 +    { common_pid: Xorg            [      1309] } hitcount:        525  count:     256640
6033 +    { common_pid: compiz          [      2889] } hitcount:         59  count:     254400
6034 +    { common_pid: bash            [      8710] } hitcount:          3  count:      66369
6035 +    { common_pid: dbus-daemon-lau [      8703] } hitcount:         49  count:      47739
6036 +    { common_pid: irqbalance      [      1252] } hitcount:         27  count:      27648
6037 +    { common_pid: 01ifupdown      [      8705] } hitcount:          3  count:      17216
6038 +    { common_pid: dbus-daemon     [       772] } hitcount:         10  count:      12396
6039 +    { common_pid: Socket Thread   [      8342] } hitcount:         11  count:      11264
6040 +    { common_pid: nm-dhcp-client. [      8701] } hitcount:          6  count:       7424
6041 +    { common_pid: gmain           [      1315] } hitcount:         18  count:       6336
6042 +    .
6043 +    .
6044 +    .
6045 +    { common_pid: postgres        [      1892] } hitcount:          2  count:         32
6046 +    { common_pid: postgres        [      1891] } hitcount:          2  count:         32
6047 +    { common_pid: gmain           [      8704] } hitcount:          2  count:         32
6048 +    { common_pid: upstart-dbus-br [      2740] } hitcount:         21  count:         21
6049 +    { common_pid: nm-dispatcher.a [      8696] } hitcount:          1  count:         16
6050 +    { common_pid: indicator-datet [      2904] } hitcount:          1  count:         16
6051 +    { common_pid: gdbus           [      2998] } hitcount:          1  count:         16
6052 +    { common_pid: rtkit-daemon    [      2052] } hitcount:          1  count:          8
6053 +    { common_pid: init            [         1] } hitcount:          2  count:          2
6054 +
6055 +    Totals:
6056 +        Hits: 2116
6057 +        Entries: 51
6058 +        Dropped: 0
6059 +
6060 +  Similarly, if you key a hist trigger on syscall id, for example to
6061 +  gather and display a list of systemwide syscall hits, you can use
6062 +  the special .syscall modifier to display the syscall names rather
6063 +  than raw ids.  The example below keeps a running total of syscall
6064 +  counts for the system during the run:
6065 +
6066 +    # echo 'hist:key=id.syscall:val=hitcount' > \
6067 +           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6068 +
6069 +    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6070 +    # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
6071 +
6072 +    { id: sys_fsync                     [ 74] } hitcount:          1
6073 +    { id: sys_newuname                  [ 63] } hitcount:          1
6074 +    { id: sys_prctl                     [157] } hitcount:          1
6075 +    { id: sys_statfs                    [137] } hitcount:          1
6076 +    { id: sys_symlink                   [ 88] } hitcount:          1
6077 +    { id: sys_sendmmsg                  [307] } hitcount:          1
6078 +    { id: sys_semctl                    [ 66] } hitcount:          1
6079 +    { id: sys_readlink                  [ 89] } hitcount:          3
6080 +    { id: sys_bind                      [ 49] } hitcount:          3
6081 +    { id: sys_getsockname               [ 51] } hitcount:          3
6082 +    { id: sys_unlink                    [ 87] } hitcount:          3
6083 +    { id: sys_rename                    [ 82] } hitcount:          4
6084 +    { id: unknown_syscall               [ 58] } hitcount:          4
6085 +    { id: sys_connect                   [ 42] } hitcount:          4
6086 +    { id: sys_getpid                    [ 39] } hitcount:          4
6087 +    .
6088 +    .
6089 +    .
6090 +    { id: sys_rt_sigprocmask            [ 14] } hitcount:        952
6091 +    { id: sys_futex                     [202] } hitcount:       1534
6092 +    { id: sys_write                     [  1] } hitcount:       2689
6093 +    { id: sys_setitimer                 [ 38] } hitcount:       2797
6094 +    { id: sys_read                      [  0] } hitcount:       3202
6095 +    { id: sys_select                    [ 23] } hitcount:       3773
6096 +    { id: sys_writev                    [ 20] } hitcount:       4531
6097 +    { id: sys_poll                      [  7] } hitcount:       8314
6098 +    { id: sys_recvmsg                   [ 47] } hitcount:      13738
6099 +    { id: sys_ioctl                     [ 16] } hitcount:      21843
6100 +
6101 +    Totals:
6102 +        Hits: 67612
6103 +        Entries: 72
6104 +        Dropped: 0
6105 +
6106 +    The syscall counts above provide a rough overall picture of system
6107 +    call activity on the system; we can see for example that the most
6108 +    popular system call on this system was 'sys_ioctl'.
6109 +
6110 +    We can use 'compound' keys to refine that number and provide some
6111 +    further insight into exactly which processes contribute to the
6112 +    overall ioctl count.
6113 +
6114 +    The command below keeps a hitcount for every unique combination of
6115 +    system call id and pid - the end result is essentially a table
6116 +    that keeps a per-pid sum of system call hits.  The results are
6117 +    sorted using the system call id as the primary key, and the
6118 +    hitcount sum as the secondary key:
6119 +
6120 +    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
6121 +           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6122 +
6123 +    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6124 +    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
6125 +
6126 +    { id: sys_read                      [  0], common_pid: rtkit-daemon    [      1877] } hitcount:          1
6127 +    { id: sys_read                      [  0], common_pid: gdbus           [      2976] } hitcount:          1
6128 +    { id: sys_read                      [  0], common_pid: console-kit-dae [      3400] } hitcount:          1
6129 +    { id: sys_read                      [  0], common_pid: postgres        [      1865] } hitcount:          1
6130 +    { id: sys_read                      [  0], common_pid: deja-dup-monito [      3543] } hitcount:          2
6131 +    { id: sys_read                      [  0], common_pid: NetworkManager  [       890] } hitcount:          2
6132 +    { id: sys_read                      [  0], common_pid: evolution-calen [      3048] } hitcount:          2
6133 +    { id: sys_read                      [  0], common_pid: postgres        [      1864] } hitcount:          2
6134 +    { id: sys_read                      [  0], common_pid: nm-applet       [      3022] } hitcount:          2
6135 +    { id: sys_read                      [  0], common_pid: whoopsie        [      1212] } hitcount:          2
6136 +    .
6137 +    .
6138 +    .
6139 +    { id: sys_ioctl                     [ 16], common_pid: bash            [      8479] } hitcount:          1
6140 +    { id: sys_ioctl                     [ 16], common_pid: bash            [      3472] } hitcount:         12
6141 +    { id: sys_ioctl                     [ 16], common_pid: gnome-terminal  [      3199] } hitcount:         16
6142 +    { id: sys_ioctl                     [ 16], common_pid: Xorg            [      1267] } hitcount:       1808
6143 +    { id: sys_ioctl                     [ 16], common_pid: compiz          [      2994] } hitcount:       5580
6144 +    .
6145 +    .
6146 +    .
6147 +    { id: sys_waitid                    [247], common_pid: upstart-dbus-br [      2690] } hitcount:          3
6148 +    { id: sys_waitid                    [247], common_pid: upstart-dbus-br [      2688] } hitcount:         16
6149 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [       975] } hitcount:          2
6150 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3204] } hitcount:          4
6151 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      2888] } hitcount:          4
6152 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3003] } hitcount:          4
6153 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      2873] } hitcount:          4
6154 +    { id: sys_inotify_add_watch         [254], common_pid: gmain           [      3196] } hitcount:          6
6155 +    { id: sys_openat                    [257], common_pid: java            [      2623] } hitcount:          2
6156 +    { id: sys_eventfd2                  [290], common_pid: ibus-ui-gtk3    [      2760] } hitcount:          4
6157 +    { id: sys_eventfd2                  [290], common_pid: compiz          [      2994] } hitcount:          6
6158 +
6159 +    Totals:
6160 +        Hits: 31536
6161 +        Entries: 323
6162 +        Dropped: 0
6163 +
6164 +    The above list does give us a breakdown of the ioctl syscall by
6165 +    pid, but it also gives us quite a bit more than that, which we
6166 +    don't really care about at the moment.  Since we know the syscall
6167 +    id for sys_ioctl (16, displayed next to the sys_ioctl name), we
6168 +    can use that to filter out all the other syscalls:
6169 +
6170 +    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
6171 +           /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
6172 +
6173 +    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
6174 +    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
6175 +
6176 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2769] } hitcount:          1
6177 +    { id: sys_ioctl                     [ 16], common_pid: evolution-addre [      8571] } hitcount:          1
6178 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      3003] } hitcount:          1
6179 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2781] } hitcount:          1
6180 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2829] } hitcount:          1
6181 +    { id: sys_ioctl                     [ 16], common_pid: bash            [      8726] } hitcount:          1
6182 +    { id: sys_ioctl                     [ 16], common_pid: bash            [      8508] } hitcount:          1
6183 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2970] } hitcount:          1
6184 +    { id: sys_ioctl                     [ 16], common_pid: gmain           [      2768] } hitcount:          1
6185 +    .
6186 +    .
6187 +    .
6188 +    { id: sys_ioctl                     [ 16], common_pid: pool            [      8559] } hitcount:         45
6189 +    { id: sys_ioctl                     [ 16], common_pid: pool            [      8555] } hitcount:         48
6190 +    { id: sys_ioctl                     [ 16], common_pid: pool            [      8551] } hitcount:         48
6191 +    { id: sys_ioctl                     [ 16], common_pid: avahi-daemon    [       896] } hitcount:         66
6192 +    { id: sys_ioctl                     [ 16], common_pid: Xorg            [      1267] } hitcount:      26674
6193 +    { id: sys_ioctl                     [ 16], common_pid: compiz          [      2994] } hitcount:      73443
6194 +
6195 +    Totals:
6196 +        Hits: 101162
6197 +        Entries: 103
6198 +        Dropped: 0
6199 +
6200 +    The above output shows that 'compiz' and 'Xorg' are far and away
6201 +    the heaviest ioctl callers (which might lead to questions about
6202 +    whether they really need to be making all those calls, and to
6203 +    possible avenues for further investigation).
6204 +
6205 +    The compound key examples used a key and a sum value (hitcount) to
6206 +    sort the output, but we can just as easily use two keys instead.
6207 +    Here's an example where we use a compound key composed of the
6208 +    common_pid and size event fields.  Sorting with pid as the primary
6209 +    key and 'size' as the secondary key allows us to display an
6210 +    ordered summary of the recvfrom sizes, with counts, received by
6211 +    each process:
6212 +
6213 +    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
6214 +           /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
6215 +
6216 +    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
6217 +    # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
6218 +
6219 +    { common_pid: smbd            [       784], size:          4 } hitcount:          1
6220 +    { common_pid: dnsmasq         [      1412], size:       4096 } hitcount:        672
6221 +    { common_pid: postgres        [      1796], size:       1000 } hitcount:          6
6222 +    { common_pid: postgres        [      1867], size:       1000 } hitcount:         10
6223 +    { common_pid: bamfdaemon      [      2787], size:         28 } hitcount:          2
6224 +    { common_pid: bamfdaemon      [      2787], size:      14360 } hitcount:          1
6225 +    { common_pid: compiz          [      2994], size:          8 } hitcount:          1
6226 +    { common_pid: compiz          [      2994], size:         20 } hitcount:         11
6227 +    { common_pid: gnome-terminal  [      3199], size:          4 } hitcount:          2
6228 +    { common_pid: firefox         [      8817], size:          4 } hitcount:          1
6229 +    { common_pid: firefox         [      8817], size:          8 } hitcount:          5
6230 +    { common_pid: firefox         [      8817], size:        588 } hitcount:          2
6231 +    { common_pid: firefox         [      8817], size:        628 } hitcount:          1
6232 +    { common_pid: firefox         [      8817], size:       6944 } hitcount:          1
6233 +    { common_pid: firefox         [      8817], size:     408880 } hitcount:          2
6234 +    { common_pid: firefox         [      8822], size:          8 } hitcount:          2
6235 +    { common_pid: firefox         [      8822], size:        160 } hitcount:          2
6236 +    { common_pid: firefox         [      8822], size:        320 } hitcount:          2
6237 +    { common_pid: firefox         [      8822], size:        352 } hitcount:          1
6238 +    .
6239 +    .
6240 +    .
6241 +    { common_pid: pool            [      8923], size:       1960 } hitcount:         10
6242 +    { common_pid: pool            [      8923], size:       2048 } hitcount:         10
6243 +    { common_pid: pool            [      8924], size:       1960 } hitcount:         10
6244 +    { common_pid: pool            [      8924], size:       2048 } hitcount:         10
6245 +    { common_pid: pool            [      8928], size:       1964 } hitcount:          4
6246 +    { common_pid: pool            [      8928], size:       1965 } hitcount:          2
6247 +    { common_pid: pool            [      8928], size:       2048 } hitcount:          6
6248 +    { common_pid: pool            [      8929], size:       1982 } hitcount:          1
6249 +    { common_pid: pool            [      8929], size:       2048 } hitcount:          1
6250 +
6251 +    Totals:
6252 +        Hits: 2016
6253 +        Entries: 224
6254 +        Dropped: 0
6255 +
6256 +  The above example also illustrates the fact that although a compound
6257 +  key is treated as a single entity for hashing purposes, the sub-keys
6258 +  it's composed of can be accessed independently.
6259 +
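+  For instance, as a hypothetical variation on the trigger above (shown
+  here without its output), the same compound key could be kept while
+  sorting on just the 'size' sub-key:
+
+    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size' > \
+           /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
+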
6260 +  The next example uses a string field as the hash key and
6261 +  demonstrates how you can manually pause and continue a hist trigger.
6262 +  In this example, we'll aggregate fork counts and don't expect a
6263 +  large number of entries in the hash table, so we'll drop the table
6264 +  size to a much smaller number, say 256:
6265 +
6266 +    # echo 'hist:key=child_comm:val=hitcount:size=256' > \
6267 +           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6268 +
6269 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6270 +    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6271 +
6272 +    { child_comm: dconf worker                        } hitcount:          1
6273 +    { child_comm: ibus-daemon                         } hitcount:          1
6274 +    { child_comm: whoopsie                            } hitcount:          1
6275 +    { child_comm: smbd                                } hitcount:          1
6276 +    { child_comm: gdbus                               } hitcount:          1
6277 +    { child_comm: kthreadd                            } hitcount:          1
6278 +    { child_comm: dconf worker                        } hitcount:          1
6279 +    { child_comm: evolution-alarm                     } hitcount:          2
6280 +    { child_comm: Socket Thread                       } hitcount:          2
6281 +    { child_comm: postgres                            } hitcount:          2
6282 +    { child_comm: bash                                } hitcount:          3
6283 +    { child_comm: compiz                              } hitcount:          3
6284 +    { child_comm: evolution-sourc                     } hitcount:          4
6285 +    { child_comm: dhclient                            } hitcount:          4
6286 +    { child_comm: pool                                } hitcount:          5
6287 +    { child_comm: nm-dispatcher.a                     } hitcount:          8
6288 +    { child_comm: firefox                             } hitcount:          8
6289 +    { child_comm: dbus-daemon                         } hitcount:          8
6290 +    { child_comm: glib-pacrunner                      } hitcount:         10
6291 +    { child_comm: evolution                           } hitcount:         23
6292 +
6293 +    Totals:
6294 +        Hits: 89
6295 +        Entries: 20
6296 +        Dropped: 0
6297 +
6298 +  If we want to pause the hist trigger, we can simply append :pause to
6299 +  the command that started the trigger.  Notice that the trigger info
6300 +  displays as [paused]:
6301 +
6302 +    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
6303 +           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6304 +
6305 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6306 +    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
6307 +
6308 +    { child_comm: dconf worker                        } hitcount:          1
6309 +    { child_comm: kthreadd                            } hitcount:          1
6310 +    { child_comm: dconf worker                        } hitcount:          1
6311 +    { child_comm: gdbus                               } hitcount:          1
6312 +    { child_comm: ibus-daemon                         } hitcount:          1
6313 +    { child_comm: Socket Thread                       } hitcount:          2
6314 +    { child_comm: evolution-alarm                     } hitcount:          2
6315 +    { child_comm: smbd                                } hitcount:          2
6316 +    { child_comm: bash                                } hitcount:          3
6317 +    { child_comm: whoopsie                            } hitcount:          3
6318 +    { child_comm: compiz                              } hitcount:          3
6319 +    { child_comm: evolution-sourc                     } hitcount:          4
6320 +    { child_comm: pool                                } hitcount:          5
6321 +    { child_comm: postgres                            } hitcount:          6
6322 +    { child_comm: firefox                             } hitcount:          8
6323 +    { child_comm: dhclient                            } hitcount:         10
6324 +    { child_comm: emacs                               } hitcount:         12
6325 +    { child_comm: dbus-daemon                         } hitcount:         20
6326 +    { child_comm: nm-dispatcher.a                     } hitcount:         20
6327 +    { child_comm: evolution                           } hitcount:         35
6328 +    { child_comm: glib-pacrunner                      } hitcount:         59
6329 +
6330 +    Totals:
6331 +        Hits: 199
6332 +        Entries: 21
6333 +        Dropped: 0
6334 +
6335 +  To manually continue having the trigger aggregate events, append
6336 +  :cont instead.  Notice that the trigger info displays as [active]
6337 +  again, and the data has changed:
6338 +
6339 +    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
6340 +           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6341 +
6342 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6343 +    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
6344 +
6345 +    { child_comm: dconf worker                        } hitcount:          1
6346 +    { child_comm: dconf worker                        } hitcount:          1
6347 +    { child_comm: kthreadd                            } hitcount:          1
6348 +    { child_comm: gdbus                               } hitcount:          1
6349 +    { child_comm: ibus-daemon                         } hitcount:          1
6350 +    { child_comm: Socket Thread                       } hitcount:          2
6351 +    { child_comm: evolution-alarm                     } hitcount:          2
6352 +    { child_comm: smbd                                } hitcount:          2
6353 +    { child_comm: whoopsie                            } hitcount:          3
6354 +    { child_comm: compiz                              } hitcount:          3
6355 +    { child_comm: evolution-sourc                     } hitcount:          4
6356 +    { child_comm: bash                                } hitcount:          5
6357 +    { child_comm: pool                                } hitcount:          5
6358 +    { child_comm: postgres                            } hitcount:          6
6359 +    { child_comm: firefox                             } hitcount:          8
6360 +    { child_comm: dhclient                            } hitcount:         11
6361 +    { child_comm: emacs                               } hitcount:         12
6362 +    { child_comm: dbus-daemon                         } hitcount:         22
6363 +    { child_comm: nm-dispatcher.a                     } hitcount:         22
6364 +    { child_comm: evolution                           } hitcount:         35
6365 +    { child_comm: glib-pacrunner                      } hitcount:         59
6366 +
6367 +    Totals:
6368 +        Hits: 206
6369 +        Entries: 21
6370 +        Dropped: 0
6371 +
6372 +  The previous example showed how to start and stop a hist trigger by
6373 +  appending ':pause' and ':cont' to the hist trigger command.  A
6374 +  hist trigger can also be started in a paused state by initially
6375 +  starting the trigger with ':pause' appended.  This allows you to
6376 +  start the trigger only when you're ready to start collecting data
6377 +  and not before.  For example, you could start the trigger in a
6378 +  paused state, then unpause it and do something you want to measure,
6379 +  then pause the trigger again when done.
6380 +
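+  As a minimal sketch of that manual workflow (reusing the
+  sched_process_fork trigger from the examples above; no output is
+  shown here), you could create the trigger paused, unpause it just
+  before the workload you want to measure, and pause it again
+  afterwards:
+
+    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
+           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
+           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+
+    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
+           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+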
6381 +  Of course, doing this manually can be difficult and error-prone, but
6382 +  it is possible to automatically start and stop a hist trigger based
6383 +  on some condition, via the enable_hist and disable_hist triggers.
6384 +
6385 +  For example, suppose we wanted to take a look at the relative
6386 +  weights in terms of skb length for each callpath that leads to a
6387 +  netif_receive_skb event when downloading a decent-sized file using
6388 +  wget.
6389 +
6390 +  First we set up an initially paused stacktrace trigger on the
6391 +  netif_receive_skb event:
6392 +
6393 +    # echo 'hist:key=stacktrace:vals=len:pause' > \
6394 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6395 +
6396 +  Next, we set up an 'enable_hist' trigger on the sched_process_exec
6397 +  event, with an 'if filename==/usr/bin/wget' filter.  The effect of
6398 +  this new trigger is that it will 'unpause' the hist trigger we just
6399 +  set up on netif_receive_skb if and only if it sees a
6400 +  sched_process_exec event with a filename of '/usr/bin/wget'.  When
6401 +  that happens, all netif_receive_skb events are aggregated into a
6402 +  hash table keyed on stacktrace:
6403 +
6404 +    # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
6405 +           /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6406 +
6407 +  The aggregation continues until the netif_receive_skb hist trigger is
6408 +  paused again, which is what the following disable_hist trigger does by
6409 +  creating a similar setup on the sched_process_exit event, using the
6410 +  filter 'comm==wget':
6411 +
6412 +    # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
6413 +           /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6414 +
6415 +  Whenever a process exits and its comm field matches the disable_hist
6416 +  trigger's 'comm==wget' filter, the netif_receive_skb hist trigger is
6417 +  disabled.
6418 +
6419 +  The overall effect is that netif_receive_skb events are aggregated
6420 +  into the hash table for only the duration of the wget.  Executing a
6421 +  wget command and then listing the 'hist' file will display the
6422 +  output generated by the wget command:
6423 +
6424 +    $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6425 +
6426 +    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6427 +    # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6428 +
6429 +    { stacktrace:
6430 +         __netif_receive_skb_core+0x46d/0x990
6431 +         __netif_receive_skb+0x18/0x60
6432 +         netif_receive_skb_internal+0x23/0x90
6433 +         napi_gro_receive+0xc8/0x100
6434 +         ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6435 +         ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6436 +         ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6437 +         ieee80211_rx+0x31d/0x900 [mac80211]
6438 +         iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6439 +         iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6440 +         iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6441 +         irq_thread_fn+0x20/0x50
6442 +         irq_thread+0x11f/0x150
6443 +         kthread+0xd2/0xf0
6444 +         ret_from_fork+0x42/0x70
6445 +    } hitcount:         85  len:      28884
6446 +    { stacktrace:
6447 +         __netif_receive_skb_core+0x46d/0x990
6448 +         __netif_receive_skb+0x18/0x60
6449 +         netif_receive_skb_internal+0x23/0x90
6450 +         napi_gro_complete+0xa4/0xe0
6451 +         dev_gro_receive+0x23a/0x360
6452 +         napi_gro_receive+0x30/0x100
6453 +         ieee80211_deliver_skb+0xd6/0x270 [mac80211]
6454 +         ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
6455 +         ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
6456 +         ieee80211_rx+0x31d/0x900 [mac80211]
6457 +         iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
6458 +         iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
6459 +         iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
6460 +         irq_thread_fn+0x20/0x50
6461 +         irq_thread+0x11f/0x150
6462 +         kthread+0xd2/0xf0
6463 +    } hitcount:         98  len:     664329
6464 +    { stacktrace:
6465 +         __netif_receive_skb_core+0x46d/0x990
6466 +         __netif_receive_skb+0x18/0x60
6467 +         process_backlog+0xa8/0x150
6468 +         net_rx_action+0x15d/0x340
6469 +         __do_softirq+0x114/0x2c0
6470 +         do_softirq_own_stack+0x1c/0x30
6471 +         do_softirq+0x65/0x70
6472 +         __local_bh_enable_ip+0xb5/0xc0
6473 +         ip_finish_output+0x1f4/0x840
6474 +         ip_output+0x6b/0xc0
6475 +         ip_local_out_sk+0x31/0x40
6476 +         ip_send_skb+0x1a/0x50
6477 +         udp_send_skb+0x173/0x2a0
6478 +         udp_sendmsg+0x2bf/0x9f0
6479 +         inet_sendmsg+0x64/0xa0
6480 +         sock_sendmsg+0x3d/0x50
6481 +    } hitcount:        115  len:      13030
6482 +    { stacktrace:
6483 +         __netif_receive_skb_core+0x46d/0x990
6484 +         __netif_receive_skb+0x18/0x60
6485 +         netif_receive_skb_internal+0x23/0x90
6486 +         napi_gro_complete+0xa4/0xe0
6487 +         napi_gro_flush+0x6d/0x90
6488 +         iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
6489 +         irq_thread_fn+0x20/0x50
6490 +         irq_thread+0x11f/0x150
6491 +         kthread+0xd2/0xf0
6492 +         ret_from_fork+0x42/0x70
6493 +    } hitcount:        934  len:    5512212
6494 +
6495 +    Totals:
6496 +        Hits: 1232
6497 +        Entries: 4
6498 +        Dropped: 0
6499 +
6500 +  The above shows all the netif_receive_skb callpaths and their total
6501 +  lengths for the duration of the wget command.
6502 +
6503 +  The 'clear' hist trigger param can be used to clear the hash table.
6504 +  Suppose we wanted to try another run of the previous example but
6505 +  this time also wanted to see the complete list of events that went
6506 +  into the histogram.  In order to avoid having to set everything up
6507 +  again, we can just clear the histogram first:
6508 +
6509 +    # echo 'hist:key=stacktrace:vals=len:clear' >> \
6510 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6511 +
6512 +  Just to verify that it is in fact cleared, here's what we now see in
6513 +  the hist file:
6514 +
6515 +    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6516 +    # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
6517 +
6518 +    Totals:
6519 +        Hits: 0
6520 +        Entries: 0
6521 +        Dropped: 0
6522 +
6523 +  Since we want to see the detailed list of the netif_receive_skb
6524 +  events occurring during the new run (which are in fact the same
6525 +  events being aggregated into the hash table), we add 'enable_event'
6526 +  and 'disable_event' triggers to the triggering sched_process_exec
6527 +  and sched_process_exit events, as follows:
6528 +
6529 +    # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
6530 +           /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6531 +
6532 +    # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
6533 +           /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6534 +
6535 +  If you read the trigger files for the sched_process_exec and
6536 +  sched_process_exit events, you should see two triggers for each:
6537 +  one enabling/disabling the hist aggregation and the other
6538 +  enabling/disabling the logging of events:
6539 +
6540 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
6541 +    enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6542 +    enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
6543 +
6544 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
6545 +    enable_event:net:netif_receive_skb:unlimited if comm==wget
6546 +    disable_hist:net:netif_receive_skb:unlimited if comm==wget
6547 +
6548 +  In other words, whenever either of the sched_process_exec or
6549 +  sched_process_exit events is hit and matches 'wget', it enables or
6550 +  disables both the histogram and the event log, and what you end up
6551 +  with is a hash table and set of events just covering the specified
6552 +  duration.  Run the wget command again:
6553 +
6554 +    $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
6555 +
6556 +  Displaying the 'hist' file should show something similar to what you
6557 +  saw in the last run, but this time you should also see the
6558 +  individual events in the trace file:
6559 +
6560 +    # cat /sys/kernel/debug/tracing/trace
6561 +
6562 +    # tracer: nop
6563 +    #
6564 +    # entries-in-buffer/entries-written: 183/1426   #P:4
6565 +    #
6566 +    #                              _-----=> irqs-off
6567 +    #                             / _----=> need-resched
6568 +    #                            | / _---=> hardirq/softirq
6569 +    #                            || / _--=> preempt-depth
6570 +    #                            ||| /     delay
6571 +    #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
6572 +    #              | |       |   ||||       |         |
6573 +                wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
6574 +                wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
6575 +             dnsmasq-1382  [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
6576 +             dnsmasq-1382  [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
6577 +    ##### CPU 2 buffer started ####
6578 +      irq/29-iwlwifi-559   [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
6579 +      irq/29-iwlwifi-559   [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
6580 +      irq/29-iwlwifi-559   [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
6581 +      irq/29-iwlwifi-559   [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
6582 +      irq/29-iwlwifi-559   [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
6583 +    .
6584 +    .
6585 +    .
6586 +
6587 +  The following example demonstrates how multiple hist triggers can be
6588 +  attached to a given event.  This capability can be useful for
6589 +  creating a set of different summaries derived from the same set of
6590 +  events, or for comparing the effects of different filters, among
6591 +  other things.
6592 +
6593 +    # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
6594 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6595 +    # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
6596 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6597 +    # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
6598 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6599 +    # echo 'hist:keys=skbaddr.hex:vals=len' >> \
6600 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6601 +    # echo 'hist:keys=len:vals=common_preempt_count' >> \
6602 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6603 +
6604 +  The above set of commands creates four triggers differing only in
6605 +  their filters, along with a completely different though fairly
6606 +  nonsensical trigger.  Note that in order to attach multiple hist
6607 +  triggers to the same file, you should use the '>>' operator to
6608 +  append them ('>' will also add the new hist trigger, but will remove
6609 +  any existing hist triggers beforehand).
6610 +
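+  As an aside (not exercised in this example), a hist trigger added
+  this way can generally be removed again by echoing the same trigger
+  description with a '!' prepended, which should remove just that one
+  histogram and leave the others attached to the event in place:
+
+    # echo '!hist:keys=skbaddr.hex:vals=len if len == 256' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+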
6611 +  Displaying the contents of the 'hist' file for the event shows the
6612 +  contents of all five histograms:
6613 +
6614 +    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
6615 +
6616 +    # event histogram
6617 +    #
6618 +    # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
6619 +    #
6620 +
6621 +    { len:        176 } hitcount:          1  common_preempt_count:          0
6622 +    { len:        223 } hitcount:          1  common_preempt_count:          0
6623 +    { len:       4854 } hitcount:          1  common_preempt_count:          0
6624 +    { len:        395 } hitcount:          1  common_preempt_count:          0
6625 +    { len:        177 } hitcount:          1  common_preempt_count:          0
6626 +    { len:        446 } hitcount:          1  common_preempt_count:          0
6627 +    { len:       1601 } hitcount:          1  common_preempt_count:          0
6628 +    .
6629 +    .
6630 +    .
6631 +    { len:       1280 } hitcount:         66  common_preempt_count:          0
6632 +    { len:        116 } hitcount:         81  common_preempt_count:         40
6633 +    { len:        708 } hitcount:        112  common_preempt_count:          0
6634 +    { len:         46 } hitcount:        221  common_preempt_count:          0
6635 +    { len:       1264 } hitcount:        458  common_preempt_count:          0
6636 +
6637 +    Totals:
6638 +        Hits: 1428
6639 +        Entries: 147
6640 +        Dropped: 0
6641 +
6642 +
6643 +    # event histogram
6644 +    #
6645 +    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6646 +    #
6647 +
6648 +    { skbaddr: ffff8800baee5e00 } hitcount:          1  len:        130
6649 +    { skbaddr: ffff88005f3d5600 } hitcount:          1  len:       1280
6650 +    { skbaddr: ffff88005f3d4900 } hitcount:          1  len:       1280
6651 +    { skbaddr: ffff88009fed6300 } hitcount:          1  len:        115
6652 +    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:        115
6653 +    { skbaddr: ffff88008cdb1900 } hitcount:          1  len:         46
6654 +    { skbaddr: ffff880064b5ef00 } hitcount:          1  len:        118
6655 +    { skbaddr: ffff880044e3c700 } hitcount:          1  len:         60
6656 +    { skbaddr: ffff880100065900 } hitcount:          1  len:         46
6657 +    { skbaddr: ffff8800d46bd500 } hitcount:          1  len:        116
6658 +    { skbaddr: ffff88005f3d5f00 } hitcount:          1  len:       1280
6659 +    { skbaddr: ffff880100064700 } hitcount:          1  len:        365
6660 +    { skbaddr: ffff8800badb6f00 } hitcount:          1  len:         60
6661 +    .
6662 +    .
6663 +    .
6664 +    { skbaddr: ffff88009fe0be00 } hitcount:         27  len:      24677
6665 +    { skbaddr: ffff88009fe0a400 } hitcount:         27  len:      23052
6666 +    { skbaddr: ffff88009fe0b700 } hitcount:         31  len:      25589
6667 +    { skbaddr: ffff88009fe0b600 } hitcount:         32  len:      27326
6668 +    { skbaddr: ffff88006a462800 } hitcount:         68  len:      71678
6669 +    { skbaddr: ffff88006a463700 } hitcount:         70  len:      72678
6670 +    { skbaddr: ffff88006a462b00 } hitcount:         71  len:      77589
6671 +    { skbaddr: ffff88006a463600 } hitcount:         73  len:      71307
6672 +    { skbaddr: ffff88006a462200 } hitcount:         81  len:      81032
6673 +
6674 +    Totals:
6675 +        Hits: 1451
6676 +        Entries: 318
6677 +        Dropped: 0
6678 +
6679 +
6680 +    # event histogram
6681 +    #
6682 +    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
6683 +    #
6684 +
6685 +
6686 +    Totals:
6687 +        Hits: 0
6688 +        Entries: 0
6689 +        Dropped: 0
6690 +
6691 +
6692 +    # event histogram
6693 +    #
6694 +    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
6695 +    #
6696 +
6697 +    { skbaddr: ffff88009fd2c300 } hitcount:          1  len:       7212
6698 +    { skbaddr: ffff8800d2bcce00 } hitcount:          1  len:       7212
6699 +    { skbaddr: ffff8800d2bcd700 } hitcount:          1  len:       7212
6700 +    { skbaddr: ffff8800d2bcda00 } hitcount:          1  len:      21492
6701 +    { skbaddr: ffff8800ae2e2d00 } hitcount:          1  len:       7212
6702 +    { skbaddr: ffff8800d2bcdb00 } hitcount:          1  len:       7212
6703 +    { skbaddr: ffff88006a4df500 } hitcount:          1  len:       4854
6704 +    { skbaddr: ffff88008ce47b00 } hitcount:          1  len:      18636
6705 +    { skbaddr: ffff8800ae2e2200 } hitcount:          1  len:      12924
6706 +    { skbaddr: ffff88005f3e1000 } hitcount:          1  len:       4356
6707 +    { skbaddr: ffff8800d2bcdc00 } hitcount:          2  len:      24420
6708 +    { skbaddr: ffff8800d2bcc200 } hitcount:          2  len:      12996
6709 +
6710 +    Totals:
6711 +        Hits: 14
6712 +        Entries: 12
6713 +        Dropped: 0
6714 +
6715 +
6716 +    # event histogram
6717 +    #
6718 +    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
6719 +    #
6720 +
6721 +
6722 +    Totals:
6723 +        Hits: 0
6724 +        Entries: 0
6725 +        Dropped: 0
6726 +
6727 +  Named triggers can be used to have triggers share a common set of
6728 +  histogram data.  This capability is mostly useful for combining the
6729 +  output of events generated by tracepoints contained inside inline
6730 +  functions, but names can be used in a hist trigger on any event.
6731 +  For example, these two triggers when hit will update the same 'len'
6732 +  field in the shared 'foo' histogram data:
6733 +
6734 +    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6735 +           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
6736 +    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
6737 +           /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6738 +
6739 +  You can see that they're updating common histogram data by reading
6740 +  each event's hist file at the same time:
6741 +
6742 +    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
6743 +      cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6744 +
6745 +    # event histogram
6746 +    #
6747 +    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6748 +    #
6749 +
6750 +    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
6751 +    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
6752 +    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
6753 +    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
6754 +    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
6755 +    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
6756 +    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
6757 +    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
6758 +    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
6759 +    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
6760 +    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
6761 +    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
6762 +    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
6763 +    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
6764 +    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
6765 +    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
6766 +    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
6767 +    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
6768 +    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
6769 +    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
6770 +    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
6771 +    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
6772 +    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
6773 +    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
6774 +    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
6775 +    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
6776 +    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
6777 +    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
6778 +    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
6779 +    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
6780 +    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
6781 +    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
6782 +    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
6783 +    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
6784 +    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
6785 +    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
6786 +    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
6787 +    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
6788 +    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
6789 +    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
6790 +    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
6791 +    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
6792 +
6793 +    Totals:
6794 +        Hits: 81
6795 +        Entries: 42
6796 +        Dropped: 0
6797 +    # event histogram
6798 +    #
6799 +    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
6800 +    #
6801 +
6802 +    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
6803 +    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
6804 +    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
6805 +    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
6806 +    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
6807 +    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
6808 +    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
6809 +    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
6810 +    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
6811 +    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
6812 +    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
6813 +    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
6814 +    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
6815 +    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
6816 +    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
6817 +    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
6818 +    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
6819 +    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
6820 +    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
6821 +    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
6822 +    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
6823 +    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
6824 +    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
6825 +    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
6826 +    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
6827 +    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
6828 +    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
6829 +    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
6830 +    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
6831 +    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
6832 +    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
6833 +    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
6834 +    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
6835 +    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
6836 +    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
6837 +    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
6838 +    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
6839 +    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
6840 +    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
6841 +    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
6842 +    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
6843 +    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
6844 +
6845 +    Totals:
6846 +        Hits: 81
6847 +        Entries: 42
6848 +        Dropped: 0
6849 +
6850 +  And here's an example that shows how to combine histogram data from
6851 +  any two events even if they don't share any 'compatible' fields
6852 +  other than 'hitcount' and 'stacktrace'.  These commands create a
6853 +  couple of triggers named 'bar' using those fields:
6854 +
6855 +    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6856 +           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
6857 +    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
6858 +          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
6859 +
6860 +  And displaying the output of either shows some interesting if
6861 +  somewhat confusing output:
6862 +
6863 +    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
6864 +    # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
6865 +
6866 +    # event histogram
6867 +    #
6868 +    # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
6869 +    #
6870 +
6871 +    { stacktrace:
6872 +             _do_fork+0x18e/0x330
6873 +             kernel_thread+0x29/0x30
6874 +             kthreadd+0x154/0x1b0
6875 +             ret_from_fork+0x3f/0x70
6876 +    } hitcount:          1
6877 +    { stacktrace:
6878 +             netif_rx_internal+0xb2/0xd0
6879 +             netif_rx_ni+0x20/0x70
6880 +             dev_loopback_xmit+0xaa/0xd0
6881 +             ip_mc_output+0x126/0x240
6882 +             ip_local_out_sk+0x31/0x40
6883 +             igmp_send_report+0x1e9/0x230
6884 +             igmp_timer_expire+0xe9/0x120
6885 +             call_timer_fn+0x39/0xf0
6886 +             run_timer_softirq+0x1e1/0x290
6887 +             __do_softirq+0xfd/0x290
6888 +             irq_exit+0x98/0xb0
6889 +             smp_apic_timer_interrupt+0x4a/0x60
6890 +             apic_timer_interrupt+0x6d/0x80
6891 +             cpuidle_enter+0x17/0x20
6892 +             call_cpuidle+0x3b/0x60
6893 +             cpu_startup_entry+0x22d/0x310
6894 +    } hitcount:          1
6895 +    { stacktrace:
6896 +             netif_rx_internal+0xb2/0xd0
6897 +             netif_rx_ni+0x20/0x70
6898 +             dev_loopback_xmit+0xaa/0xd0
6899 +             ip_mc_output+0x17f/0x240
6900 +             ip_local_out_sk+0x31/0x40
6901 +             ip_send_skb+0x1a/0x50
6902 +             udp_send_skb+0x13e/0x270
6903 +             udp_sendmsg+0x2bf/0x980
6904 +             inet_sendmsg+0x67/0xa0
6905 +             sock_sendmsg+0x38/0x50
6906 +             SYSC_sendto+0xef/0x170
6907 +             SyS_sendto+0xe/0x10
6908 +             entry_SYSCALL_64_fastpath+0x12/0x6a
6909 +    } hitcount:          2
6910 +    { stacktrace:
6911 +             netif_rx_internal+0xb2/0xd0
6912 +             netif_rx+0x1c/0x60
6913 +             loopback_xmit+0x6c/0xb0
6914 +             dev_hard_start_xmit+0x219/0x3a0
6915 +             __dev_queue_xmit+0x415/0x4f0
6916 +             dev_queue_xmit_sk+0x13/0x20
6917 +             ip_finish_output2+0x237/0x340
6918 +             ip_finish_output+0x113/0x1d0
6919 +             ip_output+0x66/0xc0
6920 +             ip_local_out_sk+0x31/0x40
6921 +             ip_send_skb+0x1a/0x50
6922 +             udp_send_skb+0x16d/0x270
6923 +             udp_sendmsg+0x2bf/0x980
6924 +             inet_sendmsg+0x67/0xa0
6925 +             sock_sendmsg+0x38/0x50
6926 +             ___sys_sendmsg+0x14e/0x270
6927 +    } hitcount:         76
6928 +    { stacktrace:
6929 +             netif_rx_internal+0xb2/0xd0
6930 +             netif_rx+0x1c/0x60
6931 +             loopback_xmit+0x6c/0xb0
6932 +             dev_hard_start_xmit+0x219/0x3a0
6933 +             __dev_queue_xmit+0x415/0x4f0
6934 +             dev_queue_xmit_sk+0x13/0x20
6935 +             ip_finish_output2+0x237/0x340
6936 +             ip_finish_output+0x113/0x1d0
6937 +             ip_output+0x66/0xc0
6938 +             ip_local_out_sk+0x31/0x40
6939 +             ip_send_skb+0x1a/0x50
6940 +             udp_send_skb+0x16d/0x270
6941 +             udp_sendmsg+0x2bf/0x980
6942 +             inet_sendmsg+0x67/0xa0
6943 +             sock_sendmsg+0x38/0x50
6944 +             ___sys_sendmsg+0x269/0x270
6945 +    } hitcount:         77
6946 +    { stacktrace:
6947 +             netif_rx_internal+0xb2/0xd0
6948 +             netif_rx+0x1c/0x60
6949 +             loopback_xmit+0x6c/0xb0
6950 +             dev_hard_start_xmit+0x219/0x3a0
6951 +             __dev_queue_xmit+0x415/0x4f0
6952 +             dev_queue_xmit_sk+0x13/0x20
6953 +             ip_finish_output2+0x237/0x340
6954 +             ip_finish_output+0x113/0x1d0
6955 +             ip_output+0x66/0xc0
6956 +             ip_local_out_sk+0x31/0x40
6957 +             ip_send_skb+0x1a/0x50
6958 +             udp_send_skb+0x16d/0x270
6959 +             udp_sendmsg+0x2bf/0x980
6960 +             inet_sendmsg+0x67/0xa0
6961 +             sock_sendmsg+0x38/0x50
6962 +             SYSC_sendto+0xef/0x170
6963 +    } hitcount:         88
6964 +    { stacktrace:
6965 +             _do_fork+0x18e/0x330
6966 +             SyS_clone+0x19/0x20
6967 +             entry_SYSCALL_64_fastpath+0x12/0x6a
6968 +    } hitcount:        244
6969 +
6970 +    Totals:
6971 +        Hits: 489
6972 +        Entries: 7
6973 +        Dropped: 0
6974 +
6975 +
6976 +2.2 Inter-event hist triggers
6977 +-----------------------------
6978 +
6979 +Inter-event hist triggers are hist triggers that combine values from
6980 +one or more other events and create a histogram using that data.  Data
6981 +from an inter-event histogram can in turn become the source for
6982 +further combined histograms, thus providing a chain of related
6983 +histograms, which is important for some applications.
6984 +
6985 +The most important example of an inter-event quantity that can be used
6986 +in this manner is latency, which is simply a difference in timestamps
6987 +between two events.  Although latency is the most important
6988 +inter-event quantity, note that because the support is completely
6989 +general across the trace event subsystem, any event field can be used
6990 +in an inter-event quantity.
6991 +
6992 +An example of a histogram that combines data from other histograms
6993 +into a useful chain would be a 'wakeupswitch latency' histogram that
6994 +combines a 'wakeup latency' histogram and a 'switch latency'
6995 +histogram.
6996 +
6997 +Normally, a hist trigger specification consists of a (possibly
6998 +compound) key along with one or more numeric values, which are
6999 +continually updated sums associated with that key.  A histogram
7000 +specification in this case consists of individual key and value
7001 +specifications that refer to trace event fields associated with a
7002 +single event type.
7003 +
7004 +The inter-event hist trigger extension allows fields from multiple
7005 +events to be referenced and combined into a multi-event histogram
7006 +specification.  In support of this overall goal, a few enabling
7007 +features have been added to the hist trigger support:
7008 +
7009 +  - In order to compute an inter-event quantity, a value from one
7010 +    event needs to be saved and then referenced from another event.  This
7011 +    requires the introduction of support for histogram 'variables'.
7012 +
7013 +  - The computation of inter-event quantities and their combination
7014 +    require some minimal amount of support for applying simple
7015 +    expressions to variables (+ and -).
7016 +
7017 +  - A histogram consisting of inter-event quantities isn't logically a
7018 +    histogram on either event (so having the 'hist' file for either
7019 +    event host the histogram output doesn't really make sense).  To
7020 +    address the idea that the histogram is associated with a
7021 +    combination of events, support is added allowing the creation of
7022 +    'synthetic' events that are events derived from other events.
7023 +    These synthetic events are full-fledged events just like any other
7024 +    and can be used as such, as for instance to create the
7025 +    'combination' histograms mentioned previously.
7026 +
7027 +  - A set of 'actions' can be associated with histogram entries -
7028 +    these can be used to generate the previously mentioned synthetic
7029 +    events, but can also be used for other purposes, such as for
7030 +    example saving context when a 'max' latency has been hit.
7031 +
7032 +  - Trace events don't have a 'timestamp' associated with them, but
7033 +    there is an implicit timestamp saved along with an event in the
7034 +    underlying ftrace ring buffer.  This timestamp is now exposed as a
7035 +    synthetic field named 'common_timestamp' which can be used in
7036 +    histograms as if it were any other event field; it isn't an actual
7037 +    field in the trace format but rather is a synthesized value that
7038 +    nonetheless can be used as if it were an actual field.  By default
7039 +    it is in units of nanoseconds; appending '.usecs' to a
7040 +    common_timestamp field changes the units to microseconds.
7041 +
7042 +A note on inter-event timestamps: If common_timestamp is used in a
7043 +histogram, the trace buffer is automatically switched over to using
7044 +absolute timestamps and the "global" trace clock, in order to avoid
7045 +bogus timestamp differences with other clocks that aren't coherent
7046 +across CPUs.  This can be overridden by specifying one of the other
7047 +trace clocks instead, using the "clock=XXX" hist trigger attribute,
7048 +where XXX is any of the clocks listed in the tracing/trace_clock
7049 +pseudo-file.
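+
+As a minimal illustrative sketch (the sched_waking event here is just a
+convenient placeholder, the '...' elides the rest of the trigger, and
+'mono' is assumed to appear in the tracing/trace_clock pseudo-file),
+overriding the clock while using microsecond timestamps could look like:
+
+  # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=mono ...' >> \
+         /sys/kernel/debug/tracing/events/sched/sched_waking/trigger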
7050 +
7051 +These features are described in more detail in the following sections.
7052 +
7053 +2.2.1 Histogram Variables
7054 +-------------------------
7055 +
7056 +Variables are simply named locations used for saving and retrieving
7057 +values between matching events.  A 'matching' event is defined as an
7058 +event that has a matching key - if a variable is saved for a histogram
7059 +entry corresponding to that key, any subsequent event with a matching
7060 +key can access that variable.
7061 +
7062 +A variable's value is normally available to any subsequent event until
7063 +it is set to something else by a subsequent event.  The one exception
7064 +to that rule is that any variable used in an expression is essentially
7065 +'read-once' - once it's used by an expression in a subsequent event,
7066 +it's reset to its 'unset' state, which means it can't be used again
7067 +unless it's set again.  This ensures not only that an event doesn't
7068 +use an uninitialized variable in a calculation, but that that variable
7069 +is used only once and not for any unrelated subsequent match.
7070 +
7071 +The basic syntax for saving a variable is to simply prefix a unique
7072 +variable name not corresponding to any keyword along with an '=' sign
7073 +to any event field.
7074 +
7075 +Either keys or values can be saved and retrieved in this way.  This
7076 +creates a variable named 'ts0' for a histogram entry with the key
7077 +'next_pid':
7078 +
7079 +  # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ...' >> \
7080 +       event/trigger
7081 +
7082 +The ts0 variable can be accessed by any subsequent event having the
7083 +same pid as 'next_pid'.
7084 +
7085 +Variable references are formed by prepending the variable name with
7086 +the '$' sign.  Thus for example, the ts0 variable above would be
7087 +referenced as '$ts0' in expressions.
7088 +
7089 +Because 'vals=' is used, the common_timestamp variable value above
7090 +will also be summed as a normal histogram value would (though for a
7091 +timestamp it makes little sense).
7092 +
7093 +The below shows that a key value can also be saved in the same way:
7094 +
7095 +  # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
7096 +
7097 +If a variable isn't a key variable or prefixed with 'vals=', the
7098 +associated event field will be saved in a variable but won't be summed
7099 +as a value:
7100 +
7101 +  # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger
7102 +
7103 +Multiple variables can be assigned at the same time.  The below would
7104 +result in both ts0 and b being created as variables, with both
7105 +common_timestamp and field1 additionally being summed as values:
7106 +
7107 +  # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \
7108 +       event/trigger
7109 +
7110 +Note that variable assignments can appear either preceding or
7111 +following their use.  The command below behaves identically to the
7112 +command above:
7113 +
7114 +  # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \
7115 +       event/trigger
7116 +
7117 +Any number of variables not bound to a 'vals=' prefix can also be
7118 +assigned by simply separating them with colons.  Below is the same
7119 +thing but without the values being summed in the histogram:
7120 +
7121 +  # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger
7122 +
7123 +Variables set as above can be referenced and used in expressions on
7124 +another event.
7125 +
7126 +For example, here's how a latency can be calculated:
7127 +
7128 +  # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger
7129 +  # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger
7130 +
7131 +In the first line above, the event's timestamp is saved into the
7132 +variable ts0.  In the next line, ts0 is subtracted from the second
7133 +event's timestamp to produce the latency, which is then assigned into
7134 +yet another variable, 'wakeup_lat'.  The hist trigger below in turn
7135 +makes use of the wakeup_lat variable to compute a combined latency
7136 +using the same key and variable from yet another event:
7137 +
7138 +  # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger
7139 +
7140 +2.2.2 Synthetic Events
7141 +----------------------
7142 +
7143 +Synthetic events are user-defined events generated from hist trigger
7144 +variables or fields associated with one or more other events.  Their
7145 +purpose is to provide a mechanism for displaying data spanning
7146 +multiple events consistent with the existing and already familiar
7147 +usage for normal events.
7148 +
7149 +To define a synthetic event, the user writes a simple specification
7150 +consisting of the name of the new event along with one or more
7151 +variables and their types, which can be any valid field type,
7152 +separated by semicolons, to the tracing/synthetic_events file.
7153 +
7154 +For instance, the following creates a new event named 'wakeup_latency'
7155 +with 3 fields: lat, pid, and prio.  Each of those fields is simply a
7156 +variable reference to a variable on another event:
7157 +
7158 +  # echo 'wakeup_latency \
7159 +          u64 lat; \
7160 +          pid_t pid; \
7161 +         int prio' >> \
7162 +         /sys/kernel/debug/tracing/synthetic_events
7163 +
7164 +Reading the tracing/synthetic_events file lists all the currently
7165 +defined synthetic events, in this case the event defined above:
7166 +
7167 +  # cat /sys/kernel/debug/tracing/synthetic_events
7168 +    wakeup_latency u64 lat; pid_t pid; int prio
7169 +
7170 +An existing synthetic event definition can be removed by prepending
7171 +the command that defined it with a '!':
7172 +
7173 +  # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
7174 +    /sys/kernel/debug/tracing/synthetic_events
7175 +
7176 +At this point, there isn't yet an actual 'wakeup_latency' event
7177 +instantiated in the event subsystem - for this to happen, a 'hist
7178 +trigger action' needs to be instantiated and bound to actual fields
7179 +and variables defined on other events (see Section 2.2.3 below).
7180 +
7181 +Once that is done, an event instance is created, and a histogram can
7182 +be defined using it:
7183 +
7184 +  # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
7185 +        /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7186 +
7187 +The new event is created under the tracing/events/synthetic/ directory
7188 +and looks and behaves just like any other event:
7189 +
7190 +  # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
7191 +        enable  filter  format  hist  id  trigger
7192 +
7193 +Like any other event, once a histogram is enabled for the event, the
7194 +output can be displayed by reading the event's 'hist' file.
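+
+For instance, a minimal sketch reusing the 'wakeup_latency' synthetic
+event defined above would be:
+
+  # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist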
7195 +
7196 +2.2.3 Hist trigger 'actions'
7197 +----------------------------
7198 +
7199 +A hist trigger 'action' is a function that's executed whenever a
7200 +histogram entry is added or updated.
7201 +
7202 +The default 'action', if no special function is explicitly specified,
7203 +is what it has always been: simply updating the set of values associated
7204 +with an entry.  Some applications, however, may want to perform
7205 +additional actions at that point, such as generating another event or
7206 +comparing and saving a maximum.
7207 +
7208 +The following additional actions are available.  To specify an action
7209 +for a given event, simply specify the action between colons in the
7210 +hist trigger specification.
7211 +
7212 +  - onmatch(matching.event).<synthetic_event_name>(param list)
7213 +
7214 +    The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
7215 +    trigger action is invoked whenever an event matches and the
7216 +    histogram entry would be added or updated.  It causes the named
7217 +    synthetic event to be generated with the values given in the
7218 +    'param list'.  The result is the generation of a synthetic event
7219 +    that consists of the values contained in those variables at the
7220 +    time the invoking event was hit.
7221 +
7222 +    The 'param list' consists of one or more parameters which may be
7223 +    either variables or fields defined on either the 'matching.event'
7224 +    or the target event.  The variables or fields specified in the
7225 +    param list may be either fully-qualified or unqualified.  If a
7226 +    variable is specified as unqualified, it must be unique between
7227 +    the two events.  A field name used as a param can be unqualified
7228 +    if it refers to the target event, but must be fully qualified if
7229 +    it refers to the matching event.  A fully-qualified name is of the
7230 +    form 'system.event_name.$var_name' or 'system.event_name.field'.
7231 +
7232 +    The 'matching.event' specification is simply the fully qualified
7233 +    event name of the event that matches the target event for the
7234 +    onmatch() functionality, in the form 'system.event_name'.
7235 +
7236 +    Finally, the number and type of variables/fields in the 'param
7237 +    list' must match the number and types of the fields in the
7238 +    synthetic event being generated.
7239 +
7240 +    As an example, the following defines a simple synthetic event and uses
7241 +    a variable defined on the sched_wakeup_new event as a parameter
7242 +    when invoking the synthetic event.  Here we define the synthetic
7243 +    event:
7244 +
7245 +    # echo 'wakeup_new_test pid_t pid' >> \
7246 +           /sys/kernel/debug/tracing/synthetic_events
7247 +
7248 +    # cat /sys/kernel/debug/tracing/synthetic_events
7249 +          wakeup_new_test pid_t pid
7250 +
7251 +    The following hist trigger both defines the missing testpid
7252 +    variable and specifies an onmatch() action that generates a
7253 +    wakeup_new_test synthetic event whenever a sched_wakeup_new event
7254 +    occurs, which because of the 'if comm == "cyclictest"' filter only
7255 +    happens when the executable is cyclictest:
7256 +
7257 +    # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
7258 +            wakeup_new_test($testpid) if comm=="cyclictest"' >> \
7259 +            /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
7260 +
7261 +    Creating and displaying a histogram based on those events is now
7262 +    just a matter of using the fields and new synthetic event in the
7263 +    tracing/events/synthetic directory, as usual:
7264 +
7265 +    # echo 'hist:keys=pid:sort=pid' >> \
7266 +           /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
7267 +
7268 +    Running 'cyclictest' should cause sched_wakeup_new events to generate
7269 +    wakeup_new_test synthetic events which should result in histogram
7270 +    output in the wakeup_new_test event's hist file:
7271 +
7272 +    # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
7273 +
7274 +    A more typical usage would be to use two events to calculate a
7275 +    latency.  The following example uses a set of hist triggers to
7276 +    produce a 'wakeup_latency' histogram:
7277 +
7278 +    First, we define a 'wakeup_latency' synthetic event:
7279 +
7280 +    # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
7281 +            /sys/kernel/debug/tracing/synthetic_events
7282 +
7283 +    Next, we specify that whenever we see a sched_waking event for a
7284 +    cyclictest thread, save the timestamp in a 'ts0' variable:
7285 +
7286 +    # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
7287 +            if comm=="cyclictest"' >> \
7288 +           /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7289 +
7290 +    Then, when the corresponding thread is actually scheduled onto the
7291 +    CPU by a sched_switch event, calculate the latency and use that
7292 +    along with another variable and an event field to generate a
7293 +    wakeup_latency synthetic event:
7294 +
7295 +    # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
7296 +            onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
7297 +                   $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
7298 +           /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7299 +
7300 +    We also need to create a histogram on the wakeup_latency synthetic
7301 +    event in order to aggregate the generated synthetic event data:
7302 +
7303 +    # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
7304 +            /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
7305 +
7306 +    Finally, once we've run cyclictest to actually generate some
7307 +    events, we can see the output by looking at the wakeup_latency
7308 +    synthetic event's hist file:
7309 +
7310 +    # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
7311 +
7312 +  - onmax(var).save(field,...)
7313 +
7314 +    The 'onmax(var).save(field,...)' hist trigger action is invoked
7315 +    whenever the value of 'var' associated with a histogram entry
7316 +    exceeds the current maximum contained in that variable.
7317 +
7318 +    The end result is that the trace event fields specified as the
7319 +    onmax.save() params will be saved if 'var' exceeds the current
7320 +    maximum for that hist trigger entry.  This allows context from the
7321 +    event that exhibited the new maximum to be saved for later
7322 +    reference.  When the histogram is displayed, additional fields
7323 +    displaying the saved values will be printed.
7324 +
7325 +    As an example, the following defines a couple of hist triggers, one for
7326 +    sched_waking and another for sched_switch, keyed on pid.  Whenever
7327 +    a sched_waking occurs, the timestamp is saved in the entry
7328 +    corresponding to the current pid, and when the scheduler switches
7329 +    back to that pid, the timestamp difference is calculated.  If the
7330 +    resulting latency, stored in wakeup_lat, exceeds the current
7331 +    maximum latency, the values specified in the save() fields are
7332 +    recorded:
7333 +
7334 +    # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
7335 +            if comm=="cyclictest"' >> \
7336 +            /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
7337 +
7338 +    # echo 'hist:keys=next_pid:\
7339 +            wakeup_lat=common_timestamp.usecs-$ts0:\
7340 +            onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
7341 +            if next_comm=="cyclictest"' >> \
7342 +            /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
7343 +
7344 +    When the histogram is displayed, the max value and the saved
7345 +    values corresponding to the max are displayed following the rest
7346 +    of the fields:
7347 +
7348 +    # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
7349 +      { next_pid:       2255 } hitcount:        239
7350 +        common_timestamp-ts0:          0
7351 +        max:         27
7352 +       next_comm: cyclictest
7353 +        prev_pid:          0  prev_prio:        120  prev_comm: swapper/1
7354 +
7355 +      { next_pid:       2256 } hitcount:       2355
7356 +        common_timestamp-ts0: 0
7357 +        max:         49  next_comm: cyclictest
7358 +        prev_pid:          0  prev_prio:        120  prev_comm: swapper/0
7359 +
7360 +      Totals:
7361 +          Hits: 12970
7362 +          Entries: 2
7363 +          Dropped: 0
7364 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/acglobal.h linux-4.14/drivers/acpi/acpica/acglobal.h
7365 --- linux-4.14.orig/drivers/acpi/acpica/acglobal.h      2017-11-12 19:46:13.000000000 +0100
7366 +++ linux-4.14/drivers/acpi/acpica/acglobal.h   2018-09-05 11:05:07.000000000 +0200
7367 @@ -116,7 +116,7 @@
7368   * interrupt level
7369   */
7370  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7371 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
7372 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
7373  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7374  
7375  /* Mutex for _OSI support */
7376 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwregs.c linux-4.14/drivers/acpi/acpica/hwregs.c
7377 --- linux-4.14.orig/drivers/acpi/acpica/hwregs.c        2017-11-12 19:46:13.000000000 +0100
7378 +++ linux-4.14/drivers/acpi/acpica/hwregs.c     2018-09-05 11:05:07.000000000 +0200
7379 @@ -428,14 +428,14 @@
7380                           ACPI_BITMASK_ALL_FIXED_STATUS,
7381                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7382  
7383 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7384 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7385  
7386         /* Clear the fixed events in PM1 A/B */
7387  
7388         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7389                                         ACPI_BITMASK_ALL_FIXED_STATUS);
7390  
7391 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7392 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7393  
7394         if (ACPI_FAILURE(status)) {
7395                 goto exit;
7396 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/hwxface.c linux-4.14/drivers/acpi/acpica/hwxface.c
7397 --- linux-4.14.orig/drivers/acpi/acpica/hwxface.c       2017-11-12 19:46:13.000000000 +0100
7398 +++ linux-4.14/drivers/acpi/acpica/hwxface.c    2018-09-05 11:05:07.000000000 +0200
7399 @@ -373,7 +373,7 @@
7400                 return_ACPI_STATUS(AE_BAD_PARAMETER);
7401         }
7402  
7403 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7404 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7405  
7406         /*
7407          * At this point, we know that the parent register is one of the
7408 @@ -434,7 +434,7 @@
7409  
7410  unlock_and_exit:
7411  
7412 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7413 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7414         return_ACPI_STATUS(status);
7415  }
7416  
7417 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/acpi/acpica/utmutex.c linux-4.14/drivers/acpi/acpica/utmutex.c
7418 --- linux-4.14.orig/drivers/acpi/acpica/utmutex.c       2017-11-12 19:46:13.000000000 +0100
7419 +++ linux-4.14/drivers/acpi/acpica/utmutex.c    2018-09-05 11:05:07.000000000 +0200
7420 @@ -88,7 +88,7 @@
7421                 return_ACPI_STATUS (status);
7422         }
7423  
7424 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7425 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7426         if (ACPI_FAILURE (status)) {
7427                 return_ACPI_STATUS (status);
7428         }
7429 @@ -145,7 +145,7 @@
7430         /* Delete the spinlocks */
7431  
7432         acpi_os_delete_lock(acpi_gbl_gpe_lock);
7433 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
7434 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7435         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7436  
7437         /* Delete the reader/writer lock */
7438 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ata/libata-sff.c linux-4.14/drivers/ata/libata-sff.c
7439 --- linux-4.14.orig/drivers/ata/libata-sff.c    2017-11-12 19:46:13.000000000 +0100
7440 +++ linux-4.14/drivers/ata/libata-sff.c 2018-09-05 11:05:07.000000000 +0200
7441 @@ -679,9 +679,9 @@
7442         unsigned long flags;
7443         unsigned int consumed;
7444  
7445 -       local_irq_save(flags);
7446 +       local_irq_save_nort(flags);
7447         consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7448 -       local_irq_restore(flags);
7449 +       local_irq_restore_nort(flags);
7450  
7451         return consumed;
7452  }
7453 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/brd.c linux-4.14/drivers/block/brd.c
7454 --- linux-4.14.orig/drivers/block/brd.c 2017-11-12 19:46:13.000000000 +0100
7455 +++ linux-4.14/drivers/block/brd.c      2018-09-05 11:05:07.000000000 +0200
7456 @@ -60,7 +60,6 @@
7457  /*
7458   * Look up and return a brd's page for a given sector.
7459   */
7460 -static DEFINE_MUTEX(brd_mutex);
7461  static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7462  {
7463         pgoff_t idx;
7464 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.c linux-4.14/drivers/block/zram/zcomp.c
7465 --- linux-4.14.orig/drivers/block/zram/zcomp.c  2017-11-12 19:46:13.000000000 +0100
7466 +++ linux-4.14/drivers/block/zram/zcomp.c       2018-09-05 11:05:07.000000000 +0200
7467 @@ -116,12 +116,20 @@
7468  
7469  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7470  {
7471 -       return *get_cpu_ptr(comp->stream);
7472 +       struct zcomp_strm *zstrm;
7473 +
7474 +       zstrm = *get_local_ptr(comp->stream);
7475 +       spin_lock(&zstrm->zcomp_lock);
7476 +       return zstrm;
7477  }
7478  
7479  void zcomp_stream_put(struct zcomp *comp)
7480  {
7481 -       put_cpu_ptr(comp->stream);
7482 +       struct zcomp_strm *zstrm;
7483 +
7484 +       zstrm = *this_cpu_ptr(comp->stream);
7485 +       spin_unlock(&zstrm->zcomp_lock);
7486 +       put_local_ptr(zstrm);
7487  }
7488  
7489  int zcomp_compress(struct zcomp_strm *zstrm,
7490 @@ -171,6 +179,7 @@
7491                 pr_err("Can't allocate a compression stream\n");
7492                 return -ENOMEM;
7493         }
7494 +       spin_lock_init(&zstrm->zcomp_lock);
7495         *per_cpu_ptr(comp->stream, cpu) = zstrm;
7496         return 0;
7497  }
7498 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zcomp.h linux-4.14/drivers/block/zram/zcomp.h
7499 --- linux-4.14.orig/drivers/block/zram/zcomp.h  2017-11-12 19:46:13.000000000 +0100
7500 +++ linux-4.14/drivers/block/zram/zcomp.h       2018-09-05 11:05:07.000000000 +0200
7501 @@ -14,6 +14,7 @@
7502         /* compression/decompression buffer */
7503         void *buffer;
7504         struct crypto_comp *tfm;
7505 +       spinlock_t zcomp_lock;
7506  };
7507  
7508  /* dynamic per-device compression frontend */
7509 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.c linux-4.14/drivers/block/zram/zram_drv.c
7510 --- linux-4.14.orig/drivers/block/zram/zram_drv.c       2017-11-12 19:46:13.000000000 +0100
7511 +++ linux-4.14/drivers/block/zram/zram_drv.c    2018-09-05 11:05:07.000000000 +0200
7512 @@ -756,6 +756,30 @@
7513  static DEVICE_ATTR_RO(mm_stat);
7514  static DEVICE_ATTR_RO(debug_stat);
7515  
7516 +#ifdef CONFIG_PREEMPT_RT_BASE
7517 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7518 +{
7519 +       size_t index;
7520 +
7521 +       for (index = 0; index < num_pages; index++)
7522 +               spin_lock_init(&zram->table[index].lock);
7523 +}
7524 +
7525 +static void zram_slot_lock(struct zram *zram, u32 index)
7526 +{
7527 +       spin_lock(&zram->table[index].lock);
7528 +       __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7529 +}
7530 +
7531 +static void zram_slot_unlock(struct zram *zram, u32 index)
7532 +{
7533 +       __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7534 +       spin_unlock(&zram->table[index].lock);
7535 +}
7536 +
7537 +#else
7538 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7539 +
7540  static void zram_slot_lock(struct zram *zram, u32 index)
7541  {
7542         bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7543 @@ -765,6 +789,7 @@
7544  {
7545         bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7546  }
7547 +#endif
7548  
7549  static void zram_meta_free(struct zram *zram, u64 disksize)
7550  {
7551 @@ -794,6 +819,7 @@
7552                 return false;
7553         }
7554  
7555 +       zram_meta_init_table_locks(zram, num_pages);
7556         return true;
7557  }
7558  
7559 @@ -845,6 +871,7 @@
7560         unsigned long handle;
7561         unsigned int size;
7562         void *src, *dst;
7563 +       struct zcomp_strm *zstrm;
7564  
7565         if (zram_wb_enabled(zram)) {
7566                 zram_slot_lock(zram, index);
7567 @@ -879,6 +906,7 @@
7568  
7569         size = zram_get_obj_size(zram, index);
7570  
7571 +       zstrm = zcomp_stream_get(zram->comp);
7572         src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7573         if (size == PAGE_SIZE) {
7574                 dst = kmap_atomic(page);
7575 @@ -886,14 +914,13 @@
7576                 kunmap_atomic(dst);
7577                 ret = 0;
7578         } else {
7579 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7580  
7581                 dst = kmap_atomic(page);
7582                 ret = zcomp_decompress(zstrm, src, size, dst);
7583                 kunmap_atomic(dst);
7584 -               zcomp_stream_put(zram->comp);
7585         }
7586         zs_unmap_object(zram->mem_pool, handle);
7587 +       zcomp_stream_put(zram->comp);
7588         zram_slot_unlock(zram, index);
7589  
7590         /* Should NEVER happen. Return bio error if it does. */
7591 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/block/zram/zram_drv.h linux-4.14/drivers/block/zram/zram_drv.h
7592 --- linux-4.14.orig/drivers/block/zram/zram_drv.h       2017-11-12 19:46:13.000000000 +0100
7593 +++ linux-4.14/drivers/block/zram/zram_drv.h    2018-09-05 11:05:07.000000000 +0200
7594 @@ -77,6 +77,9 @@
7595                 unsigned long element;
7596         };
7597         unsigned long value;
7598 +#ifdef CONFIG_PREEMPT_RT_BASE
7599 +       spinlock_t lock;
7600 +#endif
7601  };
7602  
7603  struct zram_stats {
7604 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/random.c linux-4.14/drivers/char/random.c
7605 --- linux-4.14.orig/drivers/char/random.c       2018-09-05 11:03:20.000000000 +0200
7606 +++ linux-4.14/drivers/char/random.c    2018-09-05 11:05:07.000000000 +0200
7607 @@ -265,6 +265,7 @@
7608  #include <linux/syscalls.h>
7609  #include <linux/completion.h>
7610  #include <linux/uuid.h>
7611 +#include <linux/locallock.h>
7612  #include <crypto/chacha20.h>
7613  
7614  #include <asm/processor.h>
7615 @@ -856,7 +857,7 @@
7616                 invalidate_batched_entropy();
7617                 crng_init = 1;
7618                 wake_up_interruptible(&crng_init_wait);
7619 -               pr_notice("random: fast init done\n");
7620 +               /* pr_notice("random: fast init done\n"); */
7621         }
7622         return 1;
7623  }
7624 @@ -941,17 +942,21 @@
7625                 crng_init = 2;
7626                 process_random_ready_list();
7627                 wake_up_interruptible(&crng_init_wait);
7628 -               pr_notice("random: crng init done\n");
7629 +               /* pr_notice("random: crng init done\n"); */
7630                 if (unseeded_warning.missed) {
7631 +#if 0
7632                         pr_notice("random: %d get_random_xx warning(s) missed "
7633                                   "due to ratelimiting\n",
7634                                   unseeded_warning.missed);
7635 +#endif
7636                         unseeded_warning.missed = 0;
7637                 }
7638                 if (urandom_warning.missed) {
7639 +#if 0
7640                         pr_notice("random: %d urandom warning(s) missed "
7641                                   "due to ratelimiting\n",
7642                                   urandom_warning.missed);
7643 +#endif
7644                         urandom_warning.missed = 0;
7645                 }
7646         }
7647 @@ -1122,8 +1127,6 @@
7648         } sample;
7649         long delta, delta2, delta3;
7650  
7651 -       preempt_disable();
7652 -
7653         sample.jiffies = jiffies;
7654         sample.cycles = random_get_entropy();
7655         sample.num = num;
7656 @@ -1164,7 +1167,6 @@
7657                  */
7658                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7659         }
7660 -       preempt_enable();
7661  }
7662  
7663  void add_input_randomness(unsigned int type, unsigned int code,
7664 @@ -1221,28 +1223,27 @@
7665         return *ptr;
7666  }
7667  
7668 -void add_interrupt_randomness(int irq, int irq_flags)
7669 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7670  {
7671         struct entropy_store    *r;
7672         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
7673 -       struct pt_regs          *regs = get_irq_regs();
7674         unsigned long           now = jiffies;
7675         cycles_t                cycles = random_get_entropy();
7676         __u32                   c_high, j_high;
7677 -       __u64                   ip;
7678         unsigned long           seed;
7679         int                     credit = 0;
7680  
7681         if (cycles == 0)
7682 -               cycles = get_reg(fast_pool, regs);
7683 +               cycles = get_reg(fast_pool, NULL);
7684         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7685         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7686         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7687         fast_pool->pool[1] ^= now ^ c_high;
7688 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
7689 +       if (!ip)
7690 +               ip = _RET_IP_;
7691         fast_pool->pool[2] ^= ip;
7692         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7693 -               get_reg(fast_pool, regs);
7694 +               get_reg(fast_pool, NULL);
7695  
7696         fast_mix(fast_pool);
7697         add_interrupt_bench(cycles);
7698 @@ -2200,6 +2201,7 @@
7699   * at any point prior.
7700   */
7701  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7702 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7703  u64 get_random_u64(void)
7704  {
7705         u64 ret;
7706 @@ -2220,7 +2222,7 @@
7707         warn_unseeded_randomness(&previous);
7708  
7709         use_lock = READ_ONCE(crng_init) < 2;
7710 -       batch = &get_cpu_var(batched_entropy_u64);
7711 +       batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7712         if (use_lock)
7713                 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7714         if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7715 @@ -2230,12 +2232,13 @@
7716         ret = batch->entropy_u64[batch->position++];
7717         if (use_lock)
7718                 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7719 -       put_cpu_var(batched_entropy_u64);
7720 +       put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7721         return ret;
7722  }
7723  EXPORT_SYMBOL(get_random_u64);
7724  
7725  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7726 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7727  u32 get_random_u32(void)
7728  {
7729         u32 ret;
7730 @@ -2250,7 +2253,7 @@
7731         warn_unseeded_randomness(&previous);
7732  
7733         use_lock = READ_ONCE(crng_init) < 2;
7734 -       batch = &get_cpu_var(batched_entropy_u32);
7735 +       batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7736         if (use_lock)
7737                 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7738         if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7739 @@ -2260,7 +2263,7 @@
7740         ret = batch->entropy_u32[batch->position++];
7741         if (use_lock)
7742                 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7743 -       put_cpu_var(batched_entropy_u32);
7744 +       put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7745         return ret;
7746  }
7747  EXPORT_SYMBOL(get_random_u32);
7748 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/char/tpm/tpm_tis.c linux-4.14/drivers/char/tpm/tpm_tis.c
7749 --- linux-4.14.orig/drivers/char/tpm/tpm_tis.c  2018-09-05 11:03:20.000000000 +0200
7750 +++ linux-4.14/drivers/char/tpm/tpm_tis.c       2018-09-05 11:05:07.000000000 +0200
7751 @@ -52,6 +52,31 @@
7752         return container_of(data, struct tpm_tis_tcg_phy, priv);
7753  }
7754  
7755 +#ifdef CONFIG_PREEMPT_RT_FULL
7756 +/*
7757 + * Flushes previous write operations to the chip so that subsequent
7758 + * ioread*()s won't stall the CPU.
7759 + */
7760 +static inline void tpm_tis_flush(void __iomem *iobase)
7761 +{
7762 +       ioread8(iobase + TPM_ACCESS(0));
7763 +}
7764 +#else
7765 +#define tpm_tis_flush(iobase) do { } while (0)
7766 +#endif
7767 +
7768 +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7769 +{
7770 +       iowrite8(b, iobase + addr);
7771 +       tpm_tis_flush(iobase);
7772 +}
7773 +
7774 +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7775 +{
7776 +       iowrite32(b, iobase + addr);
7777 +       tpm_tis_flush(iobase);
7778 +}
7779 +
7780  static bool interrupts = true;
7781  module_param(interrupts, bool, 0444);
7782  MODULE_PARM_DESC(interrupts, "Enable interrupts");
7783 @@ -149,7 +174,7 @@
7784         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7785  
7786         while (len--)
7787 -               iowrite8(*value++, phy->iobase + addr);
7788 +               tpm_tis_iowrite8(*value++, phy->iobase, addr);
7789  
7790         return 0;
7791  }
7792 @@ -176,7 +201,7 @@
7793  {
7794         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7795  
7796 -       iowrite32(value, phy->iobase + addr);
7797 +       tpm_tis_iowrite32(value, phy->iobase, addr);
7798  
7799         return 0;
7800  }
7801 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/tcb_clksrc.c linux-4.14/drivers/clocksource/tcb_clksrc.c
7802 --- linux-4.14.orig/drivers/clocksource/tcb_clksrc.c    2017-11-12 19:46:13.000000000 +0100
7803 +++ linux-4.14/drivers/clocksource/tcb_clksrc.c 2018-09-05 11:05:07.000000000 +0200
7804 @@ -25,8 +25,7 @@
7805   *     this 32 bit free-running counter. the second channel is not used.
7806   *
7807   *   - The third channel may be used to provide a 16-bit clockevent
7808 - *     source, used in either periodic or oneshot mode.  This runs
7809 - *     at 32 KiHZ, and can handle delays of up to two seconds.
7810 + *     source, used in either periodic or oneshot mode.
7811   *
7812   * A boot clocksource and clockevent source are also currently needed,
7813   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7814 @@ -126,6 +125,8 @@
7815  struct tc_clkevt_device {
7816         struct clock_event_device       clkevt;
7817         struct clk                      *clk;
7818 +       bool                            clk_enabled;
7819 +       u32                             freq;
7820         void __iomem                    *regs;
7821  };
7822  
7823 @@ -134,15 +135,26 @@
7824         return container_of(clkevt, struct tc_clkevt_device, clkevt);
7825  }
7826  
7827 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7828 - * because using one of the divided clocks would usually mean the
7829 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7830 - *
7831 - * A divided clock could be good for high resolution timers, since
7832 - * 30.5 usec resolution can seem "low".
7833 - */
7834  static u32 timer_clock;
7835  
7836 +static void tc_clk_disable(struct clock_event_device *d)
7837 +{
7838 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7839 +
7840 +       clk_disable(tcd->clk);
7841 +       tcd->clk_enabled = false;
7842 +}
7843 +
7844 +static void tc_clk_enable(struct clock_event_device *d)
7845 +{
7846 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7847 +
7848 +       if (tcd->clk_enabled)
7849 +               return;
7850 +       clk_enable(tcd->clk);
7851 +       tcd->clk_enabled = true;
7852 +}
7853 +
7854  static int tc_shutdown(struct clock_event_device *d)
7855  {
7856         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7857 @@ -150,8 +162,14 @@
7858  
7859         writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7860         writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7861 +       return 0;
7862 +}
7863 +
7864 +static int tc_shutdown_clk_off(struct clock_event_device *d)
7865 +{
7866 +       tc_shutdown(d);
7867         if (!clockevent_state_detached(d))
7868 -               clk_disable(tcd->clk);
7869 +               tc_clk_disable(d);
7870  
7871         return 0;
7872  }
7873 @@ -164,9 +182,9 @@
7874         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
7875                 tc_shutdown(d);
7876  
7877 -       clk_enable(tcd->clk);
7878 +       tc_clk_enable(d);
7879  
7880 -       /* slow clock, count up to RC, then irq and stop */
7881 +       /* count up to RC, then irq and stop */
7882         writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
7883                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
7884         writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7885 @@ -186,12 +204,12 @@
7886         /* By not making the gentime core emulate periodic mode on top
7887          * of oneshot, we get lower overhead and improved accuracy.
7888          */
7889 -       clk_enable(tcd->clk);
7890 +       tc_clk_enable(d);
7891  
7892 -       /* slow clock, count up to RC, then irq and restart */
7893 +       /* count up to RC, then irq and restart */
7894         writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
7895                      regs + ATMEL_TC_REG(2, CMR));
7896 -       writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7897 +       writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
7898  
7899         /* Enable clock and interrupts on RC compare */
7900         writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
7901 @@ -218,9 +236,13 @@
7902                 .features               = CLOCK_EVT_FEAT_PERIODIC |
7903                                           CLOCK_EVT_FEAT_ONESHOT,
7904                 /* Should be lower than at91rm9200's system timer */
7905 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7906                 .rating                 = 125,
7907 +#else
7908 +               .rating                 = 200,
7909 +#endif
7910                 .set_next_event         = tc_next_event,
7911 -               .set_state_shutdown     = tc_shutdown,
7912 +               .set_state_shutdown     = tc_shutdown_clk_off,
7913                 .set_state_periodic     = tc_set_periodic,
7914                 .set_state_oneshot      = tc_set_oneshot,
7915         },
7916 @@ -240,8 +262,9 @@
7917         return IRQ_NONE;
7918  }
7919  
7920 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
7921 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
7922  {
7923 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
7924         int ret;
7925         struct clk *t2_clk = tc->clk[2];
7926         int irq = tc->irq[2];
7927 @@ -262,7 +285,11 @@
7928         clkevt.regs = tc->regs;
7929         clkevt.clk = t2_clk;
7930  
7931 -       timer_clock = clk32k_divisor_idx;
7932 +       timer_clock = divisor_idx;
7933 +       if (!divisor)
7934 +               clkevt.freq = 32768;
7935 +       else
7936 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
7937  
7938         clkevt.clkevt.cpumask = cpumask_of(0);
7939  
7940 @@ -273,7 +300,7 @@
7941                 return ret;
7942         }
7943  
7944 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
7945 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
7946  
7947         return ret;
7948  }
7949 @@ -410,7 +437,11 @@
7950                 goto err_disable_t1;
7951  
7952         /* channel 2:  periodic and oneshot timer support */
7953 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
7954         ret = setup_clkevents(tc, clk32k_divisor_idx);
7955 +#else
7956 +       ret = setup_clkevents(tc, best_divisor_idx);
7957 +#endif
7958         if (ret)
7959                 goto err_unregister_clksrc;
7960  
7961 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c linux-4.14/drivers/clocksource/timer-atmel-pit.c
7962 --- linux-4.14.orig/drivers/clocksource/timer-atmel-pit.c       2017-11-12 19:46:13.000000000 +0100
7963 +++ linux-4.14/drivers/clocksource/timer-atmel-pit.c    2018-09-05 11:05:07.000000000 +0200
7964 @@ -46,6 +46,7 @@
7965         u32             cycle;
7966         u32             cnt;
7967         unsigned int    irq;
7968 +       bool            irq_requested;
7969         struct clk      *mck;
7970  };
7971  
7972 @@ -96,15 +97,29 @@
7973  
7974         /* disable irq, leaving the clocksource active */
7975         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
7976 +       if (data->irq_requested) {
7977 +               free_irq(data->irq, data);
7978 +               data->irq_requested = false;
7979 +       }
7980         return 0;
7981  }
7982  
7983 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
7984  /*
7985   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
7986   */
7987  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
7988  {
7989         struct pit_data *data = clkevt_to_pit_data(dev);
7990 +       int ret;
7991 +
7992 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
7993 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
7994 +                         "at91_tick", data);
7995 +       if (ret)
7996 +               panic(pr_fmt("Unable to setup IRQ\n"));
7997 +
7998 +       data->irq_requested = true;
7999  
8000         /* update clocksource counter */
8001         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8002 @@ -230,15 +245,6 @@
8003                 return ret;
8004         }
8005  
8006 -       /* Set up irq handler */
8007 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8008 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8009 -                         "at91_tick", data);
8010 -       if (ret) {
8011 -               pr_err("Unable to setup IRQ\n");
8012 -               return ret;
8013 -       }
8014 -
8015         /* Set up and register clockevents */
8016         data->clkevt.name = "pit";
8017         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8018 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/clocksource/timer-atmel-st.c linux-4.14/drivers/clocksource/timer-atmel-st.c
8019 --- linux-4.14.orig/drivers/clocksource/timer-atmel-st.c        2017-11-12 19:46:13.000000000 +0100
8020 +++ linux-4.14/drivers/clocksource/timer-atmel-st.c     2018-09-05 11:05:07.000000000 +0200
8021 @@ -115,18 +115,29 @@
8022         last_crtr = read_CRTR();
8023  }
8024  
8025 +static int atmel_st_irq;
8026 +
8027  static int clkevt32k_shutdown(struct clock_event_device *evt)
8028  {
8029         clkdev32k_disable_and_flush_irq();
8030         irqmask = 0;
8031         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8032 +       free_irq(atmel_st_irq, regmap_st);
8033         return 0;
8034  }
8035  
8036  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8037  {
8038 +       int ret;
8039 +
8040         clkdev32k_disable_and_flush_irq();
8041  
8042 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8043 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8044 +                         "at91_tick", regmap_st);
8045 +       if (ret)
8046 +               panic(pr_fmt("Unable to setup IRQ\n"));
8047 +
8048         /*
8049          * ALM for oneshot irqs, set by next_event()
8050          * before 32 seconds have passed.
8051 @@ -139,8 +150,16 @@
8052  
8053  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8054  {
8055 +       int ret;
8056 +
8057         clkdev32k_disable_and_flush_irq();
8058  
8059 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8060 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8061 +                         "at91_tick", regmap_st);
8062 +       if (ret)
8063 +               panic(pr_fmt("Unable to setup IRQ\n"));
8064 +
8065         /* PIT for periodic irqs; fixed rate of 1/HZ */
8066         irqmask = AT91_ST_PITS;
8067         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8068 @@ -198,7 +217,7 @@
8069  {
8070         struct clk *sclk;
8071         unsigned int sclk_rate, val;
8072 -       int irq, ret;
8073 +       int ret;
8074  
8075         regmap_st = syscon_node_to_regmap(node);
8076         if (IS_ERR(regmap_st)) {
8077 @@ -212,21 +231,12 @@
8078         regmap_read(regmap_st, AT91_ST_SR, &val);
8079  
8080         /* Get the interrupts property */
8081 -       irq  = irq_of_parse_and_map(node, 0);
8082 -       if (!irq) {
8083 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8084 +       if (!atmel_st_irq) {
8085                 pr_err("Unable to get IRQ from DT\n");
8086                 return -EINVAL;
8087         }
8088  
8089 -       /* Make IRQs happen for the system timer */
8090 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8091 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8092 -                         "at91_tick", regmap_st);
8093 -       if (ret) {
8094 -               pr_err("Unable to setup IRQ\n");
8095 -               return ret;
8096 -       }
8097 -
8098         sclk = of_clk_get(node, 0);
8099         if (IS_ERR(sclk)) {
8100                 pr_err("Unable to get slow clock\n");
8101 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/connector/cn_proc.c linux-4.14/drivers/connector/cn_proc.c
8102 --- linux-4.14.orig/drivers/connector/cn_proc.c 2017-11-12 19:46:13.000000000 +0100
8103 +++ linux-4.14/drivers/connector/cn_proc.c      2018-09-05 11:05:07.000000000 +0200
8104 @@ -32,6 +32,7 @@
8105  #include <linux/pid_namespace.h>
8106  
8107  #include <linux/cn_proc.h>
8108 +#include <linux/locallock.h>
8109  
8110  /*
8111   * Size of a cn_msg followed by a proc_event structure.  Since the
8112 @@ -54,10 +55,11 @@
8113  
8114  /* proc_event_counts is used as the sequence number of the netlink message */
8115  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8116 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8117  
8118  static inline void send_msg(struct cn_msg *msg)
8119  {
8120 -       preempt_disable();
8121 +       local_lock(send_msg_lock);
8122  
8123         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8124         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8125 @@ -70,7 +72,7 @@
8126          */
8127         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8128  
8129 -       preempt_enable();
8130 +       local_unlock(send_msg_lock);
8131  }
8132  
8133  void proc_fork_connector(struct task_struct *task)
8134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/cpufreq/Kconfig.x86 linux-4.14/drivers/cpufreq/Kconfig.x86
8135 --- linux-4.14.orig/drivers/cpufreq/Kconfig.x86 2017-11-12 19:46:13.000000000 +0100
8136 +++ linux-4.14/drivers/cpufreq/Kconfig.x86      2018-09-05 11:05:07.000000000 +0200
8137 @@ -125,7 +125,7 @@
8138  
8139  config X86_POWERNOW_K8
8140         tristate "AMD Opteron/Athlon64 PowerNow!"
8141 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8142 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8143         help
8144           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8145           Support for K10 and newer processors is now in acpi-cpufreq.
8146 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c
8147 --- linux-4.14.orig/drivers/gpu/drm/i915/i915_gem_timeline.c    2017-11-12 19:46:13.000000000 +0100
8148 +++ linux-4.14/drivers/gpu/drm/i915/i915_gem_timeline.c 2018-09-05 11:05:07.000000000 +0200
8149 @@ -33,11 +33,8 @@
8150  {
8151         tl->fence_context = context;
8152         tl->common = parent;
8153 -#ifdef CONFIG_DEBUG_SPINLOCK
8154 -       __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8155 -#else
8156         spin_lock_init(&tl->lock);
8157 -#endif
8158 +       lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8159         init_request_active(&tl->last_request, NULL);
8160         INIT_LIST_HEAD(&tl->requests);
8161         i915_syncmap_init(&tl->sync);
8162 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c linux-4.14/drivers/gpu/drm/i915/i915_irq.c
8163 --- linux-4.14.orig/drivers/gpu/drm/i915/i915_irq.c     2018-09-05 11:03:21.000000000 +0200
8164 +++ linux-4.14/drivers/gpu/drm/i915/i915_irq.c  2018-09-05 11:05:07.000000000 +0200
8165 @@ -867,6 +867,7 @@
8166         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8167  
8168         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8169 +       preempt_disable_rt();
8170  
8171         /* Get optional system timestamp before query. */
8172         if (stime)
8173 @@ -918,6 +919,7 @@
8174                 *etime = ktime_get();
8175  
8176         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8177 +       preempt_enable_rt();
8178  
8179         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8180  
8181 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c linux-4.14/drivers/gpu/drm/i915/intel_sprite.c
8182 --- linux-4.14.orig/drivers/gpu/drm/i915/intel_sprite.c 2018-09-05 11:03:21.000000000 +0200
8183 +++ linux-4.14/drivers/gpu/drm/i915/intel_sprite.c      2018-09-05 11:05:07.000000000 +0200
8184 @@ -36,6 +36,7 @@
8185  #include <drm/drm_rect.h>
8186  #include <drm/drm_atomic.h>
8187  #include <drm/drm_plane_helper.h>
8188 +#include <linux/locallock.h>
8189  #include "intel_drv.h"
8190  #include "intel_frontbuffer.h"
8191  #include <drm/i915_drm.h>
8192 @@ -67,7 +68,7 @@
8193  }
8194  
8195  #define VBLANK_EVASION_TIME_US 100
8196 -
8197 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8198  /**
8199   * intel_pipe_update_start() - start update of a set of display registers
8200   * @crtc: the crtc of which the registers are going to be updated
8201 @@ -102,7 +103,7 @@
8202                                                       VBLANK_EVASION_TIME_US);
8203         max = vblank_start - 1;
8204  
8205 -       local_irq_disable();
8206 +       local_lock_irq(pipe_update_lock);
8207  
8208         if (min <= 0 || max <= 0)
8209                 return;
8210 @@ -132,11 +133,11 @@
8211                         break;
8212                 }
8213  
8214 -               local_irq_enable();
8215 +               local_unlock_irq(pipe_update_lock);
8216  
8217                 timeout = schedule_timeout(timeout);
8218  
8219 -               local_irq_disable();
8220 +               local_lock_irq(pipe_update_lock);
8221         }
8222  
8223         finish_wait(wq, &wait);
8224 @@ -201,7 +202,7 @@
8225                 crtc->base.state->event = NULL;
8226         }
8227  
8228 -       local_irq_enable();
8229 +       local_unlock_irq(pipe_update_lock);
8230  
8231         if (intel_vgpu_active(dev_priv))
8232                 return;
8233 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c linux-4.14/drivers/gpu/drm/radeon/radeon_display.c
8234 --- linux-4.14.orig/drivers/gpu/drm/radeon/radeon_display.c     2017-11-12 19:46:13.000000000 +0100
8235 +++ linux-4.14/drivers/gpu/drm/radeon/radeon_display.c  2018-09-05 11:05:07.000000000 +0200
8236 @@ -1839,6 +1839,7 @@
8237         struct radeon_device *rdev = dev->dev_private;
8238  
8239         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8240 +       preempt_disable_rt();
8241  
8242         /* Get optional system timestamp before query. */
8243         if (stime)
8244 @@ -1931,6 +1932,7 @@
8245                 *etime = ktime_get();
8246  
8247         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8248 +       preempt_enable_rt();
8249  
8250         /* Decode into vertical and horizontal scanout position. */
8251         *vpos = position & 0x1fff;
8252 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/hv/vmbus_drv.c linux-4.14/drivers/hv/vmbus_drv.c
8253 --- linux-4.14.orig/drivers/hv/vmbus_drv.c      2018-09-05 11:03:21.000000000 +0200
8254 +++ linux-4.14/drivers/hv/vmbus_drv.c   2018-09-05 11:05:37.000000000 +0200
8255 @@ -39,6 +39,7 @@
8256  #include <asm/hyperv.h>
8257  #include <asm/hypervisor.h>
8258  #include <asm/mshyperv.h>
8259 +#include <asm/irq_regs.h>
8260  #include <linux/notifier.h>
8261  #include <linux/ptrace.h>
8262  #include <linux/screen_info.h>
8263 @@ -966,6 +967,8 @@
8264         void *page_addr = hv_cpu->synic_event_page;
8265         struct hv_message *msg;
8266         union hv_synic_event_flags *event;
8267 +       struct pt_regs *regs = get_irq_regs();
8268 +       u64 ip = regs ? instruction_pointer(regs) : 0;
8269         bool handled = false;
8270  
8271         if (unlikely(page_addr == NULL))
8272 @@ -1009,7 +1012,7 @@
8273                         tasklet_schedule(&hv_cpu->msg_dpc);
8274         }
8275  
8276 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8277 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
8278  }
8279  
8280  
8281 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/alim15x3.c linux-4.14/drivers/ide/alim15x3.c
8282 --- linux-4.14.orig/drivers/ide/alim15x3.c      2017-11-12 19:46:13.000000000 +0100
8283 +++ linux-4.14/drivers/ide/alim15x3.c   2018-09-05 11:05:07.000000000 +0200
8284 @@ -234,7 +234,7 @@
8285  
8286         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8287  
8288 -       local_irq_save(flags);
8289 +       local_irq_save_nort(flags);
8290  
8291         if (m5229_revision < 0xC2) {
8292                 /*
8293 @@ -325,7 +325,7 @@
8294         }
8295         pci_dev_put(north);
8296         pci_dev_put(isa_dev);
8297 -       local_irq_restore(flags);
8298 +       local_irq_restore_nort(flags);
8299         return 0;
8300  }
8301  
8302 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/hpt366.c linux-4.14/drivers/ide/hpt366.c
8303 --- linux-4.14.orig/drivers/ide/hpt366.c        2017-11-12 19:46:13.000000000 +0100
8304 +++ linux-4.14/drivers/ide/hpt366.c     2018-09-05 11:05:07.000000000 +0200
8305 @@ -1236,7 +1236,7 @@
8306  
8307         dma_old = inb(base + 2);
8308  
8309 -       local_irq_save(flags);
8310 +       local_irq_save_nort(flags);
8311  
8312         dma_new = dma_old;
8313         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8314 @@ -1247,7 +1247,7 @@
8315         if (dma_new != dma_old)
8316                 outb(dma_new, base + 2);
8317  
8318 -       local_irq_restore(flags);
8319 +       local_irq_restore_nort(flags);
8320  
8321         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8322                          hwif->name, base, base + 7);
8323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io.c linux-4.14/drivers/ide/ide-io.c
8324 --- linux-4.14.orig/drivers/ide/ide-io.c        2017-11-12 19:46:13.000000000 +0100
8325 +++ linux-4.14/drivers/ide/ide-io.c     2018-09-05 11:05:07.000000000 +0200
8326 @@ -660,7 +660,7 @@
8327                 /* disable_irq_nosync ?? */
8328                 disable_irq(hwif->irq);
8329                 /* local CPU only, as if we were handling an interrupt */
8330 -               local_irq_disable();
8331 +               local_irq_disable_nort();
8332                 if (hwif->polling) {
8333                         startstop = handler(drive);
8334                 } else if (drive_is_ready(drive)) {
8335 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-iops.c linux-4.14/drivers/ide/ide-iops.c
8336 --- linux-4.14.orig/drivers/ide/ide-iops.c      2017-11-12 19:46:13.000000000 +0100
8337 +++ linux-4.14/drivers/ide/ide-iops.c   2018-09-05 11:05:07.000000000 +0200
8338 @@ -129,12 +129,12 @@
8339                                 if ((stat & ATA_BUSY) == 0)
8340                                         break;
8341  
8342 -                               local_irq_restore(flags);
8343 +                               local_irq_restore_nort(flags);
8344                                 *rstat = stat;
8345                                 return -EBUSY;
8346                         }
8347                 }
8348 -               local_irq_restore(flags);
8349 +               local_irq_restore_nort(flags);
8350         }
8351         /*
8352          * Allow status to settle, then read it again.
8353 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-io-std.c linux-4.14/drivers/ide/ide-io-std.c
8354 --- linux-4.14.orig/drivers/ide/ide-io-std.c    2017-11-12 19:46:13.000000000 +0100
8355 +++ linux-4.14/drivers/ide/ide-io-std.c 2018-09-05 11:05:07.000000000 +0200
8356 @@ -175,7 +175,7 @@
8357                 unsigned long uninitialized_var(flags);
8358  
8359                 if ((io_32bit & 2) && !mmio) {
8360 -                       local_irq_save(flags);
8361 +                       local_irq_save_nort(flags);
8362                         ata_vlb_sync(io_ports->nsect_addr);
8363                 }
8364  
8365 @@ -186,7 +186,7 @@
8366                         insl(data_addr, buf, words);
8367  
8368                 if ((io_32bit & 2) && !mmio)
8369 -                       local_irq_restore(flags);
8370 +                       local_irq_restore_nort(flags);
8371  
8372                 if (((len + 1) & 3) < 2)
8373                         return;
8374 @@ -219,7 +219,7 @@
8375                 unsigned long uninitialized_var(flags);
8376  
8377                 if ((io_32bit & 2) && !mmio) {
8378 -                       local_irq_save(flags);
8379 +                       local_irq_save_nort(flags);
8380                         ata_vlb_sync(io_ports->nsect_addr);
8381                 }
8382  
8383 @@ -230,7 +230,7 @@
8384                         outsl(data_addr, buf, words);
8385  
8386                 if ((io_32bit & 2) && !mmio)
8387 -                       local_irq_restore(flags);
8388 +                       local_irq_restore_nort(flags);
8389  
8390                 if (((len + 1) & 3) < 2)
8391                         return;
8392 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-probe.c linux-4.14/drivers/ide/ide-probe.c
8393 --- linux-4.14.orig/drivers/ide/ide-probe.c     2017-11-12 19:46:13.000000000 +0100
8394 +++ linux-4.14/drivers/ide/ide-probe.c  2018-09-05 11:05:07.000000000 +0200
8395 @@ -196,10 +196,10 @@
8396         int bswap = 1;
8397  
8398         /* local CPU only; some systems need this */
8399 -       local_irq_save(flags);
8400 +       local_irq_save_nort(flags);
8401         /* read 512 bytes of id info */
8402         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8403 -       local_irq_restore(flags);
8404 +       local_irq_restore_nort(flags);
8405  
8406         drive->dev_flags |= IDE_DFLAG_ID_READ;
8407  #ifdef DEBUG
8408 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/ide/ide-taskfile.c linux-4.14/drivers/ide/ide-taskfile.c
8409 --- linux-4.14.orig/drivers/ide/ide-taskfile.c  2017-11-12 19:46:13.000000000 +0100
8410 +++ linux-4.14/drivers/ide/ide-taskfile.c       2018-09-05 11:05:07.000000000 +0200
8411 @@ -251,7 +251,7 @@
8412  
8413                 page_is_high = PageHighMem(page);
8414                 if (page_is_high)
8415 -                       local_irq_save(flags);
8416 +                       local_irq_save_nort(flags);
8417  
8418                 buf = kmap_atomic(page) + offset;
8419  
8420 @@ -272,7 +272,7 @@
8421                 kunmap_atomic(buf);
8422  
8423                 if (page_is_high)
8424 -                       local_irq_restore(flags);
8425 +                       local_irq_restore_nort(flags);
8426  
8427                 len -= nr_bytes;
8428         }
8429 @@ -415,7 +415,7 @@
8430         }
8431  
8432         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8433 -               local_irq_disable();
8434 +               local_irq_disable_nort();
8435  
8436         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8437  
8438 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c linux-4.14/drivers/infiniband/hw/hfi1/affinity.c
8439 --- linux-4.14.orig/drivers/infiniband/hw/hfi1/affinity.c       2018-09-05 11:03:22.000000000 +0200
8440 +++ linux-4.14/drivers/infiniband/hw/hfi1/affinity.c    2018-09-05 11:05:07.000000000 +0200
8441 @@ -575,7 +575,7 @@
8442         struct hfi1_affinity_node *entry;
8443         cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8444         const struct cpumask *node_mask,
8445 -               *proc_mask = &current->cpus_allowed;
8446 +               *proc_mask = current->cpus_ptr;
8447         struct hfi1_affinity_node_list *affinity = &node_affinity;
8448         struct cpu_mask_set *set = &affinity->proc;
8449  
8450 @@ -583,7 +583,7 @@
8451          * check whether process/context affinity has already
8452          * been set
8453          */
8454 -       if (cpumask_weight(proc_mask) == 1) {
8455 +       if (current->nr_cpus_allowed == 1) {
8456                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8457                           current->pid, current->comm,
8458                           cpumask_pr_args(proc_mask));
8459 @@ -594,7 +594,7 @@
8460                 cpu = cpumask_first(proc_mask);
8461                 cpumask_set_cpu(cpu, &set->used);
8462                 goto done;
8463 -       } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8464 +       } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8465                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8466                           current->pid, current->comm,
8467                           cpumask_pr_args(proc_mask));
8468 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c linux-4.14/drivers/infiniband/hw/hfi1/sdma.c
8469 --- linux-4.14.orig/drivers/infiniband/hw/hfi1/sdma.c   2017-11-12 19:46:13.000000000 +0100
8470 +++ linux-4.14/drivers/infiniband/hw/hfi1/sdma.c        2018-09-05 11:05:07.000000000 +0200
8471 @@ -856,14 +856,13 @@
8472  {
8473         struct sdma_rht_node *rht_node;
8474         struct sdma_engine *sde = NULL;
8475 -       const struct cpumask *current_mask = &current->cpus_allowed;
8476         unsigned long cpu_id;
8477  
8478         /*
8479          * To ensure that always the same sdma engine(s) will be
8480          * selected make sure the process is pinned to this CPU only.
8481          */
8482 -       if (cpumask_weight(current_mask) != 1)
8483 +       if (current->nr_cpus_allowed != 1)
8484                 goto out;
8485  
8486         cpu_id = smp_processor_id();
8487 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c
8488 --- linux-4.14.orig/drivers/infiniband/hw/qib/qib_file_ops.c    2018-09-05 11:03:22.000000000 +0200
8489 +++ linux-4.14/drivers/infiniband/hw/qib/qib_file_ops.c 2018-09-05 11:05:07.000000000 +0200
8490 @@ -1167,7 +1167,7 @@
8491  static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8492  {
8493         struct qib_filedata *fd = fp->private_data;
8494 -       const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8495 +       const unsigned int weight = current->nr_cpus_allowed;
8496         const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8497         int local_cpu;
8498  
8499 @@ -1648,9 +1648,8 @@
8500                 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8501         else {
8502                 int unit;
8503 -               const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8504 -               const unsigned int weight =
8505 -                       cpumask_weight(&current->cpus_allowed);
8506 +               const unsigned int cpu = cpumask_first(current->cpus_ptr);
8507 +               const unsigned int weight = current->nr_cpus_allowed;
8508  
8509                 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8510                         if (!find_hca(cpu, &unit) && unit >= 0)
8511 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8512 --- linux-4.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c      2018-09-05 11:03:22.000000000 +0200
8513 +++ linux-4.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c   2018-09-05 11:05:07.000000000 +0200
8514 @@ -898,7 +898,7 @@
8515  
8516         ipoib_dbg_mcast(priv, "restarting multicast task\n");
8517  
8518 -       local_irq_save(flags);
8519 +       local_irq_save_nort(flags);
8520         netif_addr_lock(dev);
8521         spin_lock(&priv->lock);
8522  
8523 @@ -980,7 +980,7 @@
8524  
8525         spin_unlock(&priv->lock);
8526         netif_addr_unlock(dev);
8527 -       local_irq_restore(flags);
8528 +       local_irq_restore_nort(flags);
8529  
8530         ipoib_mcast_remove_list(&remove_list);
8531  
8532 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/input/gameport/gameport.c linux-4.14/drivers/input/gameport/gameport.c
8533 --- linux-4.14.orig/drivers/input/gameport/gameport.c   2017-11-12 19:46:13.000000000 +0100
8534 +++ linux-4.14/drivers/input/gameport/gameport.c        2018-09-05 11:05:07.000000000 +0200
8535 @@ -91,13 +91,13 @@
8536         tx = ~0;
8537  
8538         for (i = 0; i < 50; i++) {
8539 -               local_irq_save(flags);
8540 +               local_irq_save_nort(flags);
8541                 t1 = ktime_get_ns();
8542                 for (t = 0; t < 50; t++)
8543                         gameport_read(gameport);
8544                 t2 = ktime_get_ns();
8545                 t3 = ktime_get_ns();
8546 -               local_irq_restore(flags);
8547 +               local_irq_restore_nort(flags);
8548                 udelay(i * 10);
8549                 t = (t2 - t1) - (t3 - t2);
8550                 if (t < tx)
8551 @@ -124,12 +124,12 @@
8552         tx = 1 << 30;
8553  
8554         for(i = 0; i < 50; i++) {
8555 -               local_irq_save(flags);
8556 +               local_irq_save_nort(flags);
8557                 GET_TIME(t1);
8558                 for (t = 0; t < 50; t++) gameport_read(gameport);
8559                 GET_TIME(t2);
8560                 GET_TIME(t3);
8561 -               local_irq_restore(flags);
8562 +               local_irq_restore_nort(flags);
8563                 udelay(i * 10);
8564                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8565         }
8566 @@ -148,11 +148,11 @@
8567         tx = 1 << 30;
8568  
8569         for(i = 0; i < 50; i++) {
8570 -               local_irq_save(flags);
8571 +               local_irq_save_nort(flags);
8572                 t1 = rdtsc();
8573                 for (t = 0; t < 50; t++) gameport_read(gameport);
8574                 t2 = rdtsc();
8575 -               local_irq_restore(flags);
8576 +               local_irq_restore_nort(flags);
8577                 udelay(i * 10);
8578                 if (t2 - t1 < tx) tx = t2 - t1;
8579         }
8580 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu.c linux-4.14/drivers/iommu/amd_iommu.c
8581 --- linux-4.14.orig/drivers/iommu/amd_iommu.c   2018-09-05 11:03:22.000000000 +0200
8582 +++ linux-4.14/drivers/iommu/amd_iommu.c        2018-09-05 11:05:07.000000000 +0200
8583 @@ -81,11 +81,12 @@
8584   */
8585  #define AMD_IOMMU_PGSIZES      ((~0xFFFUL) & ~(2ULL << 38))
8586  
8587 -static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8588 +static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8589 +static DEFINE_SPINLOCK(pd_bitmap_lock);
8590 +static DEFINE_SPINLOCK(iommu_table_lock);
8591  
8592  /* List of all available dev_data structures */
8593 -static LIST_HEAD(dev_data_list);
8594 -static DEFINE_SPINLOCK(dev_data_list_lock);
8595 +static LLIST_HEAD(dev_data_list);
8596  
8597  LIST_HEAD(ioapic_map);
8598  LIST_HEAD(hpet_map);
8599 @@ -204,40 +205,33 @@
8600  static struct iommu_dev_data *alloc_dev_data(u16 devid)
8601  {
8602         struct iommu_dev_data *dev_data;
8603 -       unsigned long flags;
8604  
8605         dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8606         if (!dev_data)
8607                 return NULL;
8608  
8609         dev_data->devid = devid;
8610 -
8611 -       spin_lock_irqsave(&dev_data_list_lock, flags);
8612 -       list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8613 -       spin_unlock_irqrestore(&dev_data_list_lock, flags);
8614 -
8615         ratelimit_default_init(&dev_data->rs);
8616  
8617 +       llist_add(&dev_data->dev_data_list, &dev_data_list);
8618         return dev_data;
8619  }
8620  
8621  static struct iommu_dev_data *search_dev_data(u16 devid)
8622  {
8623         struct iommu_dev_data *dev_data;
8624 -       unsigned long flags;
8625 +       struct llist_node *node;
8626  
8627 -       spin_lock_irqsave(&dev_data_list_lock, flags);
8628 -       list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8629 +       if (llist_empty(&dev_data_list))
8630 +               return NULL;
8631 +
8632 +       node = dev_data_list.first;
8633 +       llist_for_each_entry(dev_data, node, dev_data_list) {
8634                 if (dev_data->devid == devid)
8635 -                       goto out_unlock;
8636 +                       return dev_data;
8637         }
8638  
8639 -       dev_data = NULL;
8640 -
8641 -out_unlock:
8642 -       spin_unlock_irqrestore(&dev_data_list_lock, flags);
8643 -
8644 -       return dev_data;
8645 +       return NULL;
8646  }
8647  
8648  static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8649 @@ -1056,9 +1050,9 @@
8650         unsigned long flags;
8651         int ret;
8652  
8653 -       spin_lock_irqsave(&iommu->lock, flags);
8654 +       raw_spin_lock_irqsave(&iommu->lock, flags);
8655         ret = __iommu_queue_command_sync(iommu, cmd, sync);
8656 -       spin_unlock_irqrestore(&iommu->lock, flags);
8657 +       raw_spin_unlock_irqrestore(&iommu->lock, flags);
8658  
8659         return ret;
8660  }
8661 @@ -1084,7 +1078,7 @@
8662  
8663         build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8664  
8665 -       spin_lock_irqsave(&iommu->lock, flags);
8666 +       raw_spin_lock_irqsave(&iommu->lock, flags);
8667  
8668         iommu->cmd_sem = 0;
8669  
8670 @@ -1095,7 +1089,7 @@
8671         ret = wait_on_sem(&iommu->cmd_sem);
8672  
8673  out_unlock:
8674 -       spin_unlock_irqrestore(&iommu->lock, flags);
8675 +       raw_spin_unlock_irqrestore(&iommu->lock, flags);
8676  
8677         return ret;
8678  }
8679 @@ -1604,29 +1598,26 @@
8680  
8681  static u16 domain_id_alloc(void)
8682  {
8683 -       unsigned long flags;
8684         int id;
8685  
8686 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8687 +       spin_lock(&pd_bitmap_lock);
8688         id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8689         BUG_ON(id == 0);
8690         if (id > 0 && id < MAX_DOMAIN_ID)
8691                 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8692         else
8693                 id = 0;
8694 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8695 +       spin_unlock(&pd_bitmap_lock);
8696  
8697         return id;
8698  }
8699  
8700  static void domain_id_free(int id)
8701  {
8702 -       unsigned long flags;
8703 -
8704 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8705 +       spin_lock(&pd_bitmap_lock);
8706         if (id > 0 && id < MAX_DOMAIN_ID)
8707                 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8708 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8709 +       spin_unlock(&pd_bitmap_lock);
8710  }
8711  
8712  #define DEFINE_FREE_PT_FN(LVL, FN)                             \
8713 @@ -1946,10 +1937,10 @@
8714         int ret;
8715  
8716         /*
8717 -        * Must be called with IRQs disabled. Warn here to detect early
8718 -        * when its not.
8719 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8720 +        * detect early when it's not.
8721          */
8722 -       WARN_ON(!irqs_disabled());
8723 +       WARN_ON_NONRT(!irqs_disabled());
8724  
8725         /* lock domain */
8726         spin_lock(&domain->lock);
8727 @@ -2095,9 +2086,9 @@
8728         }
8729  
8730  skip_ats_check:
8731 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8732 +       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8733         ret = __attach_device(dev_data, domain);
8734 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8735 +       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8736  
8737         /*
8738          * We might boot into a crash-kernel here. The crashed kernel
8739 @@ -2117,10 +2108,10 @@
8740         struct protection_domain *domain;
8741  
8742         /*
8743 -        * Must be called with IRQs disabled. Warn here to detect early
8744 -        * when its not.
8745 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8746 +        * detect early when it's not.
8747          */
8748 -       WARN_ON(!irqs_disabled());
8749 +       WARN_ON_NONRT(!irqs_disabled());
8750  
8751         if (WARN_ON(!dev_data->domain))
8752                 return;
8753 @@ -2147,9 +2138,9 @@
8754         domain   = dev_data->domain;
8755  
8756         /* lock device table */
8757 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8758 +       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8759         __detach_device(dev_data);
8760 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8761 +       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8762  
8763         if (!dev_is_pci(dev))
8764                 return;
8765 @@ -2813,7 +2804,7 @@
8766         struct iommu_dev_data *entry;
8767         unsigned long flags;
8768  
8769 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8770 +       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8771  
8772         while (!list_empty(&domain->dev_list)) {
8773                 entry = list_first_entry(&domain->dev_list,
8774 @@ -2821,7 +2812,7 @@
8775                 __detach_device(entry);
8776         }
8777  
8778 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8779 +       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8780  }
8781  
8782  static void protection_domain_free(struct protection_domain *domain)
8783 @@ -3588,14 +3579,62 @@
8784         amd_iommu_dev_table[devid].data[2] = dte;
8785  }
8786  
8787 -static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8788 +static struct irq_remap_table *get_irq_table(u16 devid)
8789 +{
8790 +       struct irq_remap_table *table;
8791 +
8792 +       if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8793 +                     "%s: no iommu for devid %x\n", __func__, devid))
8794 +               return NULL;
8795 +
8796 +       table = irq_lookup_table[devid];
8797 +       if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8798 +               return NULL;
8799 +
8800 +       return table;
8801 +}
8802 +
8803 +static struct irq_remap_table *__alloc_irq_table(void)
8804 +{
8805 +       struct irq_remap_table *table;
8806 +
8807 +       table = kzalloc(sizeof(*table), GFP_KERNEL);
8808 +       if (!table)
8809 +               return NULL;
8810 +
8811 +       table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8812 +       if (!table->table) {
8813 +               kfree(table);
8814 +               return NULL;
8815 +       }
8816 +       raw_spin_lock_init(&table->lock);
8817 +
8818 +       if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8819 +               memset(table->table, 0,
8820 +                      MAX_IRQS_PER_TABLE * sizeof(u32));
8821 +       else
8822 +               memset(table->table, 0,
8823 +                      (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8824 +       return table;
8825 +}
8826 +
8827 +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
8828 +                                 struct irq_remap_table *table)
8829 +{
8830 +       irq_lookup_table[devid] = table;
8831 +       set_dte_irq_entry(devid, table);
8832 +       iommu_flush_dte(iommu, devid);
8833 +}
8834 +
8835 +static struct irq_remap_table *alloc_irq_table(u16 devid)
8836  {
8837         struct irq_remap_table *table = NULL;
8838 +       struct irq_remap_table *new_table = NULL;
8839         struct amd_iommu *iommu;
8840         unsigned long flags;
8841         u16 alias;
8842  
8843 -       write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8844 +       spin_lock_irqsave(&iommu_table_lock, flags);
8845  
8846         iommu = amd_iommu_rlookup_table[devid];
8847         if (!iommu)
8848 @@ -3608,60 +3647,45 @@
8849         alias = amd_iommu_alias_table[devid];
8850         table = irq_lookup_table[alias];
8851         if (table) {
8852 -               irq_lookup_table[devid] = table;
8853 -               set_dte_irq_entry(devid, table);
8854 -               iommu_flush_dte(iommu, devid);
8855 -               goto out;
8856 +               set_remap_table_entry(iommu, devid, table);
8857 +               goto out_wait;
8858         }
8859 +       spin_unlock_irqrestore(&iommu_table_lock, flags);
8860  
8861         /* Nothing there yet, allocate new irq remapping table */
8862 -       table = kzalloc(sizeof(*table), GFP_ATOMIC);
8863 -       if (!table)
8864 -               goto out_unlock;
8865 -
8866 -       /* Initialize table spin-lock */
8867 -       spin_lock_init(&table->lock);
8868 +       new_table = __alloc_irq_table();
8869 +       if (!new_table)
8870 +               return NULL;
8871  
8872 -       if (ioapic)
8873 -               /* Keep the first 32 indexes free for IOAPIC interrupts */
8874 -               table->min_index = 32;
8875 +       spin_lock_irqsave(&iommu_table_lock, flags);
8876  
8877 -       table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
8878 -       if (!table->table) {
8879 -               kfree(table);
8880 -               table = NULL;
8881 +       table = irq_lookup_table[devid];
8882 +       if (table)
8883                 goto out_unlock;
8884 -       }
8885 -
8886 -       if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
8887 -               memset(table->table, 0,
8888 -                      MAX_IRQS_PER_TABLE * sizeof(u32));
8889 -       else
8890 -               memset(table->table, 0,
8891 -                      (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
8892 -
8893 -       if (ioapic) {
8894 -               int i;
8895  
8896 -               for (i = 0; i < 32; ++i)
8897 -                       iommu->irte_ops->set_allocated(table, i);
8898 +       table = irq_lookup_table[alias];
8899 +       if (table) {
8900 +               set_remap_table_entry(iommu, devid, table);
8901 +               goto out_wait;
8902         }
8903  
8904 -       irq_lookup_table[devid] = table;
8905 -       set_dte_irq_entry(devid, table);
8906 -       iommu_flush_dte(iommu, devid);
8907 -       if (devid != alias) {
8908 -               irq_lookup_table[alias] = table;
8909 -               set_dte_irq_entry(alias, table);
8910 -               iommu_flush_dte(iommu, alias);
8911 -       }
8912 +       table = new_table;
8913 +       new_table = NULL;
8914  
8915 -out:
8916 +       set_remap_table_entry(iommu, devid, table);
8917 +       if (devid != alias)
8918 +               set_remap_table_entry(iommu, alias, table);
8919 +
8920 +out_wait:
8921         iommu_completion_wait(iommu);
8922  
8923  out_unlock:
8924 -       write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8925 +       spin_unlock_irqrestore(&iommu_table_lock, flags);
8926  
8927 +       if (new_table) {
8928 +               kmem_cache_free(amd_iommu_irq_cache, new_table->table);
8929 +               kfree(new_table);
8930 +       }
8931         return table;
8932  }
8933  
8934 @@ -3675,11 +3699,11 @@
8935         if (!iommu)
8936                 return -ENODEV;
8937  
8938 -       table = get_irq_table(devid, false);
8939 +       table = alloc_irq_table(devid);
8940         if (!table)
8941                 return -ENODEV;
8942  
8943 -       spin_lock_irqsave(&table->lock, flags);
8944 +       raw_spin_lock_irqsave(&table->lock, flags);
8945  
8946         /* Scan table for free entries */
8947         for (c = 0, index = table->min_index;
8948 @@ -3702,7 +3726,7 @@
8949         index = -ENOSPC;
8950  
8951  out:
8952 -       spin_unlock_irqrestore(&table->lock, flags);
8953 +       raw_spin_unlock_irqrestore(&table->lock, flags);
8954  
8955         return index;
8956  }
8957 @@ -3719,11 +3743,11 @@
8958         if (iommu == NULL)
8959                 return -EINVAL;
8960  
8961 -       table = get_irq_table(devid, false);
8962 +       table = get_irq_table(devid);
8963         if (!table)
8964                 return -ENOMEM;
8965  
8966 -       spin_lock_irqsave(&table->lock, flags);
8967 +       raw_spin_lock_irqsave(&table->lock, flags);
8968  
8969         entry = (struct irte_ga *)table->table;
8970         entry = &entry[index];
8971 @@ -3734,7 +3758,7 @@
8972         if (data)
8973                 data->ref = entry;
8974  
8975 -       spin_unlock_irqrestore(&table->lock, flags);
8976 +       raw_spin_unlock_irqrestore(&table->lock, flags);
8977  
8978         iommu_flush_irt(iommu, devid);
8979         iommu_completion_wait(iommu);
8980 @@ -3752,13 +3776,13 @@
8981         if (iommu == NULL)
8982                 return -EINVAL;
8983  
8984 -       table = get_irq_table(devid, false);
8985 +       table = get_irq_table(devid);
8986         if (!table)
8987                 return -ENOMEM;
8988  
8989 -       spin_lock_irqsave(&table->lock, flags);
8990 +       raw_spin_lock_irqsave(&table->lock, flags);
8991         table->table[index] = irte->val;
8992 -       spin_unlock_irqrestore(&table->lock, flags);
8993 +       raw_spin_unlock_irqrestore(&table->lock, flags);
8994  
8995         iommu_flush_irt(iommu, devid);
8996         iommu_completion_wait(iommu);
8997 @@ -3776,13 +3800,13 @@
8998         if (iommu == NULL)
8999                 return;
9000  
9001 -       table = get_irq_table(devid, false);
9002 +       table = get_irq_table(devid);
9003         if (!table)
9004                 return;
9005  
9006 -       spin_lock_irqsave(&table->lock, flags);
9007 +       raw_spin_lock_irqsave(&table->lock, flags);
9008         iommu->irte_ops->clear_allocated(table, index);
9009 -       spin_unlock_irqrestore(&table->lock, flags);
9010 +       raw_spin_unlock_irqrestore(&table->lock, flags);
9011  
9012         iommu_flush_irt(iommu, devid);
9013         iommu_completion_wait(iommu);
9014 @@ -3863,10 +3887,8 @@
9015                                  u8 vector, u32 dest_apicid)
9016  {
9017         struct irte_ga *irte = (struct irte_ga *) entry;
9018 -       struct iommu_dev_data *dev_data = search_dev_data(devid);
9019  
9020 -       if (!dev_data || !dev_data->use_vapic ||
9021 -           !irte->lo.fields_remap.guest_mode) {
9022 +       if (!irte->lo.fields_remap.guest_mode) {
9023                 irte->hi.fields.vector = vector;
9024                 irte->lo.fields_remap.destination = dest_apicid;
9025                 modify_irte_ga(devid, index, irte, NULL);
9026 @@ -4072,7 +4094,7 @@
9027         struct amd_ir_data *data = NULL;
9028         struct irq_cfg *cfg;
9029         int i, ret, devid;
9030 -       int index = -1;
9031 +       int index;
9032  
9033         if (!info)
9034                 return -EINVAL;
9035 @@ -4096,10 +4118,26 @@
9036                 return ret;
9037  
9038         if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9039 -               if (get_irq_table(devid, true))
9040 +               struct irq_remap_table *table;
9041 +               struct amd_iommu *iommu;
9042 +
9043 +               table = alloc_irq_table(devid);
9044 +               if (table) {
9045 +                       if (!table->min_index) {
9046 +                               /*
9047 +                                * Keep the first 32 indexes free for IOAPIC
9048 +                                * interrupts.
9049 +                                */
9050 +                               table->min_index = 32;
9051 +                               iommu = amd_iommu_rlookup_table[devid];
9052 +                               for (i = 0; i < 32; ++i)
9053 +                                       iommu->irte_ops->set_allocated(table, i);
9054 +                       }
9055 +                       WARN_ON(table->min_index != 32);
9056                         index = info->ioapic_pin;
9057 -               else
9058 -                       ret = -ENOMEM;
9059 +               } else {
9060 +                       index = -ENOMEM;
9061 +               }
9062         } else {
9063                 index = alloc_irq_index(devid, nr_irqs);
9064         }
9065 @@ -4343,7 +4381,7 @@
9066  {
9067         unsigned long flags;
9068         struct amd_iommu *iommu;
9069 -       struct irq_remap_table *irt;
9070 +       struct irq_remap_table *table;
9071         struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9072         int devid = ir_data->irq_2_irte.devid;
9073         struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9074 @@ -4357,11 +4395,11 @@
9075         if (!iommu)
9076                 return -ENODEV;
9077  
9078 -       irt = get_irq_table(devid, false);
9079 -       if (!irt)
9080 +       table = get_irq_table(devid);
9081 +       if (!table)
9082                 return -ENODEV;
9083  
9084 -       spin_lock_irqsave(&irt->lock, flags);
9085 +       raw_spin_lock_irqsave(&table->lock, flags);
9086  
9087         if (ref->lo.fields_vapic.guest_mode) {
9088                 if (cpu >= 0)
9089 @@ -4370,7 +4408,7 @@
9090                 barrier();
9091         }
9092  
9093 -       spin_unlock_irqrestore(&irt->lock, flags);
9094 +       raw_spin_unlock_irqrestore(&table->lock, flags);
9095  
9096         iommu_flush_irt(iommu, devid);
9097         iommu_completion_wait(iommu);
9098 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_init.c linux-4.14/drivers/iommu/amd_iommu_init.c
9099 --- linux-4.14.orig/drivers/iommu/amd_iommu_init.c      2017-11-12 19:46:13.000000000 +0100
9100 +++ linux-4.14/drivers/iommu/amd_iommu_init.c   2018-09-05 11:05:07.000000000 +0200
9101 @@ -1474,7 +1474,7 @@
9102  {
9103         int ret;
9104  
9105 -       spin_lock_init(&iommu->lock);
9106 +       raw_spin_lock_init(&iommu->lock);
9107  
9108         /* Add IOMMU to internal data structures */
9109         list_add_tail(&iommu->list, &amd_iommu_list);
9110 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/amd_iommu_types.h linux-4.14/drivers/iommu/amd_iommu_types.h
9111 --- linux-4.14.orig/drivers/iommu/amd_iommu_types.h     2017-11-12 19:46:13.000000000 +0100
9112 +++ linux-4.14/drivers/iommu/amd_iommu_types.h  2018-09-05 11:05:07.000000000 +0200
9113 @@ -406,7 +406,7 @@
9114  #define IRQ_TABLE_ALIGNMENT    128
9115  
9116  struct irq_remap_table {
9117 -       spinlock_t lock;
9118 +       raw_spinlock_t lock;
9119         unsigned min_index;
9120         u32 *table;
9121  };
9122 @@ -488,7 +488,7 @@
9123         int index;
9124  
9125         /* locks the accesses to the hardware */
9126 -       spinlock_t lock;
9127 +       raw_spinlock_t lock;
9128  
9129         /* Pointer to PCI device of this IOMMU */
9130         struct pci_dev *dev;
9131 @@ -625,7 +625,7 @@
9132   */
9133  struct iommu_dev_data {
9134         struct list_head list;            /* For domain->dev_list */
9135 -       struct list_head dev_data_list;   /* For global dev_data_list */
9136 +       struct llist_node dev_data_list;  /* For global dev_data_list */
9137         struct protection_domain *domain; /* Domain the device is bound to */
9138         u16 devid;                        /* PCI Device ID */
9139         u16 alias;                        /* Alias Device ID */
9140 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/iommu/iova.c linux-4.14/drivers/iommu/iova.c
9141 --- linux-4.14.orig/drivers/iommu/iova.c        2017-11-12 19:46:13.000000000 +0100
9142 +++ linux-4.14/drivers/iommu/iova.c     2018-09-05 11:05:07.000000000 +0200
9143 @@ -570,7 +570,7 @@
9144                 unsigned long pfn, unsigned long pages,
9145                 unsigned long data)
9146  {
9147 -       struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9148 +       struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9149         unsigned long flags;
9150         unsigned idx;
9151  
9152 @@ -600,8 +600,6 @@
9153         if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9154                 mod_timer(&iovad->fq_timer,
9155                           jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9156 -
9157 -       put_cpu_ptr(iovad->fq);
9158  }
9159  EXPORT_SYMBOL_GPL(queue_iova);
9160  
9161 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/leds/trigger/Kconfig linux-4.14/drivers/leds/trigger/Kconfig
9162 --- linux-4.14.orig/drivers/leds/trigger/Kconfig        2017-11-12 19:46:13.000000000 +0100
9163 +++ linux-4.14/drivers/leds/trigger/Kconfig     2018-09-05 11:05:07.000000000 +0200
9164 @@ -69,7 +69,7 @@
9165  
9166  config LEDS_TRIGGER_CPU
9167         bool "LED CPU Trigger"
9168 -       depends on LEDS_TRIGGERS
9169 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9170         help
9171           This allows LEDs to be controlled by active CPUs. This shows
9172           the active CPUs across an array of LEDs so you can see which
9173 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/bcache/Kconfig linux-4.14/drivers/md/bcache/Kconfig
9174 --- linux-4.14.orig/drivers/md/bcache/Kconfig   2017-11-12 19:46:13.000000000 +0100
9175 +++ linux-4.14/drivers/md/bcache/Kconfig        2018-09-05 11:05:07.000000000 +0200
9176 @@ -1,6 +1,7 @@
9177  
9178  config BCACHE
9179         tristate "Block device as cache"
9180 +       depends on !PREEMPT_RT_FULL
9181         ---help---
9182         Allows a block device to be used as cache for other devices; uses
9183         a btree for indexing and the layout is optimized for SSDs.
9184 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/dm-rq.c linux-4.14/drivers/md/dm-rq.c
9185 --- linux-4.14.orig/drivers/md/dm-rq.c  2017-11-12 19:46:13.000000000 +0100
9186 +++ linux-4.14/drivers/md/dm-rq.c       2018-09-05 11:05:07.000000000 +0200
9187 @@ -671,7 +671,7 @@
9188                 /* Establish tio->ti before queuing work (map_tio_request) */
9189                 tio->ti = ti;
9190                 kthread_queue_work(&md->kworker, &tio->work);
9191 -               BUG_ON(!irqs_disabled());
9192 +               BUG_ON_NONRT(!irqs_disabled());
9193         }
9194  }
9195  
9196 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.c linux-4.14/drivers/md/raid5.c
9197 --- linux-4.14.orig/drivers/md/raid5.c  2018-09-05 11:03:22.000000000 +0200
9198 +++ linux-4.14/drivers/md/raid5.c       2018-09-05 11:05:07.000000000 +0200
9199 @@ -410,7 +410,7 @@
9200                 md_wakeup_thread(conf->mddev->thread);
9201         return;
9202  slow_path:
9203 -       local_irq_save(flags);
9204 +       local_irq_save_nort(flags);
9205         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9206         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9207                 INIT_LIST_HEAD(&list);
9208 @@ -419,7 +419,7 @@
9209                 spin_unlock(&conf->device_lock);
9210                 release_inactive_stripe_list(conf, &list, hash);
9211         }
9212 -       local_irq_restore(flags);
9213 +       local_irq_restore_nort(flags);
9214  }
9215  
9216  static inline void remove_hash(struct stripe_head *sh)
9217 @@ -2067,8 +2067,9 @@
9218         struct raid5_percpu *percpu;
9219         unsigned long cpu;
9220  
9221 -       cpu = get_cpu();
9222 +       cpu = get_cpu_light();
9223         percpu = per_cpu_ptr(conf->percpu, cpu);
9224 +       spin_lock(&percpu->lock);
9225         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9226                 ops_run_biofill(sh);
9227                 overlap_clear++;
9228 @@ -2127,7 +2128,8 @@
9229                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9230                                 wake_up(&sh->raid_conf->wait_for_overlap);
9231                 }
9232 -       put_cpu();
9233 +       spin_unlock(&percpu->lock);
9234 +       put_cpu_light();
9235  }
9236  
9237  static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9238 @@ -6775,6 +6777,7 @@
9239                         __func__, cpu);
9240                 return -ENOMEM;
9241         }
9242 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9243         return 0;
9244  }
9245  
9246 @@ -6785,7 +6788,6 @@
9247         conf->percpu = alloc_percpu(struct raid5_percpu);
9248         if (!conf->percpu)
9249                 return -ENOMEM;
9250 -
9251         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9252         if (!err) {
9253                 conf->scribble_disks = max(conf->raid_disks,
9254 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/md/raid5.h linux-4.14/drivers/md/raid5.h
9255 --- linux-4.14.orig/drivers/md/raid5.h  2017-11-12 19:46:13.000000000 +0100
9256 +++ linux-4.14/drivers/md/raid5.h       2018-09-05 11:05:07.000000000 +0200
9257 @@ -624,6 +624,7 @@
9258         int                     recovery_disabled;
9259         /* per cpu variables */
9260         struct raid5_percpu {
9261 +               spinlock_t      lock;           /* Protection for -RT */
9262                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9263                 struct flex_array *scribble;   /* space for constructing buffer
9264                                               * lists and performing address
9265 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mfd/atmel-smc.c linux-4.14/drivers/mfd/atmel-smc.c
9266 --- linux-4.14.orig/drivers/mfd/atmel-smc.c     2017-11-12 19:46:13.000000000 +0100
9267 +++ linux-4.14/drivers/mfd/atmel-smc.c  2018-09-05 11:05:07.000000000 +0200
9268 @@ -12,6 +12,7 @@
9269   */
9270  
9271  #include <linux/mfd/syscon/atmel-smc.h>
9272 +#include <linux/string.h>
9273  
9274  /**
9275   * atmel_smc_cs_conf_init - initialize a SMC CS conf
9276 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/misc/Kconfig linux-4.14/drivers/misc/Kconfig
9277 --- linux-4.14.orig/drivers/misc/Kconfig        2017-11-12 19:46:13.000000000 +0100
9278 +++ linux-4.14/drivers/misc/Kconfig     2018-09-05 11:05:07.000000000 +0200
9279 @@ -54,6 +54,7 @@
9280  config ATMEL_TCLIB
9281         bool "Atmel AT32/AT91 Timer/Counter Library"
9282         depends on (AVR32 || ARCH_AT91)
9283 +       default y if PREEMPT_RT_FULL
9284         help
9285           Select this if you want a library to allocate the Timer/Counter
9286           blocks found on many Atmel processors.  This facilitates using
9287 @@ -69,8 +70,7 @@
9288           are combined to make a single 32-bit timer.
9289  
9290           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9291 -         may be used as a clock event device supporting oneshot mode
9292 -         (delays of up to two seconds) based on the 32 KiHz clock.
9293 +         may be used as a clock event device supporting oneshot mode.
9294  
9295  config ATMEL_TCB_CLKSRC_BLOCK
9296         int
9297 @@ -84,6 +84,15 @@
9298           TC can be used for other purposes, such as PWM generation and
9299           interval timing.
9300  
9301 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9302 +       bool "TC Block use 32 KiHz clock"
9303 +       depends on ATMEL_TCB_CLKSRC
9304 +       default y if !PREEMPT_RT_FULL
9305 +       help
9306 +         Select this to use 32 KiHz base clock rate as TC block clock
9307 +         source for clock events.
9308 +
9309 +
9310  config DUMMY_IRQ
9311         tristate "Dummy IRQ handler"
9312         default n
9313 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/mmc/host/mmci.c linux-4.14/drivers/mmc/host/mmci.c
9314 --- linux-4.14.orig/drivers/mmc/host/mmci.c     2017-11-12 19:46:13.000000000 +0100
9315 +++ linux-4.14/drivers/mmc/host/mmci.c  2018-09-05 11:05:07.000000000 +0200
9316 @@ -1200,15 +1200,12 @@
9317         struct sg_mapping_iter *sg_miter = &host->sg_miter;
9318         struct variant_data *variant = host->variant;
9319         void __iomem *base = host->base;
9320 -       unsigned long flags;
9321         u32 status;
9322  
9323         status = readl(base + MMCISTATUS);
9324  
9325         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9326  
9327 -       local_irq_save(flags);
9328 -
9329         do {
9330                 unsigned int remain, len;
9331                 char *buffer;
9332 @@ -1248,8 +1245,6 @@
9333  
9334         sg_miter_stop(sg_miter);
9335  
9336 -       local_irq_restore(flags);
9337 -
9338         /*
9339          * If we have less than the fifo 'half-full' threshold to transfer,
9340          * trigger a PIO interrupt as soon as any data is available.
9341 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c linux-4.14/drivers/net/ethernet/3com/3c59x.c
9342 --- linux-4.14.orig/drivers/net/ethernet/3com/3c59x.c   2017-11-12 19:46:13.000000000 +0100
9343 +++ linux-4.14/drivers/net/ethernet/3com/3c59x.c        2018-09-05 11:05:07.000000000 +0200
9344 @@ -842,9 +842,9 @@
9345  {
9346         struct vortex_private *vp = netdev_priv(dev);
9347         unsigned long flags;
9348 -       local_irq_save(flags);
9349 +       local_irq_save_nort(flags);
9350         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9351 -       local_irq_restore(flags);
9352 +       local_irq_restore_nort(flags);
9353  }
9354  #endif
9355  
9356 @@ -1908,12 +1908,12 @@
9357                          * Block interrupts because vortex_interrupt does a bare spin_lock()
9358                          */
9359                         unsigned long flags;
9360 -                       local_irq_save(flags);
9361 +                       local_irq_save_nort(flags);
9362                         if (vp->full_bus_master_tx)
9363                                 boomerang_interrupt(dev->irq, dev);
9364                         else
9365                                 vortex_interrupt(dev->irq, dev);
9366 -                       local_irq_restore(flags);
9367 +                       local_irq_restore_nort(flags);
9368                 }
9369         }
9370  
9371 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c linux-4.14/drivers/net/ethernet/marvell/mvpp2.c
9372 --- linux-4.14.orig/drivers/net/ethernet/marvell/mvpp2.c        2018-09-05 11:03:22.000000000 +0200
9373 +++ linux-4.14/drivers/net/ethernet/marvell/mvpp2.c     2018-09-05 11:05:07.000000000 +0200
9374 @@ -830,9 +830,8 @@
9375  /* Per-CPU port control */
9376  struct mvpp2_port_pcpu {
9377         struct hrtimer tx_done_timer;
9378 +       struct net_device *dev;
9379         bool timer_scheduled;
9380 -       /* Tasklet for egress finalization */
9381 -       struct tasklet_struct tx_done_tasklet;
9382  };
9383  
9384  struct mvpp2_queue_vector {
9385 @@ -5954,46 +5953,34 @@
9386         }
9387  }
9388  
9389 -static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9390 -{
9391 -       ktime_t interval;
9392 -
9393 -       if (!port_pcpu->timer_scheduled) {
9394 -               port_pcpu->timer_scheduled = true;
9395 -               interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9396 -               hrtimer_start(&port_pcpu->tx_done_timer, interval,
9397 -                             HRTIMER_MODE_REL_PINNED);
9398 -       }
9399 -}
9400 -
9401 -static void mvpp2_tx_proc_cb(unsigned long data)
9402 +static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9403  {
9404 -       struct net_device *dev = (struct net_device *)data;
9405 -       struct mvpp2_port *port = netdev_priv(dev);
9406 -       struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9407 +       struct net_device *dev;
9408 +       struct mvpp2_port *port;
9409 +       struct mvpp2_port_pcpu *port_pcpu;
9410         unsigned int tx_todo, cause;
9411  
9412 +       port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9413 +       dev = port_pcpu->dev;
9414 +
9415         if (!netif_running(dev))
9416 -               return;
9417 +               return HRTIMER_NORESTART;
9418 +
9419         port_pcpu->timer_scheduled = false;
9420 +       port = netdev_priv(dev);
9421  
9422         /* Process all the Tx queues */
9423         cause = (1 << port->ntxqs) - 1;
9424         tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9425  
9426         /* Set the timer in case not all the packets were processed */
9427 -       if (tx_todo)
9428 -               mvpp2_timer_set(port_pcpu);
9429 -}
9430 -
9431 -static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9432 -{
9433 -       struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9434 -                                                        struct mvpp2_port_pcpu,
9435 -                                                        tx_done_timer);
9436 -
9437 -       tasklet_schedule(&port_pcpu->tx_done_tasklet);
9438 +       if (tx_todo && !port_pcpu->timer_scheduled) {
9439 +               port_pcpu->timer_scheduled = true;
9440 +               hrtimer_forward_now(&port_pcpu->tx_done_timer,
9441 +                                   MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9442  
9443 +               return HRTIMER_RESTART;
9444 +       }
9445         return HRTIMER_NORESTART;
9446  }
9447  
9448 @@ -6482,7 +6469,12 @@
9449             txq_pcpu->count > 0) {
9450                 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9451  
9452 -               mvpp2_timer_set(port_pcpu);
9453 +               if (!port_pcpu->timer_scheduled) {
9454 +                       port_pcpu->timer_scheduled = true;
9455 +                       hrtimer_start(&port_pcpu->tx_done_timer,
9456 +                                     MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9457 +                                     HRTIMER_MODE_REL_PINNED_SOFT);
9458 +               }
9459         }
9460  
9461         return NETDEV_TX_OK;
9462 @@ -6871,7 +6863,6 @@
9463  
9464                         hrtimer_cancel(&port_pcpu->tx_done_timer);
9465                         port_pcpu->timer_scheduled = false;
9466 -                       tasklet_kill(&port_pcpu->tx_done_tasklet);
9467                 }
9468         }
9469         mvpp2_cleanup_rxqs(port);
9470 @@ -7644,13 +7635,10 @@
9471                         port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9472  
9473                         hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9474 -                                    HRTIMER_MODE_REL_PINNED);
9475 +                                    HRTIMER_MODE_REL_PINNED_SOFT);
9476                         port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9477                         port_pcpu->timer_scheduled = false;
9478 -
9479 -                       tasklet_init(&port_pcpu->tx_done_tasklet,
9480 -                                    mvpp2_tx_proc_cb,
9481 -                                    (unsigned long)dev);
9482 +                       port_pcpu->dev = dev;
9483                 }
9484         }
9485  
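
Note: the mvpp2 change above collapses the hrtimer + tasklet pair into a single hrtimer whose callback runs in soft (softirq) context via the *_SOFT hrtimer modes used throughout this series. A condensed sketch of that shape; struct my_pcpu, my_work_pending() and MY_PERIOD_NS are illustrative names, not part of the driver:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define MY_PERIOD_NS	(1 * NSEC_PER_MSEC)	/* illustrative period */

struct my_pcpu {
	struct hrtimer	timer;
	bool		scheduled;
};

/* Illustrative "is there still work left?" check. */
static bool my_work_pending(struct my_pcpu *p)
{
	return false;
}

/* Runs in softirq context because the timer uses a _SOFT mode. */
static enum hrtimer_restart my_timer_cb(struct hrtimer *t)
{
	struct my_pcpu *p = container_of(t, struct my_pcpu, timer);

	p->scheduled = false;
	/* ... process the deferred work here ... */

	if (my_work_pending(p)) {
		/* Re-arm from the callback instead of scheduling a tasklet. */
		p->scheduled = true;
		hrtimer_forward_now(t, MY_PERIOD_NS);
		return HRTIMER_RESTART;
	}
	return HRTIMER_NORESTART;
}

static void my_pcpu_init(struct my_pcpu *p)
{
	hrtimer_init(&p->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT);
	p->timer.function = my_timer_cb;
	p->scheduled = false;
}

static void my_arm(struct my_pcpu *p)
{
	if (!p->scheduled) {
		p->scheduled = true;
		hrtimer_start(&p->timer, MY_PERIOD_NS,
			      HRTIMER_MODE_REL_PINNED_SOFT);
	}
}
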
9486 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9487 --- linux-4.14.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-11-12 19:46:13.000000000 +0100
9488 +++ linux-4.14/drivers/net/wireless/intersil/orinoco/orinoco_usb.c      2018-09-05 11:05:07.000000000 +0200
9489 @@ -697,7 +697,7 @@
9490                         while (!ctx->done.done && msecs--)
9491                                 udelay(1000);
9492                 } else {
9493 -                       wait_event_interruptible(ctx->done.wait,
9494 +                       swait_event_interruptible(ctx->done.wait,
9495                                                  ctx->done.done);
9496                 }
9497                 break;
9498 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c linux-4.14/drivers/net/wireless/mac80211_hwsim.c
9499 --- linux-4.14.orig/drivers/net/wireless/mac80211_hwsim.c       2018-09-05 11:03:22.000000000 +0200
9500 +++ linux-4.14/drivers/net/wireless/mac80211_hwsim.c    2018-09-05 11:05:07.000000000 +0200
9501 @@ -537,7 +537,7 @@
9502         unsigned int rx_filter;
9503         bool started, idle, scanning;
9504         struct mutex mutex;
9505 -       struct tasklet_hrtimer beacon_timer;
9506 +       struct hrtimer beacon_timer;
9507         enum ps_mode {
9508                 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9509         } ps;
9510 @@ -1423,7 +1423,7 @@
9511  {
9512         struct mac80211_hwsim_data *data = hw->priv;
9513         data->started = false;
9514 -       tasklet_hrtimer_cancel(&data->beacon_timer);
9515 +       hrtimer_cancel(&data->beacon_timer);
9516         wiphy_debug(hw->wiphy, "%s\n", __func__);
9517  }
9518  
9519 @@ -1546,14 +1546,12 @@
9520  mac80211_hwsim_beacon(struct hrtimer *timer)
9521  {
9522         struct mac80211_hwsim_data *data =
9523 -               container_of(timer, struct mac80211_hwsim_data,
9524 -                            beacon_timer.timer);
9525 +               container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9526         struct ieee80211_hw *hw = data->hw;
9527         u64 bcn_int = data->beacon_int;
9528 -       ktime_t next_bcn;
9529  
9530         if (!data->started)
9531 -               goto out;
9532 +               return HRTIMER_NORESTART;
9533  
9534         ieee80211_iterate_active_interfaces_atomic(
9535                 hw, IEEE80211_IFACE_ITER_NORMAL,
9536 @@ -1565,11 +1563,9 @@
9537                 data->bcn_delta = 0;
9538         }
9539  
9540 -       next_bcn = ktime_add(hrtimer_get_expires(timer),
9541 -                            ns_to_ktime(bcn_int * 1000));
9542 -       tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9543 -out:
9544 -       return HRTIMER_NORESTART;
9545 +       hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9546 +                       ns_to_ktime(bcn_int * NSEC_PER_USEC));
9547 +       return HRTIMER_RESTART;
9548  }
9549  
9550  static const char * const hwsim_chanwidths[] = {
9551 @@ -1643,15 +1639,15 @@
9552         mutex_unlock(&data->mutex);
9553  
9554         if (!data->started || !data->beacon_int)
9555 -               tasklet_hrtimer_cancel(&data->beacon_timer);
9556 -       else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9557 +               hrtimer_cancel(&data->beacon_timer);
9558 +       else if (!hrtimer_is_queued(&data->beacon_timer)) {
9559                 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9560                 u32 bcn_int = data->beacon_int;
9561                 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9562  
9563 -               tasklet_hrtimer_start(&data->beacon_timer,
9564 -                                     ns_to_ktime(until_tbtt * 1000),
9565 -                                     HRTIMER_MODE_REL);
9566 +               hrtimer_start(&data->beacon_timer,
9567 +                             ns_to_ktime(until_tbtt * 1000),
9568 +                             HRTIMER_MODE_REL_SOFT);
9569         }
9570  
9571         return 0;
9572 @@ -1714,7 +1710,7 @@
9573                             info->enable_beacon, info->beacon_int);
9574                 vp->bcn_en = info->enable_beacon;
9575                 if (data->started &&
9576 -                   !hrtimer_is_queued(&data->beacon_timer.timer) &&
9577 +                   !hrtimer_is_queued(&data->beacon_timer) &&
9578                     info->enable_beacon) {
9579                         u64 tsf, until_tbtt;
9580                         u32 bcn_int;
9581 @@ -1722,9 +1718,9 @@
9582                         tsf = mac80211_hwsim_get_tsf(hw, vif);
9583                         bcn_int = data->beacon_int;
9584                         until_tbtt = bcn_int - do_div(tsf, bcn_int);
9585 -                       tasklet_hrtimer_start(&data->beacon_timer,
9586 -                                             ns_to_ktime(until_tbtt * 1000),
9587 -                                             HRTIMER_MODE_REL);
9588 +                       hrtimer_start(&data->beacon_timer,
9589 +                                     ns_to_ktime(until_tbtt * 1000),
9590 +                                     HRTIMER_MODE_REL_SOFT);
9591                 } else if (!info->enable_beacon) {
9592                         unsigned int count = 0;
9593                         ieee80211_iterate_active_interfaces_atomic(
9594 @@ -1733,7 +1729,7 @@
9595                         wiphy_debug(hw->wiphy, "  beaconing vifs remaining: %u",
9596                                     count);
9597                         if (count == 0) {
9598 -                               tasklet_hrtimer_cancel(&data->beacon_timer);
9599 +                               hrtimer_cancel(&data->beacon_timer);
9600                                 data->beacon_int = 0;
9601                         }
9602                 }
9603 @@ -2725,9 +2721,9 @@
9604                                     data->debugfs,
9605                                     data, &hwsim_simulate_radar);
9606  
9607 -       tasklet_hrtimer_init(&data->beacon_timer,
9608 -                            mac80211_hwsim_beacon,
9609 -                            CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9610 +       hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9611 +                    HRTIMER_MODE_ABS_SOFT);
9612 +       data->beacon_timer.function = mac80211_hwsim_beacon;
9613  
9614         spin_lock_bh(&hwsim_radio_lock);
9615         list_add_tail(&data->list, &hwsim_radios);
9616 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/pci/switch/switchtec.c linux-4.14/drivers/pci/switch/switchtec.c
9617 --- linux-4.14.orig/drivers/pci/switch/switchtec.c      2017-11-12 19:46:13.000000000 +0100
9618 +++ linux-4.14/drivers/pci/switch/switchtec.c   2018-09-05 11:05:07.000000000 +0200
9619 @@ -306,10 +306,11 @@
9620  
9621         enum mrpc_state state;
9622  
9623 -       struct completion comp;
9624 +       wait_queue_head_t cmd_comp;
9625         struct kref kref;
9626         struct list_head list;
9627  
9628 +       bool cmd_done;
9629         u32 cmd;
9630         u32 status;
9631         u32 return_code;
9632 @@ -331,7 +332,7 @@
9633         stuser->stdev = stdev;
9634         kref_init(&stuser->kref);
9635         INIT_LIST_HEAD(&stuser->list);
9636 -       init_completion(&stuser->comp);
9637 +       init_waitqueue_head(&stuser->cmd_comp);
9638         stuser->event_cnt = atomic_read(&stdev->event_cnt);
9639  
9640         dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9641 @@ -414,7 +415,7 @@
9642         kref_get(&stuser->kref);
9643         stuser->read_len = sizeof(stuser->data);
9644         stuser_set_state(stuser, MRPC_QUEUED);
9645 -       init_completion(&stuser->comp);
9646 +       stuser->cmd_done = false;
9647         list_add_tail(&stuser->list, &stdev->mrpc_queue);
9648  
9649         mrpc_cmd_submit(stdev);
9650 @@ -451,7 +452,8 @@
9651                       stuser->read_len);
9652  
9653  out:
9654 -       complete_all(&stuser->comp);
9655 +       stuser->cmd_done = true;
9656 +       wake_up_interruptible(&stuser->cmd_comp);
9657         list_del_init(&stuser->list);
9658         stuser_put(stuser);
9659         stdev->mrpc_busy = 0;
9660 @@ -721,10 +723,11 @@
9661         mutex_unlock(&stdev->mrpc_mutex);
9662  
9663         if (filp->f_flags & O_NONBLOCK) {
9664 -               if (!try_wait_for_completion(&stuser->comp))
9665 +               if (!READ_ONCE(stuser->cmd_done))
9666                         return -EAGAIN;
9667         } else {
9668 -               rc = wait_for_completion_interruptible(&stuser->comp);
9669 +               rc = wait_event_interruptible(stuser->cmd_comp,
9670 +                                             stuser->cmd_done);
9671                 if (rc < 0)
9672                         return rc;
9673         }
9674 @@ -772,7 +775,7 @@
9675         struct switchtec_dev *stdev = stuser->stdev;
9676         int ret = 0;
9677  
9678 -       poll_wait(filp, &stuser->comp.wait, wait);
9679 +       poll_wait(filp, &stuser->cmd_comp, wait);
9680         poll_wait(filp, &stdev->event_wq, wait);
9681  
9682         if (lock_mutex_and_test_alive(stdev))
9683 @@ -780,7 +783,7 @@
9684  
9685         mutex_unlock(&stdev->mrpc_mutex);
9686  
9687 -       if (try_wait_for_completion(&stuser->comp))
9688 +       if (READ_ONCE(stuser->cmd_done))
9689                 ret |= POLLIN | POLLRDNORM;
9690  
9691         if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
9692 @@ -1255,7 +1258,8 @@
9693  
9694         /* Wake up and kill any users waiting on an MRPC request */
9695         list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
9696 -               complete_all(&stuser->comp);
9697 +               stuser->cmd_done = true;
9698 +               wake_up_interruptible(&stuser->cmd_comp);
9699                 list_del_init(&stuser->list);
9700                 stuser_put(stuser);
9701         }
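
Note: the switchtec conversion above replaces a struct completion with an open-coded wait_queue_head_t plus a done flag, because the driver also feeds the wait queue to poll_wait(); with completions built on simple waitqueues (as done elsewhere in this series) the completion's internal queue is no longer suitable for poll. A generic sketch of the same pattern; struct my_req and its helpers are hypothetical:

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct my_req {
	wait_queue_head_t	wq;
	bool			done;
};

static void my_req_init(struct my_req *r)
{
	init_waitqueue_head(&r->wq);
	r->done = false;
}

/* Completion side, e.g. from the command-done interrupt. */
static void my_req_complete(struct my_req *r)
{
	r->done = true;
	wake_up_interruptible(&r->wq);
}

/* Blocking side: 0 on completion, -ERESTARTSYS if interrupted. */
static int my_req_wait(struct my_req *r)
{
	return wait_event_interruptible(r->wq, READ_ONCE(r->done));
}

/* Non-blocking / poll side. */
static unsigned int my_req_poll(struct file *filp, struct my_req *r,
				poll_table *wait)
{
	poll_wait(filp, &r->wq, wait);
	return READ_ONCE(r->done) ? (POLLIN | POLLRDNORM) : 0;
}
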
9702 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe.c linux-4.14/drivers/scsi/fcoe/fcoe.c
9703 --- linux-4.14.orig/drivers/scsi/fcoe/fcoe.c    2017-11-12 19:46:13.000000000 +0100
9704 +++ linux-4.14/drivers/scsi/fcoe/fcoe.c 2018-09-05 11:05:07.000000000 +0200
9705 @@ -1464,11 +1464,11 @@
9706  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
9707  {
9708         struct fcoe_percpu_s *fps;
9709 -       int rc;
9710 +       int rc, cpu = get_cpu_light();
9711  
9712 -       fps = &get_cpu_var(fcoe_percpu);
9713 +       fps = &per_cpu(fcoe_percpu, cpu);
9714         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
9715 -       put_cpu_var(fcoe_percpu);
9716 +       put_cpu_light();
9717  
9718         return rc;
9719  }
9720 @@ -1655,11 +1655,11 @@
9721                 return 0;
9722         }
9723  
9724 -       stats = per_cpu_ptr(lport->stats, get_cpu());
9725 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
9726         stats->InvalidCRCCount++;
9727         if (stats->InvalidCRCCount < 5)
9728                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
9729 -       put_cpu();
9730 +       put_cpu_light();
9731         return -EINVAL;
9732  }
9733  
9734 @@ -1702,7 +1702,7 @@
9735          */
9736         hp = (struct fcoe_hdr *) skb_network_header(skb);
9737  
9738 -       stats = per_cpu_ptr(lport->stats, get_cpu());
9739 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
9740         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
9741                 if (stats->ErrorFrames < 5)
9742                         printk(KERN_WARNING "fcoe: FCoE version "
9743 @@ -1734,13 +1734,13 @@
9744                 goto drop;
9745  
9746         if (!fcoe_filter_frames(lport, fp)) {
9747 -               put_cpu();
9748 +               put_cpu_light();
9749                 fc_exch_recv(lport, fp);
9750                 return;
9751         }
9752  drop:
9753         stats->ErrorFrames++;
9754 -       put_cpu();
9755 +       put_cpu_light();
9756         kfree_skb(skb);
9757  }
9758  
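
Note: get_cpu_light()/put_cpu_light(), added elsewhere in this patch, are used above where the code only needs to stay on one CPU while touching per-CPU data; roughly, on PREEMPT_RT_FULL they disable migration rather than preemption, and on non-RT kernels they behave like get_cpu()/put_cpu(). A usage sketch mirroring the statistics pattern above; struct my_stats and my_count_error() are hypothetical:

#include <linux/percpu.h>
#include <linux/smp.h>

struct my_stats {
	unsigned long	errors;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_count_error(void)
{
	struct my_stats *s;

	/*
	 * Stay on the current CPU while updating the per-CPU counter; on
	 * PREEMPT_RT_FULL preemption stays enabled, which is acceptable
	 * here because an occasionally imprecise counter is tolerated,
	 * just as in the fcoe statistics above.
	 */
	s = &per_cpu(my_stats, get_cpu_light());
	s->errors++;
	put_cpu_light();
}
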
9759 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c
9760 --- linux-4.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c       2017-11-12 19:46:13.000000000 +0100
9761 +++ linux-4.14/drivers/scsi/fcoe/fcoe_ctlr.c    2018-09-05 11:05:07.000000000 +0200
9762 @@ -835,7 +835,7 @@
9763  
9764         INIT_LIST_HEAD(&del_list);
9765  
9766 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
9767 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
9768  
9769         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
9770                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
9771 @@ -871,7 +871,7 @@
9772                                 sel_time = fcf->time;
9773                 }
9774         }
9775 -       put_cpu();
9776 +       put_cpu_light();
9777  
9778         list_for_each_entry_safe(fcf, next, &del_list, list) {
9779                 /* Removes fcf from current list */
9780 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libfc/fc_exch.c linux-4.14/drivers/scsi/libfc/fc_exch.c
9781 --- linux-4.14.orig/drivers/scsi/libfc/fc_exch.c        2017-11-12 19:46:13.000000000 +0100
9782 +++ linux-4.14/drivers/scsi/libfc/fc_exch.c     2018-09-05 11:05:07.000000000 +0200
9783 @@ -833,10 +833,10 @@
9784         }
9785         memset(ep, 0, sizeof(*ep));
9786  
9787 -       cpu = get_cpu();
9788 +       cpu = get_cpu_light();
9789         pool = per_cpu_ptr(mp->pool, cpu);
9790         spin_lock_bh(&pool->lock);
9791 -       put_cpu();
9792 +       put_cpu_light();
9793  
9794         /* peek cache of free slot */
9795         if (pool->left != FC_XID_UNKNOWN) {
9796 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/libsas/sas_ata.c linux-4.14/drivers/scsi/libsas/sas_ata.c
9797 --- linux-4.14.orig/drivers/scsi/libsas/sas_ata.c       2017-11-12 19:46:13.000000000 +0100
9798 +++ linux-4.14/drivers/scsi/libsas/sas_ata.c    2018-09-05 11:05:07.000000000 +0200
9799 @@ -190,7 +190,7 @@
9800         /* TODO: audit callers to ensure they are ready for qc_issue to
9801          * unconditionally re-enable interrupts
9802          */
9803 -       local_irq_save(flags);
9804 +       local_irq_save_nort(flags);
9805         spin_unlock(ap->lock);
9806  
9807         /* If the device fell off, no sense in issuing commands */
9808 @@ -252,7 +252,7 @@
9809  
9810   out:
9811         spin_lock(ap->lock);
9812 -       local_irq_restore(flags);
9813 +       local_irq_restore_nort(flags);
9814         return ret;
9815  }
9816  
9817 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h linux-4.14/drivers/scsi/qla2xxx/qla_inline.h
9818 --- linux-4.14.orig/drivers/scsi/qla2xxx/qla_inline.h   2018-09-05 11:03:22.000000000 +0200
9819 +++ linux-4.14/drivers/scsi/qla2xxx/qla_inline.h        2018-09-05 11:05:07.000000000 +0200
9820 @@ -59,12 +59,12 @@
9821  {
9822         unsigned long flags;
9823         struct qla_hw_data *ha = rsp->hw;
9824 -       local_irq_save(flags);
9825 +       local_irq_save_nort(flags);
9826         if (IS_P3P_TYPE(ha))
9827                 qla82xx_poll(0, rsp);
9828         else
9829                 ha->isp_ops->intr_handler(0, rsp);
9830 -       local_irq_restore(flags);
9831 +       local_irq_restore_nort(flags);
9832  }
9833  
9834  static inline uint8_t *
9835 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/staging/greybus/audio_manager.c linux-4.14/drivers/staging/greybus/audio_manager.c
9836 --- linux-4.14.orig/drivers/staging/greybus/audio_manager.c     2017-11-12 19:46:13.000000000 +0100
9837 +++ linux-4.14/drivers/staging/greybus/audio_manager.c  2018-09-05 11:05:07.000000000 +0200
9838 @@ -10,7 +10,7 @@
9839  #include <linux/sysfs.h>
9840  #include <linux/module.h>
9841  #include <linux/init.h>
9842 -#include <linux/rwlock.h>
9843 +#include <linux/spinlock.h>
9844  #include <linux/idr.h>
9845  
9846  #include "audio_manager.h"
9847 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_tmr.c linux-4.14/drivers/target/target_core_tmr.c
9848 --- linux-4.14.orig/drivers/target/target_core_tmr.c    2018-09-05 11:03:22.000000000 +0200
9849 +++ linux-4.14/drivers/target/target_core_tmr.c 2018-09-05 11:05:07.000000000 +0200
9850 @@ -114,8 +114,6 @@
9851  {
9852         struct se_session *sess = se_cmd->se_sess;
9853  
9854 -       assert_spin_locked(&sess->sess_cmd_lock);
9855 -       WARN_ON_ONCE(!irqs_disabled());
9856         /*
9857          * If command already reached CMD_T_COMPLETE state within
9858          * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
9859 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/target/target_core_transport.c linux-4.14/drivers/target/target_core_transport.c
9860 --- linux-4.14.orig/drivers/target/target_core_transport.c      2018-09-05 11:03:22.000000000 +0200
9861 +++ linux-4.14/drivers/target/target_core_transport.c   2018-09-05 11:05:07.000000000 +0200
9862 @@ -2966,9 +2966,6 @@
9863         __acquires(&cmd->t_state_lock)
9864  {
9865  
9866 -       assert_spin_locked(&cmd->t_state_lock);
9867 -       WARN_ON_ONCE(!irqs_disabled());
9868 -
9869         if (fabric_stop)
9870                 cmd->transport_state |= CMD_T_FABRIC_STOP;
9871  
9872 @@ -3238,9 +3235,6 @@
9873  {
9874         int ret;
9875  
9876 -       assert_spin_locked(&cmd->t_state_lock);
9877 -       WARN_ON_ONCE(!irqs_disabled());
9878 -
9879         if (!(cmd->transport_state & CMD_T_ABORTED))
9880                 return 0;
9881         /*
9882 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c
9883 --- linux-4.14.orig/drivers/thermal/x86_pkg_temp_thermal.c      2017-11-12 19:46:13.000000000 +0100
9884 +++ linux-4.14/drivers/thermal/x86_pkg_temp_thermal.c   2018-09-05 11:05:07.000000000 +0200
9885 @@ -29,6 +29,7 @@
9886  #include <linux/pm.h>
9887  #include <linux/thermal.h>
9888  #include <linux/debugfs.h>
9889 +#include <linux/swork.h>
9890  #include <asm/cpu_device_id.h>
9891  #include <asm/mce.h>
9892  
9893 @@ -329,7 +330,7 @@
9894         schedule_delayed_work_on(cpu, work, ms);
9895  }
9896  
9897 -static int pkg_thermal_notify(u64 msr_val)
9898 +static void pkg_thermal_notify_work(struct swork_event *event)
9899  {
9900         int cpu = smp_processor_id();
9901         struct pkg_device *pkgdev;
9902 @@ -348,8 +349,46 @@
9903         }
9904  
9905         spin_unlock_irqrestore(&pkg_temp_lock, flags);
9906 +}
9907 +
9908 +#ifdef CONFIG_PREEMPT_RT_FULL
9909 +static struct swork_event notify_work;
9910 +
9911 +static int pkg_thermal_notify_work_init(void)
9912 +{
9913 +       int err;
9914 +
9915 +       err = swork_get();
9916 +       if (err)
9917 +               return err;
9918 +
9919 +       INIT_SWORK(&notify_work, pkg_thermal_notify_work);
9920 +       return 0;
9921 +}
9922 +
9923 +static void pkg_thermal_notify_work_cleanup(void)
9924 +{
9925 +       swork_put();
9926 +}
9927 +
9928 +static int pkg_thermal_notify(u64 msr_val)
9929 +{
9930 +       swork_queue(&notify_work);
9931 +       return 0;
9932 +}
9933 +
9934 +#else  /* !CONFIG_PREEMPT_RT_FULL */
9935 +
9936 +static int pkg_thermal_notify_work_init(void) { return 0; }
9937 +
9938 +static void pkg_thermal_notify_work_cleanup(void) {  }
9939 +
9940 +static int pkg_thermal_notify(u64 msr_val)
9941 +{
9942 +       pkg_thermal_notify_work(NULL);
9943         return 0;
9944  }
9945 +#endif /* CONFIG_PREEMPT_RT_FULL */
9946  
9947  static int pkg_temp_thermal_device_add(unsigned int cpu)
9948  {
9949 @@ -515,10 +554,15 @@
9950         if (!x86_match_cpu(pkg_temp_thermal_ids))
9951                 return -ENODEV;
9952  
9953 +       if (!pkg_thermal_notify_work_init())
9954 +               return -ENODEV;
9955 +
9956         max_packages = topology_max_packages();
9957         packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
9958 -       if (!packages)
9959 -               return -ENOMEM;
9960 +       if (!packages) {
9961 +               ret = -ENOMEM;
9962 +               goto err;
9963 +       }
9964  
9965         ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
9966                                 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
9967 @@ -536,6 +580,7 @@
9968         return 0;
9969  
9970  err:
9971 +       pkg_thermal_notify_work_cleanup();
9972         kfree(packages);
9973         return ret;
9974  }
9975 @@ -549,6 +594,7 @@
9976         cpuhp_remove_state(pkg_thermal_hp_state);
9977         debugfs_remove_recursive(debugfs);
9978         kfree(packages);
9979 +       pkg_thermal_notify_work_cleanup();
9980  }
9981  module_exit(pkg_temp_thermal_exit)
9982  
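
Note: on PREEMPT_RT_FULL the MSR-notify callback above cannot do its processing directly from the thermal interrupt, so it is bounced to the swork ("simple work") infrastructure that this patch series introduces. A minimal sketch built only from the calls visible in the hunk above; my_event, my_handler() and the setup/teardown functions are hypothetical:

#include <linux/swork.h>	/* provided by this patch series */

static struct swork_event my_event;

static void my_handler(struct swork_event *ev)
{
	/* deferred work, runs outside the atomic notification context */
}

static int my_setup(void)
{
	int err;

	err = swork_get();	/* make sure the swork machinery is up */
	if (err)
		return err;

	INIT_SWORK(&my_event, my_handler);
	return 0;
}

/* Called from an atomic (e.g. interrupt) context. */
static void my_notify(void)
{
	swork_queue(&my_event);
}

static void my_teardown(void)
{
	swork_put();
}
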
9983 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_core.c linux-4.14/drivers/tty/serial/8250/8250_core.c
9984 --- linux-4.14.orig/drivers/tty/serial/8250/8250_core.c 2017-11-12 19:46:13.000000000 +0100
9985 +++ linux-4.14/drivers/tty/serial/8250/8250_core.c      2018-09-05 11:05:07.000000000 +0200
9986 @@ -58,7 +58,16 @@
9987  
9988  static unsigned int skip_txen_test; /* force skip of txen test at init time */
9989  
9990 -#define PASS_LIMIT     512
9991 +/*
9992 + * On -rt we can have more delays, and legitimately
9993 + * so - so don't drop work spuriously and spam the
9994 + * syslog:
9995 + */
9996 +#ifdef CONFIG_PREEMPT_RT_FULL
9997 +# define PASS_LIMIT    1000000
9998 +#else
9999 +# define PASS_LIMIT    512
10000 +#endif
10001  
10002  #include <asm/serial.h>
10003  /*
10004 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/8250/8250_port.c linux-4.14/drivers/tty/serial/8250/8250_port.c
10005 --- linux-4.14.orig/drivers/tty/serial/8250/8250_port.c 2018-09-05 11:03:22.000000000 +0200
10006 +++ linux-4.14/drivers/tty/serial/8250/8250_port.c      2018-09-05 11:05:07.000000000 +0200
10007 @@ -35,6 +35,7 @@
10008  #include <linux/nmi.h>
10009  #include <linux/mutex.h>
10010  #include <linux/slab.h>
10011 +#include <linux/kdb.h>
10012  #include <linux/uaccess.h>
10013  #include <linux/pm_runtime.h>
10014  #include <linux/ktime.h>
10015 @@ -3224,9 +3225,9 @@
10016  
10017         serial8250_rpm_get(up);
10018  
10019 -       if (port->sysrq)
10020 +       if (port->sysrq || oops_in_progress)
10021                 locked = 0;
10022 -       else if (oops_in_progress)
10023 +       else if (in_kdb_printk())
10024                 locked = spin_trylock_irqsave(&port->lock, flags);
10025         else
10026                 spin_lock_irqsave(&port->lock, flags);
10027 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/amba-pl011.c linux-4.14/drivers/tty/serial/amba-pl011.c
10028 --- linux-4.14.orig/drivers/tty/serial/amba-pl011.c     2018-09-05 11:03:22.000000000 +0200
10029 +++ linux-4.14/drivers/tty/serial/amba-pl011.c  2018-09-05 11:05:07.000000000 +0200
10030 @@ -2236,13 +2236,19 @@
10031  
10032         clk_enable(uap->clk);
10033  
10034 -       local_irq_save(flags);
10035 +       /*
10036 +        * local_irq_save(flags);
10037 +        *
10038 +        * This local_irq_save() is nonsense. If we come in via sysrq
10039 +        * handling then interrupts are already disabled. Aside of
10040 +        * that the port.sysrq check is racy on SMP regardless.
10041 +       */
10042         if (uap->port.sysrq)
10043                 locked = 0;
10044         else if (oops_in_progress)
10045 -               locked = spin_trylock(&uap->port.lock);
10046 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
10047         else
10048 -               spin_lock(&uap->port.lock);
10049 +               spin_lock_irqsave(&uap->port.lock, flags);
10050  
10051         /*
10052          *      First save the CR then disable the interrupts
10053 @@ -2268,8 +2274,7 @@
10054                 pl011_write(old_cr, uap, REG_CR);
10055  
10056         if (locked)
10057 -               spin_unlock(&uap->port.lock);
10058 -       local_irq_restore(flags);
10059 +               spin_unlock_irqrestore(&uap->port.lock, flags);
10060  
10061         clk_disable(uap->clk);
10062  }
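
Note: the pl011 and 8250 console fixes above (and the omap-serial one below) share one shape: take the port lock with spin_lock_irqsave() instead of an outer local_irq_save(), and fall back to a trylock while an oops is in progress so a crashing context can still print. A condensed sketch; my_console_write() is hypothetical, while uart_port and the locking calls are the stock kernel API:

#include <linux/serial_core.h>
#include <linux/spinlock.h>

static void my_console_write(struct uart_port *port, const char *s,
			     unsigned int count)
{
	unsigned long flags;
	int locked = 1;

	if (port->sysrq)
		locked = 0;	/* re-entered from the ISR, lock already held */
	else if (oops_in_progress)
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	/* ... emit the characters via the hardware FIFO ... */
	(void)s;
	(void)count;

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);
}
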
10063 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/tty/serial/omap-serial.c linux-4.14/drivers/tty/serial/omap-serial.c
10064 --- linux-4.14.orig/drivers/tty/serial/omap-serial.c    2018-09-05 11:03:22.000000000 +0200
10065 +++ linux-4.14/drivers/tty/serial/omap-serial.c 2018-09-05 11:05:07.000000000 +0200
10066 @@ -1311,13 +1311,10 @@
10067  
10068         pm_runtime_get_sync(up->dev);
10069  
10070 -       local_irq_save(flags);
10071 -       if (up->port.sysrq)
10072 -               locked = 0;
10073 -       else if (oops_in_progress)
10074 -               locked = spin_trylock(&up->port.lock);
10075 +       if (up->port.sysrq || oops_in_progress)
10076 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
10077         else
10078 -               spin_lock(&up->port.lock);
10079 +               spin_lock_irqsave(&up->port.lock, flags);
10080  
10081         /*
10082          * First save the IER then disable the interrupts
10083 @@ -1346,8 +1343,7 @@
10084         pm_runtime_mark_last_busy(up->dev);
10085         pm_runtime_put_autosuspend(up->dev);
10086         if (locked)
10087 -               spin_unlock(&up->port.lock);
10088 -       local_irq_restore(flags);
10089 +               spin_unlock_irqrestore(&up->port.lock, flags);
10090  }
10091  
10092  static int __init
10093 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/core/hcd.c linux-4.14/drivers/usb/core/hcd.c
10094 --- linux-4.14.orig/drivers/usb/core/hcd.c      2018-09-05 11:03:22.000000000 +0200
10095 +++ linux-4.14/drivers/usb/core/hcd.c   2018-09-05 11:05:07.000000000 +0200
10096 @@ -1775,9 +1775,9 @@
10097          * and no one may trigger the above deadlock situation when
10098          * running complete() in tasklet.
10099          */
10100 -       local_irq_save(flags);
10101 +       local_irq_save_nort(flags);
10102         urb->complete(urb);
10103 -       local_irq_restore(flags);
10104 +       local_irq_restore_nort(flags);
10105  
10106         usb_anchor_resume_wakeups(anchor);
10107         atomic_dec(&urb->use_count);
10108 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_fs.c linux-4.14/drivers/usb/gadget/function/f_fs.c
10109 --- linux-4.14.orig/drivers/usb/gadget/function/f_fs.c  2018-09-05 11:03:22.000000000 +0200
10110 +++ linux-4.14/drivers/usb/gadget/function/f_fs.c       2018-09-05 11:05:07.000000000 +0200
10111 @@ -1623,7 +1623,7 @@
10112                 pr_info("%s(): freeing\n", __func__);
10113                 ffs_data_clear(ffs);
10114                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10115 -                      waitqueue_active(&ffs->ep0req_completion.wait) ||
10116 +                      swait_active(&ffs->ep0req_completion.wait) ||
10117                        waitqueue_active(&ffs->wait));
10118                 destroy_workqueue(ffs->io_completion_wq);
10119                 kfree(ffs->dev_name);
10120 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c linux-4.14/drivers/usb/gadget/function/f_ncm.c
10121 --- linux-4.14.orig/drivers/usb/gadget/function/f_ncm.c 2017-11-12 19:46:13.000000000 +0100
10122 +++ linux-4.14/drivers/usb/gadget/function/f_ncm.c      2018-09-05 11:05:07.000000000 +0200
10123 @@ -77,9 +77,7 @@
10124         struct sk_buff                  *skb_tx_ndp;
10125         u16                             ndp_dgram_count;
10126         bool                            timer_force_tx;
10127 -       struct tasklet_struct           tx_tasklet;
10128         struct hrtimer                  task_timer;
10129 -
10130         bool                            timer_stopping;
10131  };
10132  
10133 @@ -1108,7 +1106,7 @@
10134  
10135                 /* Delay the timer. */
10136                 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10137 -                             HRTIMER_MODE_REL);
10138 +                             HRTIMER_MODE_REL_SOFT);
10139  
10140                 /* Add the datagram position entries */
10141                 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10142 @@ -1152,17 +1150,15 @@
10143  }
10144  
10145  /*
10146 - * This transmits the NTB if there are frames waiting.
10147 + * The transmit should only be run if no skb data has been sent
10148 + * for a certain duration.
10149   */
10150 -static void ncm_tx_tasklet(unsigned long data)
10151 +static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10152  {
10153 -       struct f_ncm    *ncm = (void *)data;
10154 -
10155 -       if (ncm->timer_stopping)
10156 -               return;
10157 +       struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10158  
10159         /* Only send if data is available. */
10160 -       if (ncm->skb_tx_data) {
10161 +       if (!ncm->timer_stopping && ncm->skb_tx_data) {
10162                 ncm->timer_force_tx = true;
10163  
10164                 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10165 @@ -1175,16 +1171,6 @@
10166  
10167                 ncm->timer_force_tx = false;
10168         }
10169 -}
10170 -
10171 -/*
10172 - * The transmit should only be run if no skb data has been sent
10173 - * for a certain duration.
10174 - */
10175 -static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10176 -{
10177 -       struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10178 -       tasklet_schedule(&ncm->tx_tasklet);
10179         return HRTIMER_NORESTART;
10180  }
10181  
10182 @@ -1517,8 +1503,7 @@
10183         ncm->port.open = ncm_open;
10184         ncm->port.close = ncm_close;
10185  
10186 -       tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10187 -       hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10188 +       hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10189         ncm->task_timer.function = ncm_tx_timeout;
10190  
10191         DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10192 @@ -1627,7 +1612,6 @@
10193         DBG(c->cdev, "ncm unbind\n");
10194  
10195         hrtimer_cancel(&ncm->task_timer);
10196 -       tasklet_kill(&ncm->tx_tasklet);
10197  
10198         ncm_string_defs[0].id = 0;
10199         usb_free_all_descriptors(f);
10200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/drivers/usb/gadget/legacy/inode.c linux-4.14/drivers/usb/gadget/legacy/inode.c
10201 --- linux-4.14.orig/drivers/usb/gadget/legacy/inode.c   2017-11-12 19:46:13.000000000 +0100
10202 +++ linux-4.14/drivers/usb/gadget/legacy/inode.c        2018-09-05 11:05:07.000000000 +0200
10203 @@ -347,7 +347,7 @@
10204         spin_unlock_irq (&epdata->dev->lock);
10205  
10206         if (likely (value == 0)) {
10207 -               value = wait_event_interruptible (done.wait, done.done);
10208 +               value = swait_event_interruptible (done.wait, done.done);
10209                 if (value != 0) {
10210                         spin_lock_irq (&epdata->dev->lock);
10211                         if (likely (epdata->ep != NULL)) {
10212 @@ -356,7 +356,7 @@
10213                                 usb_ep_dequeue (epdata->ep, epdata->req);
10214                                 spin_unlock_irq (&epdata->dev->lock);
10215  
10216 -                               wait_event (done.wait, done.done);
10217 +                               swait_event (done.wait, done.done);
10218                                 if (epdata->status == -ECONNRESET)
10219                                         epdata->status = -EINTR;
10220                         } else {
10221 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/aio.c linux-4.14/fs/aio.c
10222 --- linux-4.14.orig/fs/aio.c    2018-09-05 11:03:22.000000000 +0200
10223 +++ linux-4.14/fs/aio.c 2018-09-05 11:05:07.000000000 +0200
10224 @@ -40,6 +40,7 @@
10225  #include <linux/ramfs.h>
10226  #include <linux/percpu-refcount.h>
10227  #include <linux/mount.h>
10228 +#include <linux/swork.h>
10229  
10230  #include <asm/kmap_types.h>
10231  #include <linux/uaccess.h>
10232 @@ -117,6 +118,7 @@
10233  
10234         struct rcu_head         free_rcu;
10235         struct work_struct      free_work;      /* see free_ioctx() */
10236 +       struct swork_event      free_swork;     /* see free_ioctx() */
10237  
10238         /*
10239          * signals when all in-flight requests are done
10240 @@ -259,6 +261,7 @@
10241                 .mount          = aio_mount,
10242                 .kill_sb        = kill_anon_super,
10243         };
10244 +       BUG_ON(swork_get());
10245         aio_mnt = kern_mount(&aio_fs);
10246         if (IS_ERR(aio_mnt))
10247                 panic("Failed to create aio fs mount.");
10248 @@ -633,9 +636,9 @@
10249   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10250   * now it's safe to cancel any that need to be.
10251   */
10252 -static void free_ioctx_users(struct percpu_ref *ref)
10253 +static void free_ioctx_users_work(struct swork_event *sev)
10254  {
10255 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
10256 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10257         struct aio_kiocb *req;
10258  
10259         spin_lock_irq(&ctx->ctx_lock);
10260 @@ -653,6 +656,14 @@
10261         percpu_ref_put(&ctx->reqs);
10262  }
10263  
10264 +static void free_ioctx_users(struct percpu_ref *ref)
10265 +{
10266 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
10267 +
10268 +       INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10269 +       swork_queue(&ctx->free_swork);
10270 +}
10271 +
10272  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10273  {
10274         unsigned i, new_nr;
10275 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/autofs_i.h linux-4.14/fs/autofs4/autofs_i.h
10276 --- linux-4.14.orig/fs/autofs4/autofs_i.h       2017-11-12 19:46:13.000000000 +0100
10277 +++ linux-4.14/fs/autofs4/autofs_i.h    2018-09-05 11:05:07.000000000 +0200
10278 @@ -20,6 +20,7 @@
10279  #include <linux/sched.h>
10280  #include <linux/mount.h>
10281  #include <linux/namei.h>
10282 +#include <linux/delay.h>
10283  #include <linux/uaccess.h>
10284  #include <linux/mutex.h>
10285  #include <linux/spinlock.h>
10286 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/autofs4/expire.c linux-4.14/fs/autofs4/expire.c
10287 --- linux-4.14.orig/fs/autofs4/expire.c 2017-11-12 19:46:13.000000000 +0100
10288 +++ linux-4.14/fs/autofs4/expire.c      2018-09-05 11:05:07.000000000 +0200
10289 @@ -148,7 +148,7 @@
10290                         parent = p->d_parent;
10291                         if (!spin_trylock(&parent->d_lock)) {
10292                                 spin_unlock(&p->d_lock);
10293 -                               cpu_relax();
10294 +                               cpu_chill();
10295                                 goto relock;
10296                         }
10297                         spin_unlock(&p->d_lock);
10298 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/buffer.c linux-4.14/fs/buffer.c
10299 --- linux-4.14.orig/fs/buffer.c 2018-09-05 11:03:22.000000000 +0200
10300 +++ linux-4.14/fs/buffer.c      2018-09-05 11:05:07.000000000 +0200
10301 @@ -302,8 +302,7 @@
10302          * decide that the page is now completely done.
10303          */
10304         first = page_buffers(page);
10305 -       local_irq_save(flags);
10306 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10307 +       flags = bh_uptodate_lock_irqsave(first);
10308         clear_buffer_async_read(bh);
10309         unlock_buffer(bh);
10310         tmp = bh;
10311 @@ -316,8 +315,7 @@
10312                 }
10313                 tmp = tmp->b_this_page;
10314         } while (tmp != bh);
10315 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10316 -       local_irq_restore(flags);
10317 +       bh_uptodate_unlock_irqrestore(first, flags);
10318  
10319         /*
10320          * If none of the buffers had errors and they are all
10321 @@ -329,9 +327,7 @@
10322         return;
10323  
10324  still_busy:
10325 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10326 -       local_irq_restore(flags);
10327 -       return;
10328 +       bh_uptodate_unlock_irqrestore(first, flags);
10329  }
10330  
10331  /*
10332 @@ -358,8 +354,7 @@
10333         }
10334  
10335         first = page_buffers(page);
10336 -       local_irq_save(flags);
10337 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10338 +       flags = bh_uptodate_lock_irqsave(first);
10339  
10340         clear_buffer_async_write(bh);
10341         unlock_buffer(bh);
10342 @@ -371,15 +366,12 @@
10343                 }
10344                 tmp = tmp->b_this_page;
10345         }
10346 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10347 -       local_irq_restore(flags);
10348 +       bh_uptodate_unlock_irqrestore(first, flags);
10349         end_page_writeback(page);
10350         return;
10351  
10352  still_busy:
10353 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10354 -       local_irq_restore(flags);
10355 -       return;
10356 +       bh_uptodate_unlock_irqrestore(first, flags);
10357  }
10358  EXPORT_SYMBOL(end_buffer_async_write);
10359  
10360 @@ -3417,6 +3409,7 @@
10361         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10362         if (ret) {
10363                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10364 +               buffer_head_init_locks(ret);
10365                 preempt_disable();
10366                 __this_cpu_inc(bh_accounting.nr);
10367                 recalc_bh_state();
10368 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/cifs/readdir.c linux-4.14/fs/cifs/readdir.c
10369 --- linux-4.14.orig/fs/cifs/readdir.c   2017-11-12 19:46:13.000000000 +0100
10370 +++ linux-4.14/fs/cifs/readdir.c        2018-09-05 11:05:07.000000000 +0200
10371 @@ -80,7 +80,7 @@
10372         struct inode *inode;
10373         struct super_block *sb = parent->d_sb;
10374         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10375 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10376 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10377  
10378         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10379  
10380 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/dcache.c linux-4.14/fs/dcache.c
10381 --- linux-4.14.orig/fs/dcache.c 2018-09-05 11:03:29.000000000 +0200
10382 +++ linux-4.14/fs/dcache.c      2018-09-05 11:05:07.000000000 +0200
10383 @@ -19,6 +19,7 @@
10384  #include <linux/mm.h>
10385  #include <linux/fs.h>
10386  #include <linux/fsnotify.h>
10387 +#include <linux/delay.h>
10388  #include <linux/slab.h>
10389  #include <linux/init.h>
10390  #include <linux/hash.h>
10391 @@ -793,6 +794,8 @@
10392   */
10393  void dput(struct dentry *dentry)
10394  {
10395 +       struct dentry *parent;
10396 +
10397         if (unlikely(!dentry))
10398                 return;
10399  
10400 @@ -829,9 +832,18 @@
10401         return;
10402  
10403  kill_it:
10404 -       dentry = dentry_kill(dentry);
10405 -       if (dentry) {
10406 -               cond_resched();
10407 +       parent = dentry_kill(dentry);
10408 +       if (parent) {
10409 +               int r;
10410 +
10411 +               if (parent == dentry) {
10412 +                       /* the task with the highest priority won't schedule */
10413 +                       r = cond_resched();
10414 +                       if (!r)
10415 +                               cpu_chill();
10416 +               } else {
10417 +                       dentry = parent;
10418 +               }
10419                 goto repeat;
10420         }
10421  }
10422 @@ -2394,7 +2406,7 @@
10423         if (dentry->d_lockref.count == 1) {
10424                 if (!spin_trylock(&inode->i_lock)) {
10425                         spin_unlock(&dentry->d_lock);
10426 -                       cpu_relax();
10427 +                       cpu_chill();
10428                         goto again;
10429                 }
10430                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10431 @@ -2439,9 +2451,10 @@
10432  static inline unsigned start_dir_add(struct inode *dir)
10433  {
10434  
10435 +       preempt_disable_rt();
10436         for (;;) {
10437 -               unsigned n = dir->i_dir_seq;
10438 -               if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10439 +               unsigned n = dir->__i_dir_seq;
10440 +               if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10441                         return n;
10442                 cpu_relax();
10443         }
10444 @@ -2449,26 +2462,30 @@
10445  
10446  static inline void end_dir_add(struct inode *dir, unsigned n)
10447  {
10448 -       smp_store_release(&dir->i_dir_seq, n + 2);
10449 +       smp_store_release(&dir->__i_dir_seq, n + 2);
10450 +       preempt_enable_rt();
10451  }
10452  
10453  static void d_wait_lookup(struct dentry *dentry)
10454  {
10455 -       if (d_in_lookup(dentry)) {
10456 -               DECLARE_WAITQUEUE(wait, current);
10457 -               add_wait_queue(dentry->d_wait, &wait);
10458 -               do {
10459 -                       set_current_state(TASK_UNINTERRUPTIBLE);
10460 -                       spin_unlock(&dentry->d_lock);
10461 -                       schedule();
10462 -                       spin_lock(&dentry->d_lock);
10463 -               } while (d_in_lookup(dentry));
10464 -       }
10465 +       struct swait_queue __wait;
10466 +
10467 +       if (!d_in_lookup(dentry))
10468 +               return;
10469 +
10470 +       INIT_LIST_HEAD(&__wait.task_list);
10471 +       do {
10472 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10473 +               spin_unlock(&dentry->d_lock);
10474 +               schedule();
10475 +               spin_lock(&dentry->d_lock);
10476 +       } while (d_in_lookup(dentry));
10477 +       finish_swait(dentry->d_wait, &__wait);
10478  }
10479  
10480  struct dentry *d_alloc_parallel(struct dentry *parent,
10481                                 const struct qstr *name,
10482 -                               wait_queue_head_t *wq)
10483 +                               struct swait_queue_head *wq)
10484  {
10485         unsigned int hash = name->hash;
10486         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10487 @@ -2482,7 +2499,7 @@
10488  
10489  retry:
10490         rcu_read_lock();
10491 -       seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10492 +       seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10493         r_seq = read_seqbegin(&rename_lock);
10494         dentry = __d_lookup_rcu(parent, name, &d_seq);
10495         if (unlikely(dentry)) {
10496 @@ -2510,7 +2527,7 @@
10497         }
10498  
10499         hlist_bl_lock(b);
10500 -       if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10501 +       if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10502                 hlist_bl_unlock(b);
10503                 rcu_read_unlock();
10504                 goto retry;
10505 @@ -2583,7 +2600,7 @@
10506         hlist_bl_lock(b);
10507         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10508         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10509 -       wake_up_all(dentry->d_wait);
10510 +       swake_up_all(dentry->d_wait);
10511         dentry->d_wait = NULL;
10512         hlist_bl_unlock(b);
10513         INIT_HLIST_NODE(&dentry->d_u.d_alias);
10514 @@ -3619,6 +3636,8 @@
10515  
10516  static void __init dcache_init_early(void)
10517  {
10518 +       unsigned int loop;
10519 +
10520         /* If hashes are distributed across NUMA nodes, defer
10521          * hash allocation until vmalloc space is available.
10522          */
10523 @@ -3635,10 +3654,14 @@
10524                                         &d_hash_mask,
10525                                         0,
10526                                         0);
10527 +
10528 +       for (loop = 0; loop < (1U << d_hash_shift); loop++)
10529 +               INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10530  }
10531  
10532  static void __init dcache_init(void)
10533  {
10534 +       unsigned int loop;
10535         /*
10536          * A constructor could be added for stable state like the lists,
10537          * but it is probably not worth it because of the cache nature
10538 @@ -3661,6 +3684,10 @@
10539                                         &d_hash_mask,
10540                                         0,
10541                                         0);
10542 +
10543 +       for (loop = 0; loop < (1U << d_hash_shift); loop++)
10544 +               INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10545 +
10546  }
10547  
10548  /* SLAB cache for __getname() consumers */
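
Note: the cpu_relax() to cpu_chill() substitutions above (autofs, dcache) target trylock retry loops: on PREEMPT_RT_FULL a busy spin can live-lock when the lock holder is a preempted lower-priority task, so cpu_chill() briefly puts the caller to sleep instead, while on non-RT it is expected to remain a plain cpu_relax(). A sketch of the retry shape; my_move() and the two locks are hypothetical:

#include <linux/delay.h>	/* cpu_chill() is declared there by this series */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(src_lock);
static DEFINE_SPINLOCK(dst_lock);

/* Take a second lock opportunistically, backing off instead of spinning. */
static void my_move(void)
{
retry:
	spin_lock(&src_lock);
	if (!spin_trylock(&dst_lock)) {
		spin_unlock(&src_lock);
		cpu_chill();	/* let the current holder make progress */
		goto retry;
	}

	/* ... move the object from src to dst ... */

	spin_unlock(&dst_lock);
	spin_unlock(&src_lock);
}
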
10549 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/eventpoll.c linux-4.14/fs/eventpoll.c
10550 --- linux-4.14.orig/fs/eventpoll.c      2017-11-12 19:46:13.000000000 +0100
10551 +++ linux-4.14/fs/eventpoll.c   2018-09-05 11:05:07.000000000 +0200
10552 @@ -587,12 +587,12 @@
10553   */
10554  static void ep_poll_safewake(wait_queue_head_t *wq)
10555  {
10556 -       int this_cpu = get_cpu();
10557 +       int this_cpu = get_cpu_light();
10558  
10559         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10560                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10561  
10562 -       put_cpu();
10563 +       put_cpu_light();
10564  }
10565  
10566  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10567 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/exec.c linux-4.14/fs/exec.c
10568 --- linux-4.14.orig/fs/exec.c   2018-09-05 11:03:29.000000000 +0200
10569 +++ linux-4.14/fs/exec.c        2018-09-05 11:05:07.000000000 +0200
10570 @@ -1025,12 +1025,14 @@
10571                 }
10572         }
10573         task_lock(tsk);
10574 +       preempt_disable_rt();
10575         active_mm = tsk->active_mm;
10576         tsk->mm = mm;
10577         tsk->active_mm = mm;
10578         activate_mm(active_mm, mm);
10579         tsk->mm->vmacache_seqnum = 0;
10580         vmacache_flush(tsk);
10581 +       preempt_enable_rt();
10582         task_unlock(tsk);
10583         if (old_mm) {
10584                 up_read(&old_mm->mmap_sem);
10585 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ext4/page-io.c linux-4.14/fs/ext4/page-io.c
10586 --- linux-4.14.orig/fs/ext4/page-io.c   2017-11-12 19:46:13.000000000 +0100
10587 +++ linux-4.14/fs/ext4/page-io.c        2018-09-05 11:05:07.000000000 +0200
10588 @@ -95,8 +95,7 @@
10589                  * We check all buffers in the page under BH_Uptodate_Lock
10590                  * to avoid races with other end io clearing async_write flags
10591                  */
10592 -               local_irq_save(flags);
10593 -               bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10594 +               flags = bh_uptodate_lock_irqsave(head);
10595                 do {
10596                         if (bh_offset(bh) < bio_start ||
10597                             bh_offset(bh) + bh->b_size > bio_end) {
10598 @@ -108,8 +107,7 @@
10599                         if (bio->bi_status)
10600                                 buffer_io_error(bh);
10601                 } while ((bh = bh->b_this_page) != head);
10602 -               bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10603 -               local_irq_restore(flags);
10604 +               bh_uptodate_unlock_irqrestore(head, flags);
10605                 if (!under_io) {
10606  #ifdef CONFIG_EXT4_FS_ENCRYPTION
10607                         if (data_page)
10608 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/fuse/dir.c linux-4.14/fs/fuse/dir.c
10609 --- linux-4.14.orig/fs/fuse/dir.c       2018-09-05 11:03:22.000000000 +0200
10610 +++ linux-4.14/fs/fuse/dir.c    2018-09-05 11:05:07.000000000 +0200
10611 @@ -1187,7 +1187,7 @@
10612         struct inode *dir = d_inode(parent);
10613         struct fuse_conn *fc;
10614         struct inode *inode;
10615 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10616 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10617  
10618         if (!o->nodeid) {
10619                 /*
10620 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/inode.c linux-4.14/fs/inode.c
10621 --- linux-4.14.orig/fs/inode.c  2018-09-05 11:03:29.000000000 +0200
10622 +++ linux-4.14/fs/inode.c       2018-09-05 11:05:07.000000000 +0200
10623 @@ -154,7 +154,7 @@
10624         inode->i_bdev = NULL;
10625         inode->i_cdev = NULL;
10626         inode->i_link = NULL;
10627 -       inode->i_dir_seq = 0;
10628 +       inode->__i_dir_seq = 0;
10629         inode->i_rdev = 0;
10630         inode->dirtied_when = 0;
10631  
10632 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/libfs.c linux-4.14/fs/libfs.c
10633 --- linux-4.14.orig/fs/libfs.c  2017-11-12 19:46:13.000000000 +0100
10634 +++ linux-4.14/fs/libfs.c       2018-09-05 11:05:07.000000000 +0200
10635 @@ -90,7 +90,7 @@
10636                                     struct list_head *from,
10637                                     int count)
10638  {
10639 -       unsigned *seq = &parent->d_inode->i_dir_seq, n;
10640 +       unsigned *seq = &parent->d_inode->__i_dir_seq, n;
10641         struct dentry *res;
10642         struct list_head *p;
10643         bool skipped;
10644 @@ -123,8 +123,9 @@
10645  static void move_cursor(struct dentry *cursor, struct list_head *after)
10646  {
10647         struct dentry *parent = cursor->d_parent;
10648 -       unsigned n, *seq = &parent->d_inode->i_dir_seq;
10649 +       unsigned n, *seq = &parent->d_inode->__i_dir_seq;
10650         spin_lock(&parent->d_lock);
10651 +       preempt_disable_rt();
10652         for (;;) {
10653                 n = *seq;
10654                 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
10655 @@ -137,6 +138,7 @@
10656         else
10657                 list_add_tail(&cursor->d_child, &parent->d_subdirs);
10658         smp_store_release(seq, n + 2);
10659 +       preempt_enable_rt();
10660         spin_unlock(&parent->d_lock);
10661  }
10662  
10663 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/locks.c linux-4.14/fs/locks.c
10664 --- linux-4.14.orig/fs/locks.c  2017-11-12 19:46:13.000000000 +0100
10665 +++ linux-4.14/fs/locks.c       2018-09-05 11:05:07.000000000 +0200
10666 @@ -945,7 +945,7 @@
10667                         return -ENOMEM;
10668         }
10669  
10670 -       percpu_down_read_preempt_disable(&file_rwsem);
10671 +       percpu_down_read(&file_rwsem);
10672         spin_lock(&ctx->flc_lock);
10673         if (request->fl_flags & FL_ACCESS)
10674                 goto find_conflict;
10675 @@ -986,7 +986,7 @@
10676  
10677  out:
10678         spin_unlock(&ctx->flc_lock);
10679 -       percpu_up_read_preempt_enable(&file_rwsem);
10680 +       percpu_up_read(&file_rwsem);
10681         if (new_fl)
10682                 locks_free_lock(new_fl);
10683         locks_dispose_list(&dispose);
10684 @@ -1023,7 +1023,7 @@
10685                 new_fl2 = locks_alloc_lock();
10686         }
10687  
10688 -       percpu_down_read_preempt_disable(&file_rwsem);
10689 +       percpu_down_read(&file_rwsem);
10690         spin_lock(&ctx->flc_lock);
10691         /*
10692          * New lock request. Walk all POSIX locks and look for conflicts. If
10693 @@ -1195,7 +1195,7 @@
10694         }
10695   out:
10696         spin_unlock(&ctx->flc_lock);
10697 -       percpu_up_read_preempt_enable(&file_rwsem);
10698 +       percpu_up_read(&file_rwsem);
10699         /*
10700          * Free any unused locks.
10701          */
10702 @@ -1470,7 +1470,7 @@
10703                 return error;
10704         }
10705  
10706 -       percpu_down_read_preempt_disable(&file_rwsem);
10707 +       percpu_down_read(&file_rwsem);
10708         spin_lock(&ctx->flc_lock);
10709  
10710         time_out_leases(inode, &dispose);
10711 @@ -1522,13 +1522,13 @@
10712         locks_insert_block(fl, new_fl);
10713         trace_break_lease_block(inode, new_fl);
10714         spin_unlock(&ctx->flc_lock);
10715 -       percpu_up_read_preempt_enable(&file_rwsem);
10716 +       percpu_up_read(&file_rwsem);
10717  
10718         locks_dispose_list(&dispose);
10719         error = wait_event_interruptible_timeout(new_fl->fl_wait,
10720                                                 !new_fl->fl_next, break_time);
10721  
10722 -       percpu_down_read_preempt_disable(&file_rwsem);
10723 +       percpu_down_read(&file_rwsem);
10724         spin_lock(&ctx->flc_lock);
10725         trace_break_lease_unblock(inode, new_fl);
10726         locks_delete_block(new_fl);
10727 @@ -1545,7 +1545,7 @@
10728         }
10729  out:
10730         spin_unlock(&ctx->flc_lock);
10731 -       percpu_up_read_preempt_enable(&file_rwsem);
10732 +       percpu_up_read(&file_rwsem);
10733         locks_dispose_list(&dispose);
10734         locks_free_lock(new_fl);
10735         return error;
10736 @@ -1619,7 +1619,7 @@
10737  
10738         ctx = smp_load_acquire(&inode->i_flctx);
10739         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
10740 -               percpu_down_read_preempt_disable(&file_rwsem);
10741 +               percpu_down_read(&file_rwsem);
10742                 spin_lock(&ctx->flc_lock);
10743                 time_out_leases(inode, &dispose);
10744                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10745 @@ -1629,7 +1629,7 @@
10746                         break;
10747                 }
10748                 spin_unlock(&ctx->flc_lock);
10749 -               percpu_up_read_preempt_enable(&file_rwsem);
10750 +               percpu_up_read(&file_rwsem);
10751  
10752                 locks_dispose_list(&dispose);
10753         }
10754 @@ -1704,7 +1704,7 @@
10755                 return -EINVAL;
10756         }
10757  
10758 -       percpu_down_read_preempt_disable(&file_rwsem);
10759 +       percpu_down_read(&file_rwsem);
10760         spin_lock(&ctx->flc_lock);
10761         time_out_leases(inode, &dispose);
10762         error = check_conflicting_open(dentry, arg, lease->fl_flags);
10763 @@ -1775,7 +1775,7 @@
10764                 lease->fl_lmops->lm_setup(lease, priv);
10765  out:
10766         spin_unlock(&ctx->flc_lock);
10767 -       percpu_up_read_preempt_enable(&file_rwsem);
10768 +       percpu_up_read(&file_rwsem);
10769         locks_dispose_list(&dispose);
10770         if (is_deleg)
10771                 inode_unlock(inode);
10772 @@ -1798,7 +1798,7 @@
10773                 return error;
10774         }
10775  
10776 -       percpu_down_read_preempt_disable(&file_rwsem);
10777 +       percpu_down_read(&file_rwsem);
10778         spin_lock(&ctx->flc_lock);
10779         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
10780                 if (fl->fl_file == filp &&
10781 @@ -1811,7 +1811,7 @@
10782         if (victim)
10783                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
10784         spin_unlock(&ctx->flc_lock);
10785 -       percpu_up_read_preempt_enable(&file_rwsem);
10786 +       percpu_up_read(&file_rwsem);
10787         locks_dispose_list(&dispose);
10788         return error;
10789  }
10790 @@ -2535,13 +2535,13 @@
10791         if (list_empty(&ctx->flc_lease))
10792                 return;
10793  
10794 -       percpu_down_read_preempt_disable(&file_rwsem);
10795 +       percpu_down_read(&file_rwsem);
10796         spin_lock(&ctx->flc_lock);
10797         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
10798                 if (filp == fl->fl_file)
10799                         lease_modify(fl, F_UNLCK, &dispose);
10800         spin_unlock(&ctx->flc_lock);
10801 -       percpu_up_read_preempt_enable(&file_rwsem);
10802 +       percpu_up_read(&file_rwsem);
10803  
10804         locks_dispose_list(&dispose);
10805  }
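
Every fs/locks.c reader of file_rwsem above drops the preempt-disabling percpu-rwsem wrappers: on PREEMPT_RT both the percpu rwsem and ctx->flc_lock are sleeping locks, so the read section has to stay preemptible. The resulting shape, as a sketch (the function name and the elided walk are illustrative; the lock calls are the ones used in the hunks):

	static void flc_walk_sketch(struct file_lock_context *ctx)
	{
		percpu_down_read(&file_rwsem);		/* no preempt_disable() anymore */
		spin_lock(&ctx->flc_lock);
		/* ... walk ctx->flc_posix / ctx->flc_flock / ctx->flc_lease ... */
		spin_unlock(&ctx->flc_lock);
		percpu_up_read(&file_rwsem);
	}
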
10806 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namei.c linux-4.14/fs/namei.c
10807 --- linux-4.14.orig/fs/namei.c  2018-09-05 11:03:22.000000000 +0200
10808 +++ linux-4.14/fs/namei.c       2018-09-05 11:05:07.000000000 +0200
10809 @@ -1627,7 +1627,7 @@
10810  {
10811         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
10812         struct inode *inode = dir->d_inode;
10813 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10814 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10815  
10816         inode_lock_shared(inode);
10817         /* Don't go there if it's already dead */
10818 @@ -3100,7 +3100,7 @@
10819         struct dentry *dentry;
10820         int error, create_error = 0;
10821         umode_t mode = op->mode;
10822 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10823 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10824  
10825         if (unlikely(IS_DEADDIR(dir_inode)))
10826                 return -ENOENT;
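
The on-stack wait queue head handed to d_alloc_parallel() becomes a simple (swait) queue here and at the other call sites below (fs/nfs/dir.c, fs/proc/base.c, fs/proc/proc_sysctl.c), matching the d_wait and d_alloc_parallel() prototype changes in the include/linux/dcache.h hunk. Only the declaration changes for callers; a sketch with an illustrative function name:

	static struct dentry *lookup_sketch(struct dentry *parent,
					    const struct qstr *name)
	{
		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);	/* was DECLARE_WAIT_QUEUE_HEAD_ONSTACK */

		/* d_alloc_parallel() now takes a struct swait_queue_head * */
		return d_alloc_parallel(parent, name, &wq);
	}
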
10827 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/namespace.c linux-4.14/fs/namespace.c
10828 --- linux-4.14.orig/fs/namespace.c      2018-09-05 11:03:29.000000000 +0200
10829 +++ linux-4.14/fs/namespace.c   2018-09-05 11:05:07.000000000 +0200
10830 @@ -14,6 +14,7 @@
10831  #include <linux/mnt_namespace.h>
10832  #include <linux/user_namespace.h>
10833  #include <linux/namei.h>
10834 +#include <linux/delay.h>
10835  #include <linux/security.h>
10836  #include <linux/cred.h>
10837  #include <linux/idr.h>
10838 @@ -353,8 +354,11 @@
10839          * incremented count after it has set MNT_WRITE_HOLD.
10840          */
10841         smp_mb();
10842 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
10843 -               cpu_relax();
10844 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
10845 +               preempt_enable();
10846 +               cpu_chill();
10847 +               preempt_disable();
10848 +       }
10849         /*
10850          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
10851          * be set to match its requirements. So we must not load that until
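
The wait for MNT_WRITE_HOLD used to busy-spin with cpu_relax() while preemption is disabled; on RT that can livelock if the flag holder was preempted on this very CPU. The loop now re-enables preemption and backs off through cpu_chill() (a short forced sleep on RT, plain cpu_relax() otherwise, per the include/linux/delay.h hunk below) before retrying:

	/* Sketch; mnt and the flag test are exactly as in the hunk above. */
	static void wait_write_hold_sketch(struct mount *mnt)
	{
		while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
			preempt_enable();
			cpu_chill();
			preempt_disable();
		}
	}
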
10852 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/delegation.c linux-4.14/fs/nfs/delegation.c
10853 --- linux-4.14.orig/fs/nfs/delegation.c 2017-11-12 19:46:13.000000000 +0100
10854 +++ linux-4.14/fs/nfs/delegation.c      2018-09-05 11:05:07.000000000 +0200
10855 @@ -150,11 +150,11 @@
10856                 sp = state->owner;
10857                 /* Block nfs4_proc_unlck */
10858                 mutex_lock(&sp->so_delegreturn_mutex);
10859 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10860 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
10861                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
10862                 if (!err)
10863                         err = nfs_delegation_claim_locks(ctx, state, stateid);
10864 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10865 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
10866                         err = -EAGAIN;
10867                 mutex_unlock(&sp->so_delegreturn_mutex);
10868                 put_nfs_open_context(ctx);
10869 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/dir.c linux-4.14/fs/nfs/dir.c
10870 --- linux-4.14.orig/fs/nfs/dir.c        2018-09-05 11:03:22.000000000 +0200
10871 +++ linux-4.14/fs/nfs/dir.c     2018-09-05 11:05:07.000000000 +0200
10872 @@ -452,7 +452,7 @@
10873  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
10874  {
10875         struct qstr filename = QSTR_INIT(entry->name, entry->len);
10876 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10877 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10878         struct dentry *dentry;
10879         struct dentry *alias;
10880         struct inode *dir = d_inode(parent);
10881 @@ -1443,7 +1443,7 @@
10882                     struct file *file, unsigned open_flags,
10883                     umode_t mode, int *opened)
10884  {
10885 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10886 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10887         struct nfs_open_context *ctx;
10888         struct dentry *res;
10889         struct iattr attr = { .ia_valid = ATTR_OPEN };
10890 @@ -1763,7 +1763,11 @@
10891  
10892         trace_nfs_rmdir_enter(dir, dentry);
10893         if (d_really_is_positive(dentry)) {
10894 +#ifdef CONFIG_PREEMPT_RT_BASE
10895 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
10896 +#else
10897                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10898 +#endif
10899                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10900                 /* Ensure the VFS deletes this inode */
10901                 switch (error) {
10902 @@ -1773,7 +1777,11 @@
10903                 case -ENOENT:
10904                         nfs_dentry_handle_enoent(dentry);
10905                 }
10906 +#ifdef CONFIG_PREEMPT_RT_BASE
10907 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
10908 +#else
10909                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
10910 +#endif
10911         } else
10912                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
10913         trace_nfs_rmdir_exit(dir, dentry, error);
10914 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/inode.c linux-4.14/fs/nfs/inode.c
10915 --- linux-4.14.orig/fs/nfs/inode.c      2017-11-12 19:46:13.000000000 +0100
10916 +++ linux-4.14/fs/nfs/inode.c   2018-09-05 11:05:07.000000000 +0200
10917 @@ -2014,7 +2014,11 @@
10918         atomic_long_set(&nfsi->nrequests, 0);
10919         atomic_long_set(&nfsi->commit_info.ncommit, 0);
10920         atomic_set(&nfsi->commit_info.rpcs_out, 0);
10921 +#ifdef CONFIG_PREEMPT_RT_BASE
10922 +       sema_init(&nfsi->rmdir_sem, 1);
10923 +#else
10924         init_rwsem(&nfsi->rmdir_sem);
10925 +#endif
10926         mutex_init(&nfsi->commit_mutex);
10927         nfs4_init_once(nfsi);
10928  }
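
rmdir_sem is write-locked in nfs_rmdir() (fs/nfs/dir.c above), but its read side is taken with down_read_non_owner() and released from a different task when the sillydelete RPC completes (fs/nfs/unlink.c below). RT rwsems are rtmutex-based and strictly owner-tracked, so that cross-task release is not possible; under CONFIG_PREEMPT_RT_BASE the field is therefore a plain counting semaphore, which the sema_init()/init_rwsem() split above implies. A condensed sketch of setup plus the write side (the function name is illustrative; nfsi is a struct nfs_inode *):

	static void nfs_rmdir_lock_sketch(struct nfs_inode *nfsi)
	{
	#ifdef CONFIG_PREEMPT_RT_BASE
		sema_init(&nfsi->rmdir_sem, 1);		/* done once at inode init, as in the hunk above */
		down(&nfsi->rmdir_sem);
		/* ... issue the RMDIR RPC ... */
		up(&nfsi->rmdir_sem);
	#else
		init_rwsem(&nfsi->rmdir_sem);
		down_write(&nfsi->rmdir_sem);
		/* ... issue the RMDIR RPC ... */
		up_write(&nfsi->rmdir_sem);
	#endif
	}
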
10929 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4_fs.h linux-4.14/fs/nfs/nfs4_fs.h
10930 --- linux-4.14.orig/fs/nfs/nfs4_fs.h    2018-09-05 11:03:22.000000000 +0200
10931 +++ linux-4.14/fs/nfs/nfs4_fs.h 2018-09-05 11:05:07.000000000 +0200
10932 @@ -112,7 +112,7 @@
10933         unsigned long        so_flags;
10934         struct list_head     so_states;
10935         struct nfs_seqid_counter so_seqid;
10936 -       seqcount_t           so_reclaim_seqcount;
10937 +       seqlock_t            so_reclaim_seqlock;
10938         struct mutex         so_delegreturn_mutex;
10939  };
10940  
10941 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4proc.c linux-4.14/fs/nfs/nfs4proc.c
10942 --- linux-4.14.orig/fs/nfs/nfs4proc.c   2018-09-05 11:03:22.000000000 +0200
10943 +++ linux-4.14/fs/nfs/nfs4proc.c        2018-09-05 11:05:07.000000000 +0200
10944 @@ -2689,7 +2689,7 @@
10945         unsigned int seq;
10946         int ret;
10947  
10948 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
10949 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10950  
10951         ret = _nfs4_proc_open(opendata);
10952         if (ret != 0)
10953 @@ -2727,7 +2727,7 @@
10954  
10955         if (d_inode(dentry) == state->inode) {
10956                 nfs_inode_attach_open_context(ctx);
10957 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
10958 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
10959                         nfs4_schedule_stateid_recovery(server, state);
10960         }
10961  out:
10962 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/nfs4state.c linux-4.14/fs/nfs/nfs4state.c
10963 --- linux-4.14.orig/fs/nfs/nfs4state.c  2018-09-05 11:03:22.000000000 +0200
10964 +++ linux-4.14/fs/nfs/nfs4state.c       2018-09-05 11:05:07.000000000 +0200
10965 @@ -494,7 +494,7 @@
10966         nfs4_init_seqid_counter(&sp->so_seqid);
10967         atomic_set(&sp->so_count, 1);
10968         INIT_LIST_HEAD(&sp->so_lru);
10969 -       seqcount_init(&sp->so_reclaim_seqcount);
10970 +       seqlock_init(&sp->so_reclaim_seqlock);
10971         mutex_init(&sp->so_delegreturn_mutex);
10972         return sp;
10973  }
10974 @@ -1519,8 +1519,12 @@
10975          * recovering after a network partition or a reboot from a
10976          * server that doesn't support a grace period.
10977          */
10978 +#ifdef CONFIG_PREEMPT_RT_FULL
10979 +       write_seqlock(&sp->so_reclaim_seqlock);
10980 +#else
10981 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
10982 +#endif
10983         spin_lock(&sp->so_lock);
10984 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
10985  restart:
10986         list_for_each_entry(state, &sp->so_states, open_states) {
10987                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
10988 @@ -1589,14 +1593,20 @@
10989                 spin_lock(&sp->so_lock);
10990                 goto restart;
10991         }
10992 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
10993         spin_unlock(&sp->so_lock);
10994 +#ifdef CONFIG_PREEMPT_RT_FULL
10995 +       write_sequnlock(&sp->so_reclaim_seqlock);
10996 +#else
10997 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
10998 +#endif
10999         return 0;
11000  out_err:
11001         nfs4_put_open_state(state);
11002 -       spin_lock(&sp->so_lock);
11003 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11004 -       spin_unlock(&sp->so_lock);
11005 +#ifdef CONFIG_PREEMPT_RT_FULL
11006 +       write_sequnlock(&sp->so_reclaim_seqlock);
11007 +#else
11008 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11009 +#endif
11010         return status;
11011  }
11012  
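
so_reclaim_seqcount becomes a full seqlock_t. The write section in nfs4_reclaim_open_state() spans code that takes and drops sp->so_lock (a sleeping lock on RT) and can itself be preempted there, so a bare seqcount writer could stall mid-update and leave readers spinning in read_seqbegin()/read_seqretry() forever; on RT the writer therefore takes the real write_seqlock(), while !RT keeps the cheap write_seqcount_begin() on the embedded counter. Readers (fs/nfs/delegation.c and nfs4proc.c above) are the same either way. Both sides condensed into a sketch with illustrative function names:

	static void reclaim_write_side_sketch(struct nfs4_state_owner *sp)
	{
	#ifdef CONFIG_PREEMPT_RT_FULL
		write_seqlock(&sp->so_reclaim_seqlock);
	#else
		write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
	#endif
		/* ... walk sp->so_states and reclaim open state ... */
	#ifdef CONFIG_PREEMPT_RT_FULL
		write_sequnlock(&sp->so_reclaim_seqlock);
	#else
		write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
	#endif
	}

	static int reclaim_read_side_sketch(struct nfs4_state_owner *sp)
	{
		unsigned int seq = read_seqbegin(&sp->so_reclaim_seqlock);

		/* ... recall/claim work ... */
		return read_seqretry(&sp->so_reclaim_seqlock, seq) ? -EAGAIN : 0;
	}
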
11013 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/nfs/unlink.c linux-4.14/fs/nfs/unlink.c
11014 --- linux-4.14.orig/fs/nfs/unlink.c     2017-11-12 19:46:13.000000000 +0100
11015 +++ linux-4.14/fs/nfs/unlink.c  2018-09-05 11:05:07.000000000 +0200
11016 @@ -13,7 +13,7 @@
11017  #include <linux/sunrpc/clnt.h>
11018  #include <linux/nfs_fs.h>
11019  #include <linux/sched.h>
11020 -#include <linux/wait.h>
11021 +#include <linux/swait.h>
11022  #include <linux/namei.h>
11023  #include <linux/fsnotify.h>
11024  
11025 @@ -52,6 +52,29 @@
11026                 rpc_restart_call_prepare(task);
11027  }
11028  
11029 +#ifdef CONFIG_PREEMPT_RT_BASE
11030 +static void nfs_down_anon(struct semaphore *sema)
11031 +{
11032 +       down(sema);
11033 +}
11034 +
11035 +static void nfs_up_anon(struct semaphore *sema)
11036 +{
11037 +       up(sema);
11038 +}
11039 +
11040 +#else
11041 +static void nfs_down_anon(struct rw_semaphore *rwsem)
11042 +{
11043 +       down_read_non_owner(rwsem);
11044 +}
11045 +
11046 +static void nfs_up_anon(struct rw_semaphore *rwsem)
11047 +{
11048 +       up_read_non_owner(rwsem);
11049 +}
11050 +#endif
11051 +
11052  /**
11053   * nfs_async_unlink_release - Release the sillydelete data.
11054   * @task: rpc_task of the sillydelete
11055 @@ -65,7 +88,7 @@
11056         struct dentry *dentry = data->dentry;
11057         struct super_block *sb = dentry->d_sb;
11058  
11059 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11060 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11061         d_lookup_done(dentry);
11062         nfs_free_unlinkdata(data);
11063         dput(dentry);
11064 @@ -118,10 +141,10 @@
11065         struct inode *dir = d_inode(dentry->d_parent);
11066         struct dentry *alias;
11067  
11068 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
11069 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
11070         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
11071         if (IS_ERR(alias)) {
11072 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11073 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11074                 return 0;
11075         }
11076         if (!d_in_lookup(alias)) {
11077 @@ -143,7 +166,7 @@
11078                         ret = 0;
11079                 spin_unlock(&alias->d_lock);
11080                 dput(alias);
11081 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11082 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11083                 /*
11084                  * If we'd displaced old cached devname, free it.  At that
11085                  * point dentry is definitely not a root, so we won't need
11086 @@ -183,7 +206,7 @@
11087                 goto out_free_name;
11088         }
11089         data->res.dir_attr = &data->dir_attr;
11090 -       init_waitqueue_head(&data->wq);
11091 +       init_swait_queue_head(&data->wq);
11092  
11093         status = -EBUSY;
11094         spin_lock(&dentry->d_lock);
11095 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/ntfs/aops.c linux-4.14/fs/ntfs/aops.c
11096 --- linux-4.14.orig/fs/ntfs/aops.c      2017-11-12 19:46:13.000000000 +0100
11097 +++ linux-4.14/fs/ntfs/aops.c   2018-09-05 11:05:07.000000000 +0200
11098 @@ -93,13 +93,13 @@
11099                         ofs = 0;
11100                         if (file_ofs < init_size)
11101                                 ofs = init_size - file_ofs;
11102 -                       local_irq_save(flags);
11103 +                       local_irq_save_nort(flags);
11104                         kaddr = kmap_atomic(page);
11105                         memset(kaddr + bh_offset(bh) + ofs, 0,
11106                                         bh->b_size - ofs);
11107                         flush_dcache_page(page);
11108                         kunmap_atomic(kaddr);
11109 -                       local_irq_restore(flags);
11110 +                       local_irq_restore_nort(flags);
11111                 }
11112         } else {
11113                 clear_buffer_uptodate(bh);
11114 @@ -108,8 +108,7 @@
11115                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11116         }
11117         first = page_buffers(page);
11118 -       local_irq_save(flags);
11119 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11120 +       flags = bh_uptodate_lock_irqsave(first);
11121         clear_buffer_async_read(bh);
11122         unlock_buffer(bh);
11123         tmp = bh;
11124 @@ -124,8 +123,7 @@
11125                 }
11126                 tmp = tmp->b_this_page;
11127         } while (tmp != bh);
11128 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11129 -       local_irq_restore(flags);
11130 +       bh_uptodate_unlock_irqrestore(first, flags);
11131         /*
11132          * If none of the buffers had errors then we can set the page uptodate,
11133          * but we first have to perform the post read mst fixups, if the
11134 @@ -146,13 +144,13 @@
11135                 recs = PAGE_SIZE / rec_size;
11136                 /* Should have been verified before we got here... */
11137                 BUG_ON(!recs);
11138 -               local_irq_save(flags);
11139 +               local_irq_save_nort(flags);
11140                 kaddr = kmap_atomic(page);
11141                 for (i = 0; i < recs; i++)
11142                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11143                                         i * rec_size), rec_size);
11144                 kunmap_atomic(kaddr);
11145 -               local_irq_restore(flags);
11146 +               local_irq_restore_nort(flags);
11147                 flush_dcache_page(page);
11148                 if (likely(page_uptodate && !PageError(page)))
11149                         SetPageUptodate(page);
11150 @@ -160,9 +158,7 @@
11151         unlock_page(page);
11152         return;
11153  still_busy:
11154 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11155 -       local_irq_restore(flags);
11156 -       return;
11157 +       bh_uptodate_unlock_irqrestore(first, flags);
11158  }
11159  
11160  /**
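
Two independent conversions in fs/ntfs/aops.c above: the kmap_atomic() sections switch to local_irq_save_nort()/local_irq_restore_nort(), which disable interrupts only on !RT and leave the section preemptible on RT, and the BH_Uptodate_Lock bit-spinlock is replaced by the bh_uptodate_lock_irqsave() helpers added in the include/linux/buffer_head.h hunk below. A sketch of the first pattern (the function name and parameters are illustrative):

	static void zero_page_tail_sketch(struct page *page, unsigned int ofs,
					  size_t len)
	{
		unsigned long flags;
		char *kaddr;

		local_irq_save_nort(flags);	/* real local_irq_save() only on !RT */
		kaddr = kmap_atomic(page);
		memset(kaddr + ofs, 0, len);
		flush_dcache_page(page);
		kunmap_atomic(kaddr);
		local_irq_restore_nort(flags);
	}
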
11161 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/array.c linux-4.14/fs/proc/array.c
11162 --- linux-4.14.orig/fs/proc/array.c     2018-09-05 11:03:22.000000000 +0200
11163 +++ linux-4.14/fs/proc/array.c  2018-09-05 11:05:07.000000000 +0200
11164 @@ -386,9 +386,9 @@
11165  static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
11166  {
11167         seq_printf(m, "Cpus_allowed:\t%*pb\n",
11168 -                  cpumask_pr_args(&task->cpus_allowed));
11169 +                  cpumask_pr_args(task->cpus_ptr));
11170         seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
11171 -                  cpumask_pr_args(&task->cpus_allowed));
11172 +                  cpumask_pr_args(task->cpus_ptr));
11173  }
11174  
11175  int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
11176 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/base.c linux-4.14/fs/proc/base.c
11177 --- linux-4.14.orig/fs/proc/base.c      2018-09-05 11:03:28.000000000 +0200
11178 +++ linux-4.14/fs/proc/base.c   2018-09-05 11:05:07.000000000 +0200
11179 @@ -1886,7 +1886,7 @@
11180  
11181         child = d_hash_and_lookup(dir, &qname);
11182         if (!child) {
11183 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11184 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11185                 child = d_alloc_parallel(dir, &qname, &wq);
11186                 if (IS_ERR(child))
11187                         goto end_instantiate;
11188 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/proc/proc_sysctl.c linux-4.14/fs/proc/proc_sysctl.c
11189 --- linux-4.14.orig/fs/proc/proc_sysctl.c       2018-09-05 11:03:22.000000000 +0200
11190 +++ linux-4.14/fs/proc/proc_sysctl.c    2018-09-05 11:05:07.000000000 +0200
11191 @@ -679,7 +679,7 @@
11192  
11193         child = d_lookup(dir, &qname);
11194         if (!child) {
11195 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11196 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11197                 child = d_alloc_parallel(dir, &qname, &wq);
11198                 if (IS_ERR(child))
11199                         return false;
11200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/timerfd.c linux-4.14/fs/timerfd.c
11201 --- linux-4.14.orig/fs/timerfd.c        2017-11-12 19:46:13.000000000 +0100
11202 +++ linux-4.14/fs/timerfd.c     2018-09-05 11:05:07.000000000 +0200
11203 @@ -471,7 +471,10 @@
11204                                 break;
11205                 }
11206                 spin_unlock_irq(&ctx->wqh.lock);
11207 -               cpu_relax();
11208 +               if (isalarm(ctx))
11209 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11210 +               else
11211 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11212         }
11213  
11214         /*
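
timerfd_settime()'s retry loop used to spin with cpu_relax() until a concurrently running hrtimer callback had finished; with callbacks running from softirq (and, on RT, from a preemptible thread) that spin can wait on a preempted callback. The loop now sleeps in hrtimer_wait_for_timer(), declared in the include/linux/hrtimer.h hunk below. The generic cancel-and-wait pattern, as a sketch (the function name is illustrative; the hunk above applies the same idea to the timerfd alarm and hrtimer):

	static void hrtimer_cancel_wait_sketch(struct hrtimer *timer)
	{
		while (hrtimer_try_to_cancel(timer) < 0)	/* < 0: callback still running */
			hrtimer_wait_for_timer(timer);		/* sleeps on RT, cpu_relax() on !RT */
	}
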
11215 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/fs/xfs/xfs_aops.c linux-4.14/fs/xfs/xfs_aops.c
11216 --- linux-4.14.orig/fs/xfs/xfs_aops.c   2018-09-05 11:03:22.000000000 +0200
11217 +++ linux-4.14/fs/xfs/xfs_aops.c        2018-09-05 11:05:07.000000000 +0200
11218 @@ -120,8 +120,7 @@
11219         ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
11220         ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
11221  
11222 -       local_irq_save(flags);
11223 -       bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11224 +       flags = bh_uptodate_lock_irqsave(head);
11225         do {
11226                 if (off >= bvec->bv_offset &&
11227                     off < bvec->bv_offset + bvec->bv_len) {
11228 @@ -143,8 +142,7 @@
11229                 }
11230                 off += bh->b_size;
11231         } while ((bh = bh->b_this_page) != head);
11232 -       bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11233 -       local_irq_restore(flags);
11234 +       bh_uptodate_unlock_irqrestore(head, flags);
11235  
11236         if (!busy)
11237                 end_page_writeback(bvec->bv_page);
11238 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/acpi/platform/aclinux.h linux-4.14/include/acpi/platform/aclinux.h
11239 --- linux-4.14.orig/include/acpi/platform/aclinux.h     2017-11-12 19:46:13.000000000 +0100
11240 +++ linux-4.14/include/acpi/platform/aclinux.h  2018-09-05 11:05:07.000000000 +0200
11241 @@ -134,6 +134,7 @@
11242  
11243  #define acpi_cache_t                        struct kmem_cache
11244  #define acpi_spinlock                       spinlock_t *
11245 +#define acpi_raw_spinlock              raw_spinlock_t *
11246  #define acpi_cpu_flags                      unsigned long
11247  
11248  /* Use native linux version of acpi_os_allocate_zeroed */
11249 @@ -152,6 +153,20 @@
11250  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11251  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11252  
11253 +#define acpi_os_create_raw_lock(__handle)                      \
11254 +({                                                             \
11255 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11256 +                                                               \
11257 +        if (lock) {                                            \
11258 +               *(__handle) = lock;                             \
11259 +               raw_spin_lock_init(*(__handle));                \
11260 +        }                                                      \
11261 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11262 + })
11263 +
11264 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
11265 +
11266 +
11267  /*
11268   * OSL interfaces used by debugger/disassembler
11269   */
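
On RT the regular ACPI OSL spinlocks become sleeping locks, but a few low-level ACPI paths still need a lock that truly spins; this hunk adds a raw-spinlock flavour to the OSL. acpi_os_create_raw_lock() allocates a raw_spinlock_t, initializes it and returns the pointer as the handle, and since the handle is just a raw_spinlock_t *, the stock raw spinlock API works on it directly. An illustrative sketch (the function, the lock variable and the critical section are assumptions, not taken from the patch):

	static void acpi_raw_lock_sketch(void)
	{
		acpi_raw_spinlock lock;		/* raw_spinlock_t *, per the #define above */
		unsigned long flags;

		if (ACPI_FAILURE(acpi_os_create_raw_lock(&lock)))
			return;

		raw_spin_lock_irqsave(lock, flags);
		/* ... short, non-sleeping critical section ... */
		raw_spin_unlock_irqrestore(lock, flags);

		acpi_os_delete_raw_lock(lock);	/* kfree()s the allocation */
	}
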
11270 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/asm-generic/bug.h linux-4.14/include/asm-generic/bug.h
11271 --- linux-4.14.orig/include/asm-generic/bug.h   2018-09-05 11:03:22.000000000 +0200
11272 +++ linux-4.14/include/asm-generic/bug.h        2018-09-05 11:05:07.000000000 +0200
11273 @@ -234,6 +234,20 @@
11274  # define WARN_ON_SMP(x)                        ({0;})
11275  #endif
11276  
11277 +#ifdef CONFIG_PREEMPT_RT_BASE
11278 +# define BUG_ON_RT(c)                  BUG_ON(c)
11279 +# define BUG_ON_NONRT(c)               do { } while (0)
11280 +# define WARN_ON_RT(condition)         WARN_ON(condition)
11281 +# define WARN_ON_NONRT(condition)      do { } while (0)
11282 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11283 +#else
11284 +# define BUG_ON_RT(c)                  do { } while (0)
11285 +# define BUG_ON_NONRT(c)               BUG_ON(c)
11286 +# define WARN_ON_RT(condition)         do { } while (0)
11287 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
11288 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11289 +#endif
11290 +
11291  #endif /* __ASSEMBLY__ */
11292  
11293  #endif
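
The BUG_ON_RT()/WARN_ON_NONRT() family lets code keep assertions that only make sense for one preemption model: the *_NONRT variants compile out under CONFIG_PREEMPT_RT_BASE (where, for instance, many "interrupts must be off here" expectations no longer hold), and the *_RT variants compile out everywhere else. A small illustrative use; the conditions are examples, not taken from the patch:

	static void context_assert_sketch(void)
	{
		/* Checked on !RT only, where this path really runs with irqs off. */
		WARN_ON_NONRT(!irqs_disabled());

		/* Checked on RT only, where this path must remain preemptible. */
		WARN_ON_RT(in_atomic());
	}
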
11294 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blkdev.h linux-4.14/include/linux/blkdev.h
11295 --- linux-4.14.orig/include/linux/blkdev.h      2018-09-05 11:03:22.000000000 +0200
11296 +++ linux-4.14/include/linux/blkdev.h   2018-09-05 11:05:07.000000000 +0200
11297 @@ -27,6 +27,7 @@
11298  #include <linux/percpu-refcount.h>
11299  #include <linux/scatterlist.h>
11300  #include <linux/blkzoned.h>
11301 +#include <linux/swork.h>
11302  
11303  struct module;
11304  struct scsi_ioctl_command;
11305 @@ -134,6 +135,9 @@
11306   */
11307  struct request {
11308         struct list_head queuelist;
11309 +#ifdef CONFIG_PREEMPT_RT_FULL
11310 +       struct work_struct work;
11311 +#endif
11312         union {
11313                 struct __call_single_data csd;
11314                 u64 fifo_time;
11315 @@ -596,6 +600,7 @@
11316  #endif
11317         struct rcu_head         rcu_head;
11318         wait_queue_head_t       mq_freeze_wq;
11319 +       struct swork_event      mq_pcpu_wake;
11320         struct percpu_ref       q_usage_counter;
11321         struct list_head        all_q_node;
11322  
11323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/blk-mq.h linux-4.14/include/linux/blk-mq.h
11324 --- linux-4.14.orig/include/linux/blk-mq.h      2017-11-12 19:46:13.000000000 +0100
11325 +++ linux-4.14/include/linux/blk-mq.h   2018-09-05 11:05:07.000000000 +0200
11326 @@ -226,7 +226,7 @@
11327         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
11328  }
11329  
11330 -
11331 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11332  int blk_mq_request_started(struct request *rq);
11333  void blk_mq_start_request(struct request *rq);
11334  void blk_mq_end_request(struct request *rq, blk_status_t error);
11335 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/bottom_half.h linux-4.14/include/linux/bottom_half.h
11336 --- linux-4.14.orig/include/linux/bottom_half.h 2017-11-12 19:46:13.000000000 +0100
11337 +++ linux-4.14/include/linux/bottom_half.h      2018-09-05 11:05:07.000000000 +0200
11338 @@ -4,6 +4,39 @@
11339  
11340  #include <linux/preempt.h>
11341  
11342 +#ifdef CONFIG_PREEMPT_RT_FULL
11343 +
11344 +extern void __local_bh_disable(void);
11345 +extern void _local_bh_enable(void);
11346 +extern void __local_bh_enable(void);
11347 +
11348 +static inline void local_bh_disable(void)
11349 +{
11350 +       __local_bh_disable();
11351 +}
11352 +
11353 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11354 +{
11355 +       __local_bh_disable();
11356 +}
11357 +
11358 +static inline void local_bh_enable(void)
11359 +{
11360 +       __local_bh_enable();
11361 +}
11362 +
11363 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11364 +{
11365 +       __local_bh_enable();
11366 +}
11367 +
11368 +static inline void local_bh_enable_ip(unsigned long ip)
11369 +{
11370 +       __local_bh_enable();
11371 +}
11372 +
11373 +#else
11374 +
11375  #ifdef CONFIG_TRACE_IRQFLAGS
11376  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11377  #else
11378 @@ -31,5 +64,6 @@
11379  {
11380         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11381  }
11382 +#endif
11383  
11384  #endif /* _LINUX_BH_H */
11385 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/buffer_head.h linux-4.14/include/linux/buffer_head.h
11386 --- linux-4.14.orig/include/linux/buffer_head.h 2017-11-12 19:46:13.000000000 +0100
11387 +++ linux-4.14/include/linux/buffer_head.h      2018-09-05 11:05:07.000000000 +0200
11388 @@ -76,8 +76,50 @@
11389         struct address_space *b_assoc_map;      /* mapping this buffer is
11390                                                    associated with */
11391         atomic_t b_count;               /* users using this buffer_head */
11392 +#ifdef CONFIG_PREEMPT_RT_BASE
11393 +       spinlock_t b_uptodate_lock;
11394 +#if IS_ENABLED(CONFIG_JBD2)
11395 +       spinlock_t b_state_lock;
11396 +       spinlock_t b_journal_head_lock;
11397 +#endif
11398 +#endif
11399  };
11400  
11401 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11402 +{
11403 +       unsigned long flags;
11404 +
11405 +#ifndef CONFIG_PREEMPT_RT_BASE
11406 +       local_irq_save(flags);
11407 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11408 +#else
11409 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11410 +#endif
11411 +       return flags;
11412 +}
11413 +
11414 +static inline void
11415 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11416 +{
11417 +#ifndef CONFIG_PREEMPT_RT_BASE
11418 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11419 +       local_irq_restore(flags);
11420 +#else
11421 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11422 +#endif
11423 +}
11424 +
11425 +static inline void buffer_head_init_locks(struct buffer_head *bh)
11426 +{
11427 +#ifdef CONFIG_PREEMPT_RT_BASE
11428 +       spin_lock_init(&bh->b_uptodate_lock);
11429 +#if IS_ENABLED(CONFIG_JBD2)
11430 +       spin_lock_init(&bh->b_state_lock);
11431 +       spin_lock_init(&bh->b_journal_head_lock);
11432 +#endif
11433 +#endif
11434 +}
11435 +
11436  /*
11437   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11438   * and buffer_foo() functions.
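
bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() hide the RT decision: on !RT they are the old local_irq_save() plus bit_spin_lock(BH_Uptodate_Lock) sequence, on RT they take the per-buffer_head b_uptodate_lock spinlock so the section is preemptible. buffer_head_init_locks() has to run when a buffer_head is set up; the fs/buffer.c side of that is not part of this excerpt. The ntfs and xfs end_io hunks above are callers; the shape of such a caller, as a sketch (the function name and the exact buffer-state checks are illustrative):

	static void end_read_sketch(struct buffer_head *bh, int uptodate)
	{
		struct buffer_head *first = page_buffers(bh->b_page);
		struct buffer_head *tmp;
		unsigned long flags;

		flags = bh_uptodate_lock_irqsave(first);
		if (uptodate)
			set_buffer_uptodate(bh);
		clear_buffer_async_read(bh);
		for (tmp = bh->b_this_page; tmp != bh; tmp = tmp->b_this_page) {
			if (buffer_async_read(tmp) && buffer_locked(tmp))
				break;		/* another buffer on the page is still in flight */
		}
		bh_uptodate_unlock_irqrestore(first, flags);
	}
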
11439 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cgroup-defs.h linux-4.14/include/linux/cgroup-defs.h
11440 --- linux-4.14.orig/include/linux/cgroup-defs.h 2018-09-05 11:03:22.000000000 +0200
11441 +++ linux-4.14/include/linux/cgroup-defs.h      2018-09-05 11:05:07.000000000 +0200
11442 @@ -19,6 +19,7 @@
11443  #include <linux/percpu-rwsem.h>
11444  #include <linux/workqueue.h>
11445  #include <linux/bpf-cgroup.h>
11446 +#include <linux/swork.h>
11447  
11448  #ifdef CONFIG_CGROUPS
11449  
11450 @@ -152,6 +153,7 @@
11451         /* percpu_ref killing and RCU release */
11452         struct rcu_head rcu_head;
11453         struct work_struct destroy_work;
11454 +       struct swork_event destroy_swork;
11455  
11456         /*
11457          * PI: the parent css.  Placed here for cache proximity to following
11458 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/completion.h linux-4.14/include/linux/completion.h
11459 --- linux-4.14.orig/include/linux/completion.h  2017-11-12 19:46:13.000000000 +0100
11460 +++ linux-4.14/include/linux/completion.h       2018-09-05 11:05:07.000000000 +0200
11461 @@ -9,7 +9,7 @@
11462   * See kernel/sched/completion.c for details.
11463   */
11464  
11465 -#include <linux/wait.h>
11466 +#include <linux/swait.h>
11467  #ifdef CONFIG_LOCKDEP_COMPLETIONS
11468  #include <linux/lockdep.h>
11469  #endif
11470 @@ -28,7 +28,7 @@
11471   */
11472  struct completion {
11473         unsigned int done;
11474 -       wait_queue_head_t wait;
11475 +       struct swait_queue_head wait;
11476  #ifdef CONFIG_LOCKDEP_COMPLETIONS
11477         struct lockdep_map_cross map;
11478  #endif
11479 @@ -67,11 +67,11 @@
11480  
11481  #ifdef CONFIG_LOCKDEP_COMPLETIONS
11482  #define COMPLETION_INITIALIZER(work) \
11483 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11484 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11485         STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
11486  #else
11487  #define COMPLETION_INITIALIZER(work) \
11488 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11489 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11490  #endif
11491  
11492  #define COMPLETION_INITIALIZER_ONSTACK(work) \
11493 @@ -117,7 +117,7 @@
11494  static inline void __init_completion(struct completion *x)
11495  {
11496         x->done = 0;
11497 -       init_waitqueue_head(&x->wait);
11498 +       init_swait_queue_head(&x->wait);
11499  }
11500  
11501  /**
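
Completions switch their backing wait queue from wait_queue_head_t to struct swait_queue_head. Simple wait queues run under a raw spinlock, so complete() remains callable from hard-irq and other truly atomic contexts on RT. Nothing changes for users of the completion API; a sketch with an illustrative context structure:

	struct io_ctx_sketch {
		struct completion done;
		int status;
	};

	static void io_finish_sketch(struct io_ctx_sketch *ctx, int status)
	{
		ctx->status = status;
		complete(&ctx->done);		/* swait-based wakeup underneath */
	}

	static int io_wait_sketch(struct io_ctx_sketch *ctx)
	{
		init_completion(&ctx->done);
		/* ... submit work that ends in io_finish_sketch() ... */
		wait_for_completion(&ctx->done);
		return ctx->status;
	}
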
11502 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/cpu.h linux-4.14/include/linux/cpu.h
11503 --- linux-4.14.orig/include/linux/cpu.h 2018-09-05 11:03:22.000000000 +0200
11504 +++ linux-4.14/include/linux/cpu.h      2018-09-05 11:05:07.000000000 +0200
11505 @@ -120,6 +120,8 @@
11506  extern void cpu_hotplug_enable(void);
11507  void clear_tasks_mm_cpumask(int cpu);
11508  int cpu_down(unsigned int cpu);
11509 +extern void pin_current_cpu(void);
11510 +extern void unpin_current_cpu(void);
11511  
11512  #else /* CONFIG_HOTPLUG_CPU */
11513  
11514 @@ -130,6 +132,9 @@
11515  static inline void lockdep_assert_cpus_held(void) { }
11516  static inline void cpu_hotplug_disable(void) { }
11517  static inline void cpu_hotplug_enable(void) { }
11518 +static inline void pin_current_cpu(void) { }
11519 +static inline void unpin_current_cpu(void) { }
11520 +
11521  #endif /* !CONFIG_HOTPLUG_CPU */
11522  
11523  /* Wrappers which go away once all code is converted */
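
pin_current_cpu()/unpin_current_cpu() belong to the RT CPU-hotplug scheme: a task that must not have its current CPU taken away pins it, and the unplug path waits for (or works around) the pinners; on !HOTPLUG_CPU builds they are empty stubs, as above. Illustrative use only, the section body is an assumption:

	static void percpu_section_sketch(void)
	{
		pin_current_cpu();	/* hold off cpu_down() of this CPU */
		/* ... work that relies on this CPU staying online ... */
		unpin_current_cpu();
	}
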
11524 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/dcache.h linux-4.14/include/linux/dcache.h
11525 --- linux-4.14.orig/include/linux/dcache.h      2018-09-05 11:03:22.000000000 +0200
11526 +++ linux-4.14/include/linux/dcache.h   2018-09-05 11:05:07.000000000 +0200
11527 @@ -107,7 +107,7 @@
11528  
11529         union {
11530                 struct list_head d_lru;         /* LRU list */
11531 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
11532 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
11533         };
11534         struct list_head d_child;       /* child of parent list */
11535         struct list_head d_subdirs;     /* our children */
11536 @@ -238,7 +238,7 @@
11537  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
11538  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
11539  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
11540 -                                       wait_queue_head_t *);
11541 +                                       struct swait_queue_head *);
11542  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
11543  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
11544  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
11545 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/delay.h linux-4.14/include/linux/delay.h
11546 --- linux-4.14.orig/include/linux/delay.h       2017-11-12 19:46:13.000000000 +0100
11547 +++ linux-4.14/include/linux/delay.h    2018-09-05 11:05:07.000000000 +0200
11548 @@ -64,4 +64,10 @@
11549         msleep(seconds * 1000);
11550  }
11551  
11552 +#ifdef CONFIG_PREEMPT_RT_FULL
11553 +extern void cpu_chill(void);
11554 +#else
11555 +# define cpu_chill()   cpu_relax()
11556 +#endif
11557 +
11558  #endif /* defined(_LINUX_DELAY_H) */
11559 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/fs.h linux-4.14/include/linux/fs.h
11560 --- linux-4.14.orig/include/linux/fs.h  2018-09-05 11:03:29.000000000 +0200
11561 +++ linux-4.14/include/linux/fs.h       2018-09-05 11:05:07.000000000 +0200
11562 @@ -655,7 +655,7 @@
11563                 struct block_device     *i_bdev;
11564                 struct cdev             *i_cdev;
11565                 char                    *i_link;
11566 -               unsigned                i_dir_seq;
11567 +               unsigned                __i_dir_seq;
11568         };
11569  
11570         __u32                   i_generation;
11571 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/highmem.h linux-4.14/include/linux/highmem.h
11572 --- linux-4.14.orig/include/linux/highmem.h     2017-11-12 19:46:13.000000000 +0100
11573 +++ linux-4.14/include/linux/highmem.h  2018-09-05 11:05:07.000000000 +0200
11574 @@ -8,6 +8,7 @@
11575  #include <linux/mm.h>
11576  #include <linux/uaccess.h>
11577  #include <linux/hardirq.h>
11578 +#include <linux/sched.h>
11579  
11580  #include <asm/cacheflush.h>
11581  
11582 @@ -66,7 +67,7 @@
11583  
11584  static inline void *kmap_atomic(struct page *page)
11585  {
11586 -       preempt_disable();
11587 +       preempt_disable_nort();
11588         pagefault_disable();
11589         return page_address(page);
11590  }
11591 @@ -75,7 +76,7 @@
11592  static inline void __kunmap_atomic(void *addr)
11593  {
11594         pagefault_enable();
11595 -       preempt_enable();
11596 +       preempt_enable_nort();
11597  }
11598  
11599  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
11600 @@ -87,32 +88,51 @@
11601  
11602  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
11603  
11604 +#ifndef CONFIG_PREEMPT_RT_FULL
11605  DECLARE_PER_CPU(int, __kmap_atomic_idx);
11606 +#endif
11607  
11608  static inline int kmap_atomic_idx_push(void)
11609  {
11610 +#ifndef CONFIG_PREEMPT_RT_FULL
11611         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
11612  
11613 -#ifdef CONFIG_DEBUG_HIGHMEM
11614 +# ifdef CONFIG_DEBUG_HIGHMEM
11615         WARN_ON_ONCE(in_irq() && !irqs_disabled());
11616         BUG_ON(idx >= KM_TYPE_NR);
11617 -#endif
11618 +# endif
11619         return idx;
11620 +#else
11621 +       current->kmap_idx++;
11622 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
11623 +       return current->kmap_idx - 1;
11624 +#endif
11625  }
11626  
11627  static inline int kmap_atomic_idx(void)
11628  {
11629 +#ifndef CONFIG_PREEMPT_RT_FULL
11630         return __this_cpu_read(__kmap_atomic_idx) - 1;
11631 +#else
11632 +       return current->kmap_idx - 1;
11633 +#endif
11634  }
11635  
11636  static inline void kmap_atomic_idx_pop(void)
11637  {
11638 -#ifdef CONFIG_DEBUG_HIGHMEM
11639 +#ifndef CONFIG_PREEMPT_RT_FULL
11640 +# ifdef CONFIG_DEBUG_HIGHMEM
11641         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
11642  
11643         BUG_ON(idx < 0);
11644 -#else
11645 +# else
11646         __this_cpu_dec(__kmap_atomic_idx);
11647 +# endif
11648 +#else
11649 +       current->kmap_idx--;
11650 +# ifdef CONFIG_DEBUG_HIGHMEM
11651 +       BUG_ON(current->kmap_idx < 0);
11652 +# endif
11653  #endif
11654  }
11655  
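
The kmap_atomic() nesting index moves from a per-CPU counter into the task (current->kmap_idx) on RT: kmap_atomic() no longer disables preemption there (see the preempt_disable_nort() change above), so a task can be preempted in the middle of an atomic kmap and the mapping bookkeeping has to follow the task rather than the CPU. Callers still nest and unwind in LIFO order exactly as before:

	/* Illustrative nested use; the index is per-CPU on !RT, per-task on RT. */
	static void copy_highpage_sketch(struct page *dst, struct page *src)
	{
		void *vto = kmap_atomic(dst);
		void *vfrom = kmap_atomic(src);		/* nests: pushes a second index */

		memcpy(vto, vfrom, PAGE_SIZE);

		kunmap_atomic(vfrom);			/* pop in reverse order */
		kunmap_atomic(vto);
	}
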
11656 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/hrtimer.h linux-4.14/include/linux/hrtimer.h
11657 --- linux-4.14.orig/include/linux/hrtimer.h     2017-11-12 19:46:13.000000000 +0100
11658 +++ linux-4.14/include/linux/hrtimer.h  2018-09-05 11:05:07.000000000 +0200
11659 @@ -22,19 +22,42 @@
11660  #include <linux/percpu.h>
11661  #include <linux/timer.h>
11662  #include <linux/timerqueue.h>
11663 +#include <linux/wait.h>
11664  
11665  struct hrtimer_clock_base;
11666  struct hrtimer_cpu_base;
11667  
11668  /*
11669   * Mode arguments of xxx_hrtimer functions:
11670 + *
11671 + * HRTIMER_MODE_ABS            - Time value is absolute
11672 + * HRTIMER_MODE_REL            - Time value is relative to now
11673 + * HRTIMER_MODE_PINNED         - Timer is bound to CPU (is only considered
11674 + *                               when starting the timer)
11675 + * HRTIMER_MODE_SOFT           - Timer callback function will be executed in
11676 + *                               soft irq context
11677   */
11678  enum hrtimer_mode {
11679 -       HRTIMER_MODE_ABS = 0x0,         /* Time value is absolute */
11680 -       HRTIMER_MODE_REL = 0x1,         /* Time value is relative to now */
11681 -       HRTIMER_MODE_PINNED = 0x02,     /* Timer is bound to CPU */
11682 -       HRTIMER_MODE_ABS_PINNED = 0x02,
11683 -       HRTIMER_MODE_REL_PINNED = 0x03,
11684 +       HRTIMER_MODE_ABS        = 0x00,
11685 +       HRTIMER_MODE_REL        = 0x01,
11686 +       HRTIMER_MODE_PINNED     = 0x02,
11687 +       HRTIMER_MODE_SOFT       = 0x04,
11688 +       HRTIMER_MODE_HARD       = 0x08,
11689 +
11690 +       HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
11691 +       HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
11692 +
11693 +       HRTIMER_MODE_ABS_SOFT   = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
11694 +       HRTIMER_MODE_REL_SOFT   = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
11695 +
11696 +       HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
11697 +       HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
11698 +
11699 +       HRTIMER_MODE_ABS_HARD   = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
11700 +       HRTIMER_MODE_REL_HARD   = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
11701 +
11702 +       HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
11703 +       HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
11704  };
11705  
11706  /*
11707 @@ -87,6 +110,7 @@
11708   * @base:      pointer to the timer base (per cpu and per clock)
11709   * @state:     state information (See bit values above)
11710   * @is_rel:    Set if the timer was armed relative
11711 + * @is_soft:   Set if hrtimer will be expired in soft interrupt context.
11712   *
11713   * The hrtimer structure must be initialized by hrtimer_init()
11714   */
11715 @@ -97,6 +121,7 @@
11716         struct hrtimer_clock_base       *base;
11717         u8                              state;
11718         u8                              is_rel;
11719 +       u8                              is_soft;
11720  };
11721  
11722  /**
11723 @@ -112,9 +137,9 @@
11724  };
11725  
11726  #ifdef CONFIG_64BIT
11727 -# define HRTIMER_CLOCK_BASE_ALIGN      64
11728 +# define __hrtimer_clock_base_align    ____cacheline_aligned
11729  #else
11730 -# define HRTIMER_CLOCK_BASE_ALIGN      32
11731 +# define __hrtimer_clock_base_align
11732  #endif
11733  
11734  /**
11735 @@ -123,48 +148,57 @@
11736   * @index:             clock type index for per_cpu support when moving a
11737   *                     timer to a base on another cpu.
11738   * @clockid:           clock id for per_cpu support
11739 + * @seq:               seqcount around __run_hrtimer
11740 + * @running:           pointer to the currently running hrtimer
11741   * @active:            red black tree root node for the active timers
11742   * @get_time:          function to retrieve the current time of the clock
11743   * @offset:            offset of this clock to the monotonic base
11744   */
11745  struct hrtimer_clock_base {
11746         struct hrtimer_cpu_base *cpu_base;
11747 -       int                     index;
11748 +       unsigned int            index;
11749         clockid_t               clockid;
11750 +       seqcount_t              seq;
11751 +       struct hrtimer          *running;
11752         struct timerqueue_head  active;
11753         ktime_t                 (*get_time)(void);
11754         ktime_t                 offset;
11755 -} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
11756 +} __hrtimer_clock_base_align;
11757  
11758  enum  hrtimer_base_type {
11759         HRTIMER_BASE_MONOTONIC,
11760         HRTIMER_BASE_REALTIME,
11761         HRTIMER_BASE_BOOTTIME,
11762         HRTIMER_BASE_TAI,
11763 +       HRTIMER_BASE_MONOTONIC_SOFT,
11764 +       HRTIMER_BASE_REALTIME_SOFT,
11765 +       HRTIMER_BASE_BOOTTIME_SOFT,
11766 +       HRTIMER_BASE_TAI_SOFT,
11767         HRTIMER_MAX_CLOCK_BASES,
11768  };
11769  
11770 -/*
11771 +/**
11772   * struct hrtimer_cpu_base - the per cpu clock bases
11773   * @lock:              lock protecting the base and associated clock bases
11774   *                     and timers
11775 - * @seq:               seqcount around __run_hrtimer
11776 - * @running:           pointer to the currently running hrtimer
11777   * @cpu:               cpu number
11778   * @active_bases:      Bitfield to mark bases with active timers
11779   * @clock_was_set_seq: Sequence counter of clock was set events
11780 - * @migration_enabled: The migration of hrtimers to other cpus is enabled
11781 - * @nohz_active:       The nohz functionality is enabled
11782 - * @expires_next:      absolute time of the next event which was scheduled
11783 - *                     via clock_set_next_event()
11784 - * @next_timer:                Pointer to the first expiring timer
11785 - * @in_hrtirq:         hrtimer_interrupt() is currently executing
11786   * @hres_active:       State of high resolution mode
11787 + * @in_hrtirq:         hrtimer_interrupt() is currently executing
11788   * @hang_detected:     The last hrtimer interrupt detected a hang
11789 + * @softirq_activated: displays, if the softirq is raised - update of softirq
11790 + *                     related settings is not required then.
11791   * @nr_events:         Total number of hrtimer interrupt events
11792   * @nr_retries:                Total number of hrtimer interrupt retries
11793   * @nr_hangs:          Total number of hrtimer interrupt hangs
11794   * @max_hang_time:     Maximum time spent in hrtimer_interrupt
11795 + * @expires_next:      absolute time of the next event, is required for remote
11796 + *                     hrtimer enqueue; it is the total first expiry time (hard
11797 + *                     and soft hrtimer are taken into account)
11798 + * @next_timer:                Pointer to the first expiring timer
11799 + * @softirq_expires_next: Time to check, if soft queues needs also to be expired
11800 + * @softirq_next_timer: Pointer to the first expiring softirq based timer
11801   * @clock_base:                array of clock bases for this cpu
11802   *
11803   * Note: next_timer is just an optimization for __remove_hrtimer().
11804 @@ -173,31 +207,31 @@
11805   */
11806  struct hrtimer_cpu_base {
11807         raw_spinlock_t                  lock;
11808 -       seqcount_t                      seq;
11809 -       struct hrtimer                  *running;
11810         unsigned int                    cpu;
11811         unsigned int                    active_bases;
11812         unsigned int                    clock_was_set_seq;
11813 -       bool                            migration_enabled;
11814 -       bool                            nohz_active;
11815 +       unsigned int                    hres_active             : 1,
11816 +                                       in_hrtirq               : 1,
11817 +                                       hang_detected           : 1,
11818 +                                       softirq_activated       : 1;
11819  #ifdef CONFIG_HIGH_RES_TIMERS
11820 -       unsigned int                    in_hrtirq       : 1,
11821 -                                       hres_active     : 1,
11822 -                                       hang_detected   : 1;
11823 -       ktime_t                         expires_next;
11824 -       struct hrtimer                  *next_timer;
11825         unsigned int                    nr_events;
11826 -       unsigned int                    nr_retries;
11827 -       unsigned int                    nr_hangs;
11828 +       unsigned short                  nr_retries;
11829 +       unsigned short                  nr_hangs;
11830         unsigned int                    max_hang_time;
11831  #endif
11832 +       ktime_t                         expires_next;
11833 +       struct hrtimer                  *next_timer;
11834 +       ktime_t                         softirq_expires_next;
11835 +#ifdef CONFIG_PREEMPT_RT_BASE
11836 +       wait_queue_head_t               wait;
11837 +#endif
11838 +       struct hrtimer                  *softirq_next_timer;
11839         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
11840  } ____cacheline_aligned;
11841  
11842  static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
11843  {
11844 -       BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
11845 -
11846         timer->node.expires = time;
11847         timer->_softexpires = time;
11848  }
11849 @@ -266,16 +300,17 @@
11850         return timer->base->get_time();
11851  }
11852  
11853 +static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11854 +{
11855 +       return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
11856 +               timer->base->cpu_base->hres_active : 0;
11857 +}
11858 +
11859  #ifdef CONFIG_HIGH_RES_TIMERS
11860  struct clock_event_device;
11861  
11862  extern void hrtimer_interrupt(struct clock_event_device *dev);
11863  
11864 -static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11865 -{
11866 -       return timer->base->cpu_base->hres_active;
11867 -}
11868 -
11869  /*
11870   * The resolution of the clocks. The resolution value is returned in
11871   * the clock_getres() system call to give application programmers an
11872 @@ -298,11 +333,6 @@
11873  
11874  #define hrtimer_resolution     (unsigned int)LOW_RES_NSEC
11875  
11876 -static inline int hrtimer_is_hres_active(struct hrtimer *timer)
11877 -{
11878 -       return 0;
11879 -}
11880 -
11881  static inline void clock_was_set_delayed(void) { }
11882  
11883  #endif
11884 @@ -344,10 +374,17 @@
11885  /* Initialize timers: */
11886  extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
11887                          enum hrtimer_mode mode);
11888 +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
11889 +                                enum hrtimer_mode mode,
11890 +                                struct task_struct *task);
11891  
11892  #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
11893  extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
11894                                   enum hrtimer_mode mode);
11895 +extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11896 +                                         clockid_t clock_id,
11897 +                                         enum hrtimer_mode mode,
11898 +                                         struct task_struct *task);
11899  
11900  extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
11901  #else
11902 @@ -357,6 +394,15 @@
11903  {
11904         hrtimer_init(timer, which_clock, mode);
11905  }
11906 +
11907 +static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
11908 +                                           clockid_t clock_id,
11909 +                                           enum hrtimer_mode mode,
11910 +                                           struct task_struct *task)
11911 +{
11912 +       hrtimer_init_sleeper(sl, clock_id, mode, task);
11913 +}
11914 +
11915  static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
11916  #endif
11917  
11918 @@ -365,11 +411,12 @@
11919                                    u64 range_ns, const enum hrtimer_mode mode);
11920  
11921  /**
11922 - * hrtimer_start - (re)start an hrtimer on the current CPU
11923 + * hrtimer_start - (re)start an hrtimer
11924   * @timer:     the timer to be added
11925   * @tim:       expiry time
11926 - * @mode:      expiry mode: absolute (HRTIMER_MODE_ABS) or
11927 - *             relative (HRTIMER_MODE_REL)
11928 + * @mode:      timer mode: absolute (HRTIMER_MODE_ABS) or
11929 + *             relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
11930 + *             softirq based mode is considered for debug purpose only!
11931   */
11932  static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
11933                                  const enum hrtimer_mode mode)
11934 @@ -396,6 +443,13 @@
11935         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
11936  }
11937  
11938 +/* Softirq preemption could deadlock timer removal */
11939 +#ifdef CONFIG_PREEMPT_RT_BASE
11940 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
11941 +#else
11942 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
11943 +#endif
11944 +
11945  /* Query timers: */
11946  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
11947  
11948 @@ -420,9 +474,9 @@
11949   * Helper function to check, whether the timer is running the callback
11950   * function
11951   */
11952 -static inline int hrtimer_callback_running(struct hrtimer *timer)
11953 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
11954  {
11955 -       return timer->base->cpu_base->running == timer;
11956 +       return timer->base->running == timer;
11957  }
11958  
11959  /* Forward a hrtimer so it expires after now: */
11960 @@ -458,15 +512,12 @@
11961                               const enum hrtimer_mode mode,
11962                               const clockid_t clockid);
11963  
11964 -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
11965 -                                struct task_struct *tsk);
11966 -
11967  extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
11968                                                 const enum hrtimer_mode mode);
11969  extern int schedule_hrtimeout_range_clock(ktime_t *expires,
11970                                           u64 delta,
11971                                           const enum hrtimer_mode mode,
11972 -                                         int clock);
11973 +                                         clockid_t clock_id);
11974  extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
11975  
11976  /* Soft interrupt function to run the hrtimer queues: */
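
The hrtimer rework turns the mode argument into a bitfield (ABS/REL combined with PINNED, SOFT, HARD) and moves softirq expiry from a property of the clock-base layout into per-timer state (is_soft, the new *_SOFT clock bases, softirq_expires_next/softirq_next_timer in the cpu base). Callers that are happy to be expired from the hrtimer softirq can request it explicitly when arming the timer; a minimal sketch, where the callback and the 10 ms expiry are illustrative:

	static enum hrtimer_restart soft_expiry_sketch(struct hrtimer *t)
	{
		/* runs in softirq context because of HRTIMER_MODE_REL_SOFT below */
		return HRTIMER_NORESTART;
	}

	static void arm_soft_timer_sketch(struct hrtimer *t)
	{
		hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
		t->function = soft_expiry_sketch;
		hrtimer_start(t, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
	}
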
11977 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/idr.h linux-4.14/include/linux/idr.h
11978 --- linux-4.14.orig/include/linux/idr.h 2017-11-12 19:46:13.000000000 +0100
11979 +++ linux-4.14/include/linux/idr.h      2018-09-05 11:05:07.000000000 +0200
11980 @@ -167,10 +167,7 @@
11981   * Each idr_preload() should be matched with an invocation of this
11982   * function.  See idr_preload() for details.
11983   */
11984 -static inline void idr_preload_end(void)
11985 -{
11986 -       preempt_enable();
11987 -}
11988 +void idr_preload_end(void);
11989  
11990  /**
11991   * idr_find - return pointer for given id
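
idr_preload_end() stops being a static inline preempt_enable() and becomes a real function, so the RT build can make it undo whatever idr_preload() does there (presumably a local lock rather than a bare preempt_disable()); that other half lives in the lib/ side of the patch, not in this excerpt. The calling convention for users is unchanged:

	/* Illustrative preload/alloc pattern; my_lock and the id range are assumptions. */
	static int idr_add_sketch(struct idr *idr, spinlock_t *my_lock, void *item)
	{
		int id;

		idr_preload(GFP_KERNEL);	/* may sleep, fills the per-CPU preload */
		spin_lock(my_lock);
		id = idr_alloc(idr, item, 0, 0, GFP_NOWAIT);
		spin_unlock(my_lock);
		idr_preload_end();		/* now out of line, see above */

		return id;
	}
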
11992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/init_task.h linux-4.14/include/linux/init_task.h
11993 --- linux-4.14.orig/include/linux/init_task.h   2017-11-12 19:46:13.000000000 +0100
11994 +++ linux-4.14/include/linux/init_task.h        2018-09-05 11:05:07.000000000 +0200
11995 @@ -163,6 +163,12 @@
11996  # define INIT_PERF_EVENTS(tsk)
11997  #endif
11998  
11999 +#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
12000 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12001 +#else
12002 +# define INIT_TIMER_LIST
12003 +#endif
12004 +
12005  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12006  # define INIT_VTIME(tsk)                                               \
12007         .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),              \
12008 @@ -234,7 +240,8 @@
12009         .static_prio    = MAX_PRIO-20,                                  \
12010         .normal_prio    = MAX_PRIO-20,                                  \
12011         .policy         = SCHED_NORMAL,                                 \
12012 -       .cpus_allowed   = CPU_MASK_ALL,                                 \
12013 +       .cpus_ptr       = &tsk.cpus_mask,                               \
12014 +       .cpus_mask      = CPU_MASK_ALL,                                 \
12015         .nr_cpus_allowed= NR_CPUS,                                      \
12016         .mm             = NULL,                                         \
12017         .active_mm      = &init_mm,                                     \
12018 @@ -276,6 +283,7 @@
12019         INIT_CPU_TIMERS(tsk)                                            \
12020         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12021         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12022 +       INIT_TIMER_LIST                                                 \
12023         .pids = {                                                       \
12024                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12025                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12026 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/interrupt.h linux-4.14/include/linux/interrupt.h
12027 --- linux-4.14.orig/include/linux/interrupt.h   2018-09-05 11:03:22.000000000 +0200
12028 +++ linux-4.14/include/linux/interrupt.h        2018-09-05 11:05:07.000000000 +0200
12029 @@ -15,6 +15,7 @@
12030  #include <linux/hrtimer.h>
12031  #include <linux/kref.h>
12032  #include <linux/workqueue.h>
12033 +#include <linux/swork.h>
12034  
12035  #include <linux/atomic.h>
12036  #include <asm/ptrace.h>
12037 @@ -63,6 +64,7 @@
12038   *                interrupt handler after suspending interrupts. For system
12039   *                wakeup devices users need to implement wakeup detection in
12040   *                their interrupt handlers.
12041 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12042   */
12043  #define IRQF_SHARED            0x00000080
12044  #define IRQF_PROBE_SHARED      0x00000100
12045 @@ -76,6 +78,7 @@
12046  #define IRQF_NO_THREAD         0x00010000
12047  #define IRQF_EARLY_RESUME      0x00020000
12048  #define IRQF_COND_SUSPEND      0x00040000
12049 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12050  
12051  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12052  
12053 @@ -207,7 +210,7 @@
12054  #ifdef CONFIG_LOCKDEP
12055  # define local_irq_enable_in_hardirq() do { } while (0)
12056  #else
12057 -# define local_irq_enable_in_hardirq() local_irq_enable()
12058 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12059  #endif
12060  
12061  extern void disable_irq_nosync(unsigned int irq);
12062 @@ -227,6 +230,7 @@
12063   * struct irq_affinity_notify - context for notification of IRQ affinity changes
12064   * @irq:               Interrupt to which notification applies
12065   * @kref:              Reference count, for internal use
12066 + * @swork:             Swork item, for internal use
12067   * @work:              Work item, for internal use
12068   * @notify:            Function to be called on change.  This will be
12069   *                     called in process context.
12070 @@ -238,7 +242,11 @@
12071  struct irq_affinity_notify {
12072         unsigned int irq;
12073         struct kref kref;
12074 +#ifdef CONFIG_PREEMPT_RT_BASE
12075 +       struct swork_event swork;
12076 +#else
12077         struct work_struct work;
12078 +#endif
12079         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12080         void (*release)(struct kref *ref);
12081  };
12082 @@ -429,9 +437,13 @@
12083                                  bool state);
12084  
12085  #ifdef CONFIG_IRQ_FORCED_THREADING
12086 +# ifndef CONFIG_PREEMPT_RT_BASE
12087  extern bool force_irqthreads;
12088 +# else
12089 +#  define force_irqthreads     (true)
12090 +# endif
12091  #else
12092 -#define force_irqthreads       (0)
12093 +#define force_irqthreads       (false)
12094  #endif
12095  
12096  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12097 @@ -488,9 +500,10 @@
12098         void    (*action)(struct softirq_action *);
12099  };
12100  
12101 +#ifndef CONFIG_PREEMPT_RT_FULL
12102  asmlinkage void do_softirq(void);
12103  asmlinkage void __do_softirq(void);
12104 -
12105 +static inline void thread_do_softirq(void) { do_softirq(); }
12106  #ifdef __ARCH_HAS_DO_SOFTIRQ
12107  void do_softirq_own_stack(void);
12108  #else
12109 @@ -499,13 +512,25 @@
12110         __do_softirq();
12111  }
12112  #endif
12113 +#else
12114 +extern void thread_do_softirq(void);
12115 +#endif
12116  
12117  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12118  extern void softirq_init(void);
12119  extern void __raise_softirq_irqoff(unsigned int nr);
12120 +#ifdef CONFIG_PREEMPT_RT_FULL
12121 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12122 +#else
12123 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12124 +{
12125 +       __raise_softirq_irqoff(nr);
12126 +}
12127 +#endif
12128  
12129  extern void raise_softirq_irqoff(unsigned int nr);
12130  extern void raise_softirq(unsigned int nr);
12131 +extern void softirq_check_pending_idle(void);
12132  
12133  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12134  
12135 @@ -527,8 +552,9 @@
12136       to be executed on some cpu at least once after this.
12137     * If the tasklet is already scheduled, but its execution is still not
12138       started, it will be executed only once.
12139 -   * If this tasklet is already running on another CPU (or schedule is called
12140 -     from tasklet itself), it is rescheduled for later.
12141 +   * If this tasklet is already running on another CPU, it is rescheduled
12142 +     for later.
12143 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
12144     * Tasklet is strictly serialized wrt itself, but not
12145       wrt another tasklets. If client needs some intertask synchronization,
12146       he makes it with spinlocks.
12147 @@ -553,27 +579,36 @@
12148  enum
12149  {
12150         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12151 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12152 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12153 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12154  };
12155  
12156 -#ifdef CONFIG_SMP
12157 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12158 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12159 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12160 +
12161 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12162  static inline int tasklet_trylock(struct tasklet_struct *t)
12163  {
12164         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12165  }
12166  
12167 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12168 +{
12169 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12170 +}
12171 +
12172  static inline void tasklet_unlock(struct tasklet_struct *t)
12173  {
12174         smp_mb__before_atomic();
12175         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12176  }
12177  
12178 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12179 -{
12180 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12181 -}
12182 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12183 +
12184  #else
12185  #define tasklet_trylock(t) 1
12186 +#define tasklet_tryunlock(t)   1
12187  #define tasklet_unlock_wait(t) do { } while (0)
12188  #define tasklet_unlock(t) do { } while (0)
12189  #endif
12190 @@ -607,41 +642,17 @@
12191         smp_mb();
12192  }
12193  
12194 -static inline void tasklet_enable(struct tasklet_struct *t)
12195 -{
12196 -       smp_mb__before_atomic();
12197 -       atomic_dec(&t->count);
12198 -}
12199 -
12200 +extern void tasklet_enable(struct tasklet_struct *t);
12201  extern void tasklet_kill(struct tasklet_struct *t);
12202  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12203  extern void tasklet_init(struct tasklet_struct *t,
12204                          void (*func)(unsigned long), unsigned long data);
12205  
12206 -struct tasklet_hrtimer {
12207 -       struct hrtimer          timer;
12208 -       struct tasklet_struct   tasklet;
12209 -       enum hrtimer_restart    (*function)(struct hrtimer *);
12210 -};
12211 -
12212 -extern void
12213 -tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
12214 -                    enum hrtimer_restart (*function)(struct hrtimer *),
12215 -                    clockid_t which_clock, enum hrtimer_mode mode);
12216 -
12217 -static inline
12218 -void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
12219 -                          const enum hrtimer_mode mode)
12220 -{
12221 -       hrtimer_start(&ttimer->timer, time, mode);
12222 -}
12223 -
12224 -static inline
12225 -void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12226 -{
12227 -       hrtimer_cancel(&ttimer->timer);
12228 -       tasklet_kill(&ttimer->tasklet);
12229 -}
12230 +#ifdef CONFIG_PREEMPT_RT_FULL
12231 +extern void softirq_early_init(void);
12232 +#else
12233 +static inline void softirq_early_init(void) { }
12234 +#endif
12235  
12236  /*
12237   * Autoprobing for irqs:
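
[Usage note] Besides the IRQF_NO_SOFTIRQ_CALL flag and the softirq/swork plumbing, the interrupt.h hunks tighten the tasklet rules quoted above: a tasklet running on another CPU is rescheduled for later, and a tasklet must not reschedule itself from its own handler. The driver-visible tasklet API is untouched; only the locking and state handling behind it change. A short sketch of that unchanged API, with hypothetical names:

#include <linux/interrupt.h>

static struct tasklet_struct my_tasklet;        /* placeholder */

static void my_tasklet_fn(unsigned long data)
{
        /*
         * Deferred work runs here. Per the comment above, do not call
         * tasklet_schedule(&my_tasklet) from this function on this kernel.
         */
}

static irqreturn_t my_irq_handler(int irq, void *cookie)
{
        tasklet_schedule(&my_tasklet);          /* guaranteed to run at least once */
        return IRQ_HANDLED;
}

static int my_probe(int irq)
{
        tasklet_init(&my_tasklet, my_tasklet_fn, 0);
        return request_irq(irq, my_irq_handler, 0, "my_dev", NULL);
}
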
12238 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqdesc.h linux-4.14/include/linux/irqdesc.h
12239 --- linux-4.14.orig/include/linux/irqdesc.h     2017-11-12 19:46:13.000000000 +0100
12240 +++ linux-4.14/include/linux/irqdesc.h  2018-09-05 11:05:07.000000000 +0200
12241 @@ -70,6 +70,7 @@
12242         unsigned int            irqs_unhandled;
12243         atomic_t                threads_handled;
12244         int                     threads_handled_last;
12245 +       u64                     random_ip;
12246         raw_spinlock_t          lock;
12247         struct cpumask          *percpu_enabled;
12248         const struct cpumask    *percpu_affinity;
12249 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irqflags.h linux-4.14/include/linux/irqflags.h
12250 --- linux-4.14.orig/include/linux/irqflags.h    2017-11-12 19:46:13.000000000 +0100
12251 +++ linux-4.14/include/linux/irqflags.h 2018-09-05 11:05:07.000000000 +0200
12252 @@ -34,16 +34,6 @@
12253         current->hardirq_context--;             \
12254         crossrelease_hist_end(XHLOCK_HARD);     \
12255  } while (0)
12256 -# define lockdep_softirq_enter()               \
12257 -do {                                           \
12258 -       current->softirq_context++;             \
12259 -       crossrelease_hist_start(XHLOCK_SOFT);   \
12260 -} while (0)
12261 -# define lockdep_softirq_exit()                        \
12262 -do {                                           \
12263 -       current->softirq_context--;             \
12264 -       crossrelease_hist_end(XHLOCK_SOFT);     \
12265 -} while (0)
12266  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12267  #else
12268  # define trace_hardirqs_on()           do { } while (0)
12269 @@ -56,9 +46,23 @@
12270  # define trace_softirqs_enabled(p)     0
12271  # define trace_hardirq_enter()         do { } while (0)
12272  # define trace_hardirq_exit()          do { } while (0)
12273 +# define INIT_TRACE_IRQFLAGS
12274 +#endif
12275 +
12276 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12277 +# define lockdep_softirq_enter()               \
12278 +do {                                           \
12279 +       current->softirq_context++;             \
12280 +       crossrelease_hist_start(XHLOCK_SOFT);   \
12281 +} while (0)
12282 +# define lockdep_softirq_exit()                        \
12283 +do {                                           \
12284 +       current->softirq_context--;             \
12285 +       crossrelease_hist_end(XHLOCK_SOFT);     \
12286 +} while (0)
12287 +#else
12288  # define lockdep_softirq_enter()       do { } while (0)
12289  # define lockdep_softirq_exit()                do { } while (0)
12290 -# define INIT_TRACE_IRQFLAGS
12291  #endif
12292  
12293  #if defined(CONFIG_IRQSOFF_TRACER) || \
12294 @@ -165,4 +169,23 @@
12295  
12296  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12297  
12298 +/*
12299 + * local_irq* variants depending on RT/!RT
12300 + */
12301 +#ifdef CONFIG_PREEMPT_RT_FULL
12302 +# define local_irq_disable_nort()      do { } while (0)
12303 +# define local_irq_enable_nort()       do { } while (0)
12304 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12305 +# define local_irq_restore_nort(flags) (void)(flags)
12306 +# define local_irq_disable_rt()                local_irq_disable()
12307 +# define local_irq_enable_rt()         local_irq_enable()
12308 +#else
12309 +# define local_irq_disable_nort()      local_irq_disable()
12310 +# define local_irq_enable_nort()       local_irq_enable()
12311 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12312 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12313 +# define local_irq_disable_rt()                do { } while (0)
12314 +# define local_irq_enable_rt()         do { } while (0)
12315 +#endif
12316 +
12317  #endif
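
[Usage note] The local_irq*_nort()/_rt() wrappers added above let shared code express "interrupts off only on a non-RT kernel": on PREEMPT_RT_FULL the _nort variants degrade to flag bookkeeping, on mainline they are the ordinary local_irq_*() operations. A sketch of the intended call pattern, using only the macros defined in this hunk (the statistics structure and function are placeholders). Sites converted to _nort in this series are those where RT provides exclusion by other means (threaded handlers, local locks), so dropping the hard-IRQ disable there is safe.

struct my_stats {                       /* hypothetical */
        unsigned long packets;
        unsigned long bytes;
};

static void my_account(struct my_stats *s, unsigned int len)
{
        unsigned long flags;

        /* !RT: hard IRQs off; RT: flags saved only, caller's locking protects s */
        local_irq_save_nort(flags);
        s->packets++;
        s->bytes += len;
        local_irq_restore_nort(flags);
}
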
12318 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq.h linux-4.14/include/linux/irq.h
12319 --- linux-4.14.orig/include/linux/irq.h 2018-09-05 11:03:22.000000000 +0200
12320 +++ linux-4.14/include/linux/irq.h      2018-09-05 11:05:07.000000000 +0200
12321 @@ -74,6 +74,7 @@
12322   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12323   *                               it from the spurious interrupt detection
12324   *                               mechanism and from core side polling.
12325 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12326   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12327   */
12328  enum {
12329 @@ -101,13 +102,14 @@
12330         IRQ_PER_CPU_DEVID       = (1 << 17),
12331         IRQ_IS_POLLED           = (1 << 18),
12332         IRQ_DISABLE_UNLAZY      = (1 << 19),
12333 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12334  };
12335  
12336  #define IRQF_MODIFY_MASK       \
12337         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12338          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12339          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12340 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12341 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12342  
12343  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12344  
12345 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/irq_work.h linux-4.14/include/linux/irq_work.h
12346 --- linux-4.14.orig/include/linux/irq_work.h    2017-11-12 19:46:13.000000000 +0100
12347 +++ linux-4.14/include/linux/irq_work.h 2018-09-05 11:05:07.000000000 +0200
12348 @@ -17,6 +17,7 @@
12349  #define IRQ_WORK_BUSY          2UL
12350  #define IRQ_WORK_FLAGS         3UL
12351  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12352 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12353  
12354  struct irq_work {
12355         unsigned long flags;
12356 @@ -52,4 +53,10 @@
12357  static inline void irq_work_run(void) { }
12358  #endif
12359  
12360 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12361 +void irq_work_tick_soft(void);
12362 +#else
12363 +static inline void irq_work_tick_soft(void) { }
12364 +#endif
12365 +
12366  #endif /* _LINUX_IRQ_WORK_H */
12367 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/jbd2.h linux-4.14/include/linux/jbd2.h
12368 --- linux-4.14.orig/include/linux/jbd2.h        2018-09-05 11:03:22.000000000 +0200
12369 +++ linux-4.14/include/linux/jbd2.h     2018-09-05 11:05:07.000000000 +0200
12370 @@ -347,32 +347,56 @@
12371  
12372  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12373  {
12374 +#ifndef CONFIG_PREEMPT_RT_BASE
12375         bit_spin_lock(BH_State, &bh->b_state);
12376 +#else
12377 +       spin_lock(&bh->b_state_lock);
12378 +#endif
12379  }
12380  
12381  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12382  {
12383 +#ifndef CONFIG_PREEMPT_RT_BASE
12384         return bit_spin_trylock(BH_State, &bh->b_state);
12385 +#else
12386 +       return spin_trylock(&bh->b_state_lock);
12387 +#endif
12388  }
12389  
12390  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12391  {
12392 +#ifndef CONFIG_PREEMPT_RT_BASE
12393         return bit_spin_is_locked(BH_State, &bh->b_state);
12394 +#else
12395 +       return spin_is_locked(&bh->b_state_lock);
12396 +#endif
12397  }
12398  
12399  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12400  {
12401 +#ifndef CONFIG_PREEMPT_RT_BASE
12402         bit_spin_unlock(BH_State, &bh->b_state);
12403 +#else
12404 +       spin_unlock(&bh->b_state_lock);
12405 +#endif
12406  }
12407  
12408  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12409  {
12410 +#ifndef CONFIG_PREEMPT_RT_BASE
12411         bit_spin_lock(BH_JournalHead, &bh->b_state);
12412 +#else
12413 +       spin_lock(&bh->b_journal_head_lock);
12414 +#endif
12415  }
12416  
12417  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12418  {
12419 +#ifndef CONFIG_PREEMPT_RT_BASE
12420         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12421 +#else
12422 +       spin_unlock(&bh->b_journal_head_lock);
12423 +#endif
12424  }
12425  
12426  #define J_ASSERT(assert)       BUG_ON(!(assert))
12427 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kdb.h linux-4.14/include/linux/kdb.h
12428 --- linux-4.14.orig/include/linux/kdb.h 2017-11-12 19:46:13.000000000 +0100
12429 +++ linux-4.14/include/linux/kdb.h      2018-09-05 11:05:07.000000000 +0200
12430 @@ -167,6 +167,7 @@
12431  extern __printf(1, 2) int kdb_printf(const char *, ...);
12432  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12433  
12434 +#define in_kdb_printk()        (kdb_trap_printk)
12435  extern void kdb_init(int level);
12436  
12437  /* Access to kdb specific polling devices */
12438 @@ -201,6 +202,7 @@
12439  extern int kdb_unregister(char *);
12440  #else /* ! CONFIG_KGDB_KDB */
12441  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12442 +#define in_kdb_printk() (0)
12443  static inline void kdb_init(int level) {}
12444  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12445                                char *help, short minlen) { return 0; }
12446 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/kernel.h linux-4.14/include/linux/kernel.h
12447 --- linux-4.14.orig/include/linux/kernel.h      2017-11-12 19:46:13.000000000 +0100
12448 +++ linux-4.14/include/linux/kernel.h   2018-09-05 11:05:07.000000000 +0200
12449 @@ -225,6 +225,9 @@
12450   */
12451  # define might_sleep() \
12452         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12453 +
12454 +# define might_sleep_no_state_check() \
12455 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12456  # define sched_annotate_sleep()        (current->task_state_change = 0)
12457  #else
12458    static inline void ___might_sleep(const char *file, int line,
12459 @@ -232,6 +235,7 @@
12460    static inline void __might_sleep(const char *file, int line,
12461                                    int preempt_offset) { }
12462  # define might_sleep() do { might_resched(); } while (0)
12463 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12464  # define sched_annotate_sleep() do { } while (0)
12465  #endif
12466  
12467 @@ -531,6 +535,7 @@
12468         SYSTEM_HALT,
12469         SYSTEM_POWER_OFF,
12470         SYSTEM_RESTART,
12471 +       SYSTEM_SUSPEND,
12472  } system_state;
12473  
12474  #define TAINT_PROPRIETARY_MODULE       0
12475 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/list_bl.h linux-4.14/include/linux/list_bl.h
12476 --- linux-4.14.orig/include/linux/list_bl.h     2017-11-12 19:46:13.000000000 +0100
12477 +++ linux-4.14/include/linux/list_bl.h  2018-09-05 11:05:07.000000000 +0200
12478 @@ -3,6 +3,7 @@
12479  #define _LINUX_LIST_BL_H
12480  
12481  #include <linux/list.h>
12482 +#include <linux/spinlock.h>
12483  #include <linux/bit_spinlock.h>
12484  
12485  /*
12486 @@ -33,13 +34,24 @@
12487  
12488  struct hlist_bl_head {
12489         struct hlist_bl_node *first;
12490 +#ifdef CONFIG_PREEMPT_RT_BASE
12491 +       raw_spinlock_t lock;
12492 +#endif
12493  };
12494  
12495  struct hlist_bl_node {
12496         struct hlist_bl_node *next, **pprev;
12497  };
12498 -#define INIT_HLIST_BL_HEAD(ptr) \
12499 -       ((ptr)->first = NULL)
12500 +
12501 +#ifdef CONFIG_PREEMPT_RT_BASE
12502 +#define INIT_HLIST_BL_HEAD(h)          \
12503 +do {                                   \
12504 +       (h)->first = NULL;              \
12505 +       raw_spin_lock_init(&(h)->lock); \
12506 +} while (0)
12507 +#else
12508 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12509 +#endif
12510  
12511  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12512  {
12513 @@ -119,12 +131,26 @@
12514  
12515  static inline void hlist_bl_lock(struct hlist_bl_head *b)
12516  {
12517 +#ifndef CONFIG_PREEMPT_RT_BASE
12518         bit_spin_lock(0, (unsigned long *)b);
12519 +#else
12520 +       raw_spin_lock(&b->lock);
12521 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12522 +       __set_bit(0, (unsigned long *)b);
12523 +#endif
12524 +#endif
12525  }
12526  
12527  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12528  {
12529 +#ifndef CONFIG_PREEMPT_RT_BASE
12530         __bit_spin_unlock(0, (unsigned long *)b);
12531 +#else
12532 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12533 +       __clear_bit(0, (unsigned long *)b);
12534 +#endif
12535 +       raw_spin_unlock(&b->lock);
12536 +#endif
12537  }
12538  
12539  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
12540 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/locallock.h linux-4.14/include/linux/locallock.h
12541 --- linux-4.14.orig/include/linux/locallock.h   1970-01-01 01:00:00.000000000 +0100
12542 +++ linux-4.14/include/linux/locallock.h        2018-09-05 11:05:07.000000000 +0200
12543 @@ -0,0 +1,271 @@
12544 +#ifndef _LINUX_LOCALLOCK_H
12545 +#define _LINUX_LOCALLOCK_H
12546 +
12547 +#include <linux/percpu.h>
12548 +#include <linux/spinlock.h>
12549 +
12550 +#ifdef CONFIG_PREEMPT_RT_BASE
12551 +
12552 +#ifdef CONFIG_DEBUG_SPINLOCK
12553 +# define LL_WARN(cond) WARN_ON(cond)
12554 +#else
12555 +# define LL_WARN(cond) do { } while (0)
12556 +#endif
12557 +
12558 +/*
12559 + * per cpu lock based substitute for local_irq_*()
12560 + */
12561 +struct local_irq_lock {
12562 +       spinlock_t              lock;
12563 +       struct task_struct      *owner;
12564 +       int                     nestcnt;
12565 +       unsigned long           flags;
12566 +};
12567 +
12568 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
12569 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
12570 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12571 +
12572 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
12573 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
12574 +
12575 +#define local_irq_lock_init(lvar)                                      \
12576 +       do {                                                            \
12577 +               int __cpu;                                              \
12578 +               for_each_possible_cpu(__cpu)                            \
12579 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
12580 +       } while (0)
12581 +
12582 +static inline void __local_lock(struct local_irq_lock *lv)
12583 +{
12584 +       if (lv->owner != current) {
12585 +               spin_lock(&lv->lock);
12586 +               LL_WARN(lv->owner);
12587 +               LL_WARN(lv->nestcnt);
12588 +               lv->owner = current;
12589 +       }
12590 +       lv->nestcnt++;
12591 +}
12592 +
12593 +#define local_lock(lvar)                                       \
12594 +       do { __local_lock(&get_local_var(lvar)); } while (0)
12595 +
12596 +#define local_lock_on(lvar, cpu)                               \
12597 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12598 +
12599 +static inline int __local_trylock(struct local_irq_lock *lv)
12600 +{
12601 +       if (lv->owner != current && spin_trylock(&lv->lock)) {
12602 +               LL_WARN(lv->owner);
12603 +               LL_WARN(lv->nestcnt);
12604 +               lv->owner = current;
12605 +               lv->nestcnt = 1;
12606 +               return 1;
12607 +       } else if (lv->owner == current) {
12608 +               lv->nestcnt++;
12609 +               return 1;
12610 +       }
12611 +       return 0;
12612 +}
12613 +
12614 +#define local_trylock(lvar)                                            \
12615 +       ({                                                              \
12616 +               int __locked;                                           \
12617 +               __locked = __local_trylock(&get_local_var(lvar));       \
12618 +               if (!__locked)                                          \
12619 +                       put_local_var(lvar);                            \
12620 +               __locked;                                               \
12621 +       })
12622 +
12623 +static inline void __local_unlock(struct local_irq_lock *lv)
12624 +{
12625 +       LL_WARN(lv->nestcnt == 0);
12626 +       LL_WARN(lv->owner != current);
12627 +       if (--lv->nestcnt)
12628 +               return;
12629 +
12630 +       lv->owner = NULL;
12631 +       spin_unlock(&lv->lock);
12632 +}
12633 +
12634 +#define local_unlock(lvar)                                     \
12635 +       do {                                                    \
12636 +               __local_unlock(this_cpu_ptr(&lvar));            \
12637 +               put_local_var(lvar);                            \
12638 +       } while (0)
12639 +
12640 +#define local_unlock_on(lvar, cpu)                       \
12641 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12642 +
12643 +static inline void __local_lock_irq(struct local_irq_lock *lv)
12644 +{
12645 +       spin_lock_irqsave(&lv->lock, lv->flags);
12646 +       LL_WARN(lv->owner);
12647 +       LL_WARN(lv->nestcnt);
12648 +       lv->owner = current;
12649 +       lv->nestcnt = 1;
12650 +}
12651 +
12652 +#define local_lock_irq(lvar)                                           \
12653 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12654 +
12655 +#define local_lock_irq_on(lvar, cpu)                                   \
12656 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12657 +
12658 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
12659 +{
12660 +       LL_WARN(!lv->nestcnt);
12661 +       LL_WARN(lv->owner != current);
12662 +       lv->owner = NULL;
12663 +       lv->nestcnt = 0;
12664 +       spin_unlock_irq(&lv->lock);
12665 +}
12666 +
12667 +#define local_unlock_irq(lvar)                                         \
12668 +       do {                                                            \
12669 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
12670 +               put_local_var(lvar);                                    \
12671 +       } while (0)
12672 +
12673 +#define local_unlock_irq_on(lvar, cpu)                                 \
12674 +       do {                                                            \
12675 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
12676 +       } while (0)
12677 +
12678 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12679 +{
12680 +       if (lv->owner != current) {
12681 +               __local_lock_irq(lv);
12682 +               return 0;
12683 +       } else {
12684 +               lv->nestcnt++;
12685 +               return 1;
12686 +       }
12687 +}
12688 +
12689 +#define local_lock_irqsave(lvar, _flags)                               \
12690 +       do {                                                            \
12691 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
12692 +                       put_local_var(lvar);                            \
12693 +               _flags = __this_cpu_read(lvar.flags);                   \
12694 +       } while (0)
12695 +
12696 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
12697 +       do {                                                            \
12698 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
12699 +               _flags = per_cpu(lvar, cpu).flags;                      \
12700 +       } while (0)
12701 +
12702 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
12703 +                                           unsigned long flags)
12704 +{
12705 +       LL_WARN(!lv->nestcnt);
12706 +       LL_WARN(lv->owner != current);
12707 +       if (--lv->nestcnt)
12708 +               return 0;
12709 +
12710 +       lv->owner = NULL;
12711 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
12712 +       return 1;
12713 +}
12714 +
12715 +#define local_unlock_irqrestore(lvar, flags)                           \
12716 +       do {                                                            \
12717 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
12718 +                       put_local_var(lvar);                            \
12719 +       } while (0)
12720 +
12721 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
12722 +       do {                                                            \
12723 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
12724 +       } while (0)
12725 +
12726 +#define local_spin_trylock_irq(lvar, lock)                             \
12727 +       ({                                                              \
12728 +               int __locked;                                           \
12729 +               local_lock_irq(lvar);                                   \
12730 +               __locked = spin_trylock(lock);                          \
12731 +               if (!__locked)                                          \
12732 +                       local_unlock_irq(lvar);                         \
12733 +               __locked;                                               \
12734 +       })
12735 +
12736 +#define local_spin_lock_irq(lvar, lock)                                        \
12737 +       do {                                                            \
12738 +               local_lock_irq(lvar);                                   \
12739 +               spin_lock(lock);                                        \
12740 +       } while (0)
12741 +
12742 +#define local_spin_unlock_irq(lvar, lock)                              \
12743 +       do {                                                            \
12744 +               spin_unlock(lock);                                      \
12745 +               local_unlock_irq(lvar);                                 \
12746 +       } while (0)
12747 +
12748 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
12749 +       do {                                                            \
12750 +               local_lock_irqsave(lvar, flags);                        \
12751 +               spin_lock(lock);                                        \
12752 +       } while (0)
12753 +
12754 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
12755 +       do {                                                            \
12756 +               spin_unlock(lock);                                      \
12757 +               local_unlock_irqrestore(lvar, flags);                   \
12758 +       } while (0)
12759 +
12760 +#define get_locked_var(lvar, var)                                      \
12761 +       (*({                                                            \
12762 +               local_lock(lvar);                                       \
12763 +               this_cpu_ptr(&var);                                     \
12764 +       }))
12765 +
12766 +#define put_locked_var(lvar, var)      local_unlock(lvar);
12767 +
12768 +#define local_lock_cpu(lvar)                                           \
12769 +       ({                                                              \
12770 +               local_lock(lvar);                                       \
12771 +               smp_processor_id();                                     \
12772 +       })
12773 +
12774 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
12775 +
12776 +#else /* PREEMPT_RT_BASE */
12777 +
12778 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
12779 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
12780 +
12781 +static inline void local_irq_lock_init(int lvar) { }
12782 +
12783 +#define local_trylock(lvar)                                    \
12784 +       ({                                                      \
12785 +               preempt_disable();                              \
12786 +               1;                                              \
12787 +       })
12788 +
12789 +#define local_lock(lvar)                       preempt_disable()
12790 +#define local_unlock(lvar)                     preempt_enable()
12791 +#define local_lock_irq(lvar)                   local_irq_disable()
12792 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
12793 +#define local_unlock_irq(lvar)                 local_irq_enable()
12794 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
12795 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
12796 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
12797 +
12798 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
12799 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
12800 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
12801 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
12802 +       spin_lock_irqsave(lock, flags)
12803 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
12804 +       spin_unlock_irqrestore(lock, flags)
12805 +
12806 +#define get_locked_var(lvar, var)              get_cpu_var(var)
12807 +#define put_locked_var(lvar, var)              put_cpu_var(var)
12808 +
12809 +#define local_lock_cpu(lvar)                   get_cpu()
12810 +#define local_unlock_cpu(lvar)                 put_cpu()
12811 +
12812 +#endif
12813 +
12814 +#endif
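
[Usage note] locallock.h is the central new primitive of the series: a per-CPU lock object that substitutes for local_irq_*()/preempt_disable() critical sections. On PREEMPT_RT_BASE it is a real spinlock_t with owner and nesting tracking, so the protected section stays preemptible; without RT the macros collapse to exactly the classic primitives. A minimal sketch using only the macros defined above (the per-CPU list and its caller are placeholders):

#include <linux/list.h>
#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(my_list_lock);     /* placeholder */
static DEFINE_PER_CPU(struct list_head, my_list);

static void my_queue_item(struct list_head *item)
{
        unsigned long flags;

        /* !RT: local_irq_save();  RT: per-cpu spinlock + migrate_disable() */
        local_lock_irqsave(my_list_lock, flags);
        list_add_tail(item, this_cpu_ptr(&my_list));
        local_unlock_irqrestore(my_list_lock, flags);
}

Because the lock names the data instead of relying on implicit per-CPU exclusion, the same code is correct on both kernels.
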
12815 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mm_types.h linux-4.14/include/linux/mm_types.h
12816 --- linux-4.14.orig/include/linux/mm_types.h    2018-09-05 11:03:28.000000000 +0200
12817 +++ linux-4.14/include/linux/mm_types.h 2018-09-05 11:05:07.000000000 +0200
12818 @@ -12,6 +12,7 @@
12819  #include <linux/completion.h>
12820  #include <linux/cpumask.h>
12821  #include <linux/uprobes.h>
12822 +#include <linux/rcupdate.h>
12823  #include <linux/page-flags-layout.h>
12824  #include <linux/workqueue.h>
12825  
12826 @@ -498,6 +499,9 @@
12827         bool tlb_flush_batched;
12828  #endif
12829         struct uprobes_state uprobes_state;
12830 +#ifdef CONFIG_PREEMPT_RT_BASE
12831 +       struct rcu_head delayed_drop;
12832 +#endif
12833  #ifdef CONFIG_HUGETLB_PAGE
12834         atomic_long_t hugetlb_usage;
12835  #endif
12836 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex.h linux-4.14/include/linux/mutex.h
12837 --- linux-4.14.orig/include/linux/mutex.h       2017-11-12 19:46:13.000000000 +0100
12838 +++ linux-4.14/include/linux/mutex.h    2018-09-05 11:05:07.000000000 +0200
12839 @@ -23,6 +23,17 @@
12840  
12841  struct ww_acquire_ctx;
12842  
12843 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12844 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12845 +               , .dep_map = { .name = #lockname }
12846 +#else
12847 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12848 +#endif
12849 +
12850 +#ifdef CONFIG_PREEMPT_RT_FULL
12851 +# include <linux/mutex_rt.h>
12852 +#else
12853 +
12854  /*
12855   * Simple, straightforward mutexes with strict semantics:
12856   *
12857 @@ -114,13 +125,6 @@
12858         __mutex_init((mutex), #mutex, &__key);                          \
12859  } while (0)
12860  
12861 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
12862 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
12863 -               , .dep_map = { .name = #lockname }
12864 -#else
12865 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
12866 -#endif
12867 -
12868  #define __MUTEX_INITIALIZER(lockname) \
12869                 { .owner = ATOMIC_LONG_INIT(0) \
12870                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
12871 @@ -228,4 +232,6 @@
12872         return mutex_trylock(lock);
12873  }
12874  
12875 +#endif /* !PREEMPT_RT_FULL */
12876 +
12877  #endif /* __LINUX_MUTEX_H */
12878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/mutex_rt.h linux-4.14/include/linux/mutex_rt.h
12879 --- linux-4.14.orig/include/linux/mutex_rt.h    1970-01-01 01:00:00.000000000 +0100
12880 +++ linux-4.14/include/linux/mutex_rt.h 2018-09-05 11:05:07.000000000 +0200
12881 @@ -0,0 +1,130 @@
12882 +#ifndef __LINUX_MUTEX_RT_H
12883 +#define __LINUX_MUTEX_RT_H
12884 +
12885 +#ifndef __LINUX_MUTEX_H
12886 +#error "Please include mutex.h"
12887 +#endif
12888 +
12889 +#include <linux/rtmutex.h>
12890 +
12891 +/* FIXME: Just for __lockfunc */
12892 +#include <linux/spinlock.h>
12893 +
12894 +struct mutex {
12895 +       struct rt_mutex         lock;
12896 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12897 +       struct lockdep_map      dep_map;
12898 +#endif
12899 +};
12900 +
12901 +#define __MUTEX_INITIALIZER(mutexname)                                 \
12902 +       {                                                               \
12903 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
12904 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
12905 +       }
12906 +
12907 +#define DEFINE_MUTEX(mutexname)                                                \
12908 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
12909 +
12910 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
12911 +extern void __lockfunc _mutex_lock(struct mutex *lock);
12912 +extern void __lockfunc _mutex_lock_io(struct mutex *lock);
12913 +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
12914 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
12915 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
12916 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
12917 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
12918 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
12919 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
12920 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
12921 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
12922 +
12923 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
12924 +#define mutex_lock(l)                  _mutex_lock(l)
12925 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
12926 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
12927 +#define mutex_trylock(l)               _mutex_trylock(l)
12928 +#define mutex_unlock(l)                        _mutex_unlock(l)
12929 +#define mutex_lock_io(l)               _mutex_lock_io(l);
12930 +
12931 +#define __mutex_owner(l)               ((l)->lock.owner)
12932 +
12933 +#ifdef CONFIG_DEBUG_MUTEXES
12934 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
12935 +#else
12936 +static inline void mutex_destroy(struct mutex *lock) {}
12937 +#endif
12938 +
12939 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12940 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
12941 +# define mutex_lock_interruptible_nested(l, s) \
12942 +                                       _mutex_lock_interruptible_nested(l, s)
12943 +# define mutex_lock_killable_nested(l, s) \
12944 +                                       _mutex_lock_killable_nested(l, s)
12945 +# define mutex_lock_io_nested(l, s)    _mutex_lock_io_nested(l, s)
12946 +
12947 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
12948 +do {                                                                   \
12949 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
12950 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
12951 +} while (0)
12952 +
12953 +#else
12954 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
12955 +# define mutex_lock_interruptible_nested(l, s) \
12956 +                                       _mutex_lock_interruptible(l)
12957 +# define mutex_lock_killable_nested(l, s) \
12958 +                                       _mutex_lock_killable(l)
12959 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
12960 +# define mutex_lock_io_nested(l, s)    _mutex_lock_io(l)
12961 +#endif
12962 +
12963 +# define mutex_init(mutex)                             \
12964 +do {                                                   \
12965 +       static struct lock_class_key __key;             \
12966 +                                                       \
12967 +       rt_mutex_init(&(mutex)->lock);                  \
12968 +       __mutex_do_init((mutex), #mutex, &__key);       \
12969 +} while (0)
12970 +
12971 +# define __mutex_init(mutex, name, key)                        \
12972 +do {                                                   \
12973 +       rt_mutex_init(&(mutex)->lock);                  \
12974 +       __mutex_do_init((mutex), name, key);            \
12975 +} while (0)
12976 +
12977 +/**
12978 + * These values are chosen such that FAIL and SUCCESS match the
12979 + * values of the regular mutex_trylock().
12980 + */
12981 +enum mutex_trylock_recursive_enum {
12982 +       MUTEX_TRYLOCK_FAILED    = 0,
12983 +       MUTEX_TRYLOCK_SUCCESS   = 1,
12984 +       MUTEX_TRYLOCK_RECURSIVE,
12985 +};
12986 +/**
12987 + * mutex_trylock_recursive - trylock variant that allows recursive locking
12988 + * @lock: mutex to be locked
12989 + *
12990 + * This function should not be used, _ever_. It is purely for hysterical GEM
12991 + * raisins, and once those are gone this will be removed.
12992 + *
12993 + * Returns:
12994 + *  MUTEX_TRYLOCK_FAILED    - trylock failed,
12995 + *  MUTEX_TRYLOCK_SUCCESS   - lock acquired,
12996 + *  MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
12997 + */
12998 +int __rt_mutex_owner_current(struct rt_mutex *lock);
12999 +
13000 +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13001 +mutex_trylock_recursive(struct mutex *lock)
13002 +{
13003 +       if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13004 +               return MUTEX_TRYLOCK_RECURSIVE;
13005 +
13006 +       return mutex_trylock(lock);
13007 +}
13008 +
13009 +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13010 +
13011 +#endif
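
[Usage note] With PREEMPT_RT_FULL, mutex.h now includes mutex_rt.h and struct mutex wraps an rt_mutex, so existing mutex users gain priority inheritance without source changes; the lockdep and nested-locking entry points are mapped one to one. A sketch of the unchanged caller side (names are placeholders):

#include <linux/mutex.h>

static DEFINE_MUTEX(my_cfg_lock);       /* placeholder */
static int my_cfg_value;

static void my_set_cfg(int val)
{
        mutex_lock(&my_cfg_lock);       /* _mutex_lock() -> rt_mutex on RT */
        my_cfg_value = val;             /* sleeping is allowed here on both kernels */
        mutex_unlock(&my_cfg_lock);
}
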
13012 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netdevice.h linux-4.14/include/linux/netdevice.h
13013 --- linux-4.14.orig/include/linux/netdevice.h   2018-09-05 11:03:22.000000000 +0200
13014 +++ linux-4.14/include/linux/netdevice.h        2018-09-05 11:05:07.000000000 +0200
13015 @@ -409,7 +409,19 @@
13016  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13017  
13018  void __napi_schedule(struct napi_struct *n);
13019 +
13020 +/*
13021 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13022 + * run as threads, and they can also be preempted (without PREEMPT_RT
13023 + * interrupt threads can not be preempted). Which means that calling
13024 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
13025 + * and can corrupt the napi->poll_list.
13026 + */
13027 +#ifdef CONFIG_PREEMPT_RT_FULL
13028 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
13029 +#else
13030  void __napi_schedule_irqoff(struct napi_struct *n);
13031 +#endif
13032  
13033  static inline bool napi_disable_pending(struct napi_struct *n)
13034  {
13035 @@ -571,7 +583,11 @@
13036   * write-mostly part
13037   */
13038         spinlock_t              _xmit_lock ____cacheline_aligned_in_smp;
13039 +#ifdef CONFIG_PREEMPT_RT_FULL
13040 +       struct task_struct      *xmit_lock_owner;
13041 +#else
13042         int                     xmit_lock_owner;
13043 +#endif
13044         /*
13045          * Time (in jiffies) of last Tx
13046          */
13047 @@ -2433,14 +2449,53 @@
13048  void synchronize_net(void);
13049  int init_dummy_netdev(struct net_device *dev);
13050  
13051 -DECLARE_PER_CPU(int, xmit_recursion);
13052  #define XMIT_RECURSION_LIMIT   10
13053 +#ifdef CONFIG_PREEMPT_RT_FULL
13054 +static inline int dev_recursion_level(void)
13055 +{
13056 +       return current->xmit_recursion;
13057 +}
13058 +
13059 +static inline int xmit_rec_read(void)
13060 +{
13061 +       return current->xmit_recursion;
13062 +}
13063 +
13064 +static inline void xmit_rec_inc(void)
13065 +{
13066 +       current->xmit_recursion++;
13067 +}
13068 +
13069 +static inline void xmit_rec_dec(void)
13070 +{
13071 +       current->xmit_recursion--;
13072 +}
13073 +
13074 +#else
13075 +
13076 +DECLARE_PER_CPU(int, xmit_recursion);
13077  
13078  static inline int dev_recursion_level(void)
13079  {
13080         return this_cpu_read(xmit_recursion);
13081  }
13082  
13083 +static inline int xmit_rec_read(void)
13084 +{
13085 +       return __this_cpu_read(xmit_recursion);
13086 +}
13087 +
13088 +static inline void xmit_rec_inc(void)
13089 +{
13090 +       __this_cpu_inc(xmit_recursion);
13091 +}
13092 +
13093 +static inline void xmit_rec_dec(void)
13094 +{
13095 +       __this_cpu_dec(xmit_recursion);
13096 +}
13097 +#endif
13098 +
13099  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13100  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13101  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
13102 @@ -2792,6 +2847,7 @@
13103         unsigned int            dropped;
13104         struct sk_buff_head     input_pkt_queue;
13105         struct napi_struct      backlog;
13106 +       struct sk_buff_head     tofree_queue;
13107  
13108  };
13109  
13110 @@ -3515,10 +3571,48 @@
13111         return (1 << debug_value) - 1;
13112  }
13113  
13114 +#ifdef CONFIG_PREEMPT_RT_FULL
13115 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13116 +{
13117 +       txq->xmit_lock_owner = current;
13118 +}
13119 +
13120 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13121 +{
13122 +       txq->xmit_lock_owner = NULL;
13123 +}
13124 +
13125 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13126 +{
13127 +       if (txq->xmit_lock_owner != NULL)
13128 +               return true;
13129 +       return false;
13130 +}
13131 +
13132 +#else
13133 +
13134 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13135 +{
13136 +       txq->xmit_lock_owner = cpu;
13137 +}
13138 +
13139 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13140 +{
13141 +       txq->xmit_lock_owner = -1;
13142 +}
13143 +
13144 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13145 +{
13146 +       if (txq->xmit_lock_owner != -1)
13147 +               return true;
13148 +       return false;
13149 +}
13150 +#endif
13151 +
13152  static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13153  {
13154         spin_lock(&txq->_xmit_lock);
13155 -       txq->xmit_lock_owner = cpu;
13156 +       netdev_queue_set_owner(txq, cpu);
13157  }
13158  
13159  static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13160 @@ -3535,32 +3629,32 @@
13161  static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13162  {
13163         spin_lock_bh(&txq->_xmit_lock);
13164 -       txq->xmit_lock_owner = smp_processor_id();
13165 +       netdev_queue_set_owner(txq, smp_processor_id());
13166  }
13167  
13168  static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13169  {
13170         bool ok = spin_trylock(&txq->_xmit_lock);
13171         if (likely(ok))
13172 -               txq->xmit_lock_owner = smp_processor_id();
13173 +               netdev_queue_set_owner(txq, smp_processor_id());
13174         return ok;
13175  }
13176  
13177  static inline void __netif_tx_unlock(struct netdev_queue *txq)
13178  {
13179 -       txq->xmit_lock_owner = -1;
13180 +       netdev_queue_clear_owner(txq);
13181         spin_unlock(&txq->_xmit_lock);
13182  }
13183  
13184  static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13185  {
13186 -       txq->xmit_lock_owner = -1;
13187 +       netdev_queue_clear_owner(txq);
13188         spin_unlock_bh(&txq->_xmit_lock);
13189  }
13190  
13191  static inline void txq_trans_update(struct netdev_queue *txq)
13192  {
13193 -       if (txq->xmit_lock_owner != -1)
13194 +       if (netdev_queue_has_owner(txq))
13195                 txq->trans_start = jiffies;
13196  }
13197  
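
[Usage note] The netdevice.h comment above gives the rationale for aliasing __napi_schedule_irqoff() to __napi_schedule() on RT: with forced-threaded, preemptible handlers the "interrupts stay off" assumption no longer holds, and a preempted list manipulation could corrupt napi->poll_list. Drivers keep calling napi_schedule_irqoff() as before; only the mapping changes. A sketch of the usual ISR pattern (the driver structure and register helper are hypothetical):

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct mydrv_priv {                     /* hypothetical driver private data */
        struct napi_struct napi;
};

static void mydrv_mask_rx_irq(struct mydrv_priv *priv)
{
        /* device-specific register write, omitted */
}

static irqreturn_t mydrv_isr(int irq, void *data)
{
        struct mydrv_priv *priv = data;

        mydrv_mask_rx_irq(priv);
        napi_schedule_irqoff(&priv->napi);      /* plain __napi_schedule() on RT */
        return IRQ_HANDLED;
}
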
13198 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/netfilter/x_tables.h linux-4.14/include/linux/netfilter/x_tables.h
13199 --- linux-4.14.orig/include/linux/netfilter/x_tables.h  2018-09-05 11:03:22.000000000 +0200
13200 +++ linux-4.14/include/linux/netfilter/x_tables.h       2018-09-05 11:05:07.000000000 +0200
13201 @@ -6,6 +6,7 @@
13202  #include <linux/netdevice.h>
13203  #include <linux/static_key.h>
13204  #include <linux/netfilter.h>
13205 +#include <linux/locallock.h>
13206  #include <uapi/linux/netfilter/x_tables.h>
13207  
13208  /* Test a struct->invflags and a boolean for inequality */
13209 @@ -341,6 +342,8 @@
13210   */
13211  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13212  
13213 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13214 +
13215  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13216   *
13217   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13218 @@ -361,6 +364,9 @@
13219  {
13220         unsigned int addend;
13221  
13222 +       /* RT protection */
13223 +       local_lock(xt_write_lock);
13224 +
13225         /*
13226          * Low order bit of sequence is set if we already
13227          * called xt_write_recseq_begin().
13228 @@ -391,6 +397,7 @@
13229         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13230         smp_wmb();
13231         __this_cpu_add(xt_recseq.sequence, addend);
13232 +       local_unlock(xt_write_lock);
13233  }
13234  
13235  /*
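
[Usage note] On RT the xt_write_recseq counter alone is not enough: a preempted writer could leave the per-CPU sequence odd for a long time and stall readers such as the counter-collection path, so xt_write_recseq_begin()/_end() now additionally take the xt_write_lock local lock. The call pattern in the packet path is unchanged; roughly as below (a simplified stand-in for the ip(6)tables evaluation loop, not code from this patch):

#include <linux/netfilter/x_tables.h>

static void my_eval_rules(void)                 /* simplified placeholder */
{
        unsigned int addend;

        local_bh_disable();
        addend = xt_write_recseq_begin();       /* takes xt_write_lock on RT */
        /* walk the ruleset and bump per-cpu counters */
        xt_write_recseq_end(addend);            /* drops xt_write_lock on RT */
        local_bh_enable();
}
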
13236 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_fs.h linux-4.14/include/linux/nfs_fs.h
13237 --- linux-4.14.orig/include/linux/nfs_fs.h      2017-11-12 19:46:13.000000000 +0100
13238 +++ linux-4.14/include/linux/nfs_fs.h   2018-09-05 11:05:07.000000000 +0200
13239 @@ -162,7 +162,11 @@
13240  
13241         /* Readers: in-flight sillydelete RPC calls */
13242         /* Writers: rmdir */
13243 +#ifdef CONFIG_PREEMPT_RT_BASE
13244 +       struct semaphore        rmdir_sem;
13245 +#else
13246         struct rw_semaphore     rmdir_sem;
13247 +#endif
13248         struct mutex            commit_mutex;
13249  
13250  #if IS_ENABLED(CONFIG_NFS_V4)
13251 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/nfs_xdr.h linux-4.14/include/linux/nfs_xdr.h
13252 --- linux-4.14.orig/include/linux/nfs_xdr.h     2017-11-12 19:46:13.000000000 +0100
13253 +++ linux-4.14/include/linux/nfs_xdr.h  2018-09-05 11:05:07.000000000 +0200
13254 @@ -1530,7 +1530,7 @@
13255         struct nfs_removeargs args;
13256         struct nfs_removeres res;
13257         struct dentry *dentry;
13258 -       wait_queue_head_t wq;
13259 +       struct swait_queue_head wq;
13260         struct rpc_cred *cred;
13261         struct nfs_fattr dir_attr;
13262         long timeout;
13263 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/notifier.h linux-4.14/include/linux/notifier.h
13264 --- linux-4.14.orig/include/linux/notifier.h    2017-11-12 19:46:13.000000000 +0100
13265 +++ linux-4.14/include/linux/notifier.h 2018-09-05 11:05:07.000000000 +0200
13266 @@ -7,7 +7,7 @@
13267   *
13268   *                             Alan Cox <Alan.Cox@linux.org>
13269   */
13270 -
13271 +
13272  #ifndef _LINUX_NOTIFIER_H
13273  #define _LINUX_NOTIFIER_H
13274  #include <linux/errno.h>
13275 @@ -43,9 +43,7 @@
13276   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13277   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13278   * SRCU notifier chains should be used when the chain will be called very
13279 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13280 - * chains are slightly more difficult to use because they require special
13281 - * runtime initialization.
13282 + * often but notifier_blocks will seldom be removed.
13283   */
13284  
13285  struct notifier_block;
13286 @@ -91,7 +89,7 @@
13287                 (name)->head = NULL;            \
13288         } while (0)
13289  
13290 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13291 +/* srcu_notifier_heads must be cleaned up dynamically */
13292  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13293  #define srcu_cleanup_notifier_head(name)       \
13294                 cleanup_srcu_struct(&(name)->srcu);
13295 @@ -104,7 +102,13 @@
13296                 .head = NULL }
13297  #define RAW_NOTIFIER_INIT(name)        {                               \
13298                 .head = NULL }
13299 -/* srcu_notifier_heads cannot be initialized statically */
13300 +
13301 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13302 +       {                                                       \
13303 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13304 +               .head = NULL,                                   \
13305 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13306 +       }
13307  
13308  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13309         struct atomic_notifier_head name =                      \
13310 @@ -116,6 +120,26 @@
13311         struct raw_notifier_head name =                         \
13312                 RAW_NOTIFIER_INIT(name)
13313  
13314 +#ifdef CONFIG_TREE_SRCU
13315 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13316 +       static DEFINE_PER_CPU(struct srcu_data,                 \
13317 +                       name##_head_srcu_data);                 \
13318 +       mod struct srcu_notifier_head name =                    \
13319 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13320 +
13321 +#else
13322 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13323 +       mod struct srcu_notifier_head name =                    \
13324 +                       SRCU_NOTIFIER_INIT(name, name)
13325 +
13326 +#endif
13327 +
13328 +#define SRCU_NOTIFIER_HEAD(name)                               \
13329 +       _SRCU_NOTIFIER_HEAD(name, )
13330 +
13331 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13332 +       _SRCU_NOTIFIER_HEAD(name, static)
13333 +
13334  #ifdef __KERNEL__
13335  
13336  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13337 @@ -185,12 +209,12 @@
13338  
13339  /*
13340   *     Declared notifiers so far. I can imagine quite a few more chains
13341 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13342 + *     over time (eg laptop power reset chains, reboot chain (to clean
13343   *     device units up), device [un]mount chain, module load/unload chain,
13344 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13345 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13346   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13347   */
13348 -
13349 +
13350  /* CPU notfiers are defined in include/linux/cpu.h. */
13351  
13352  /* netdevice notifiers are defined in include/linux/netdevice.h */
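
[Usage note] The notifier.h hunks make SRCU notifier heads statically initializable: SRCU_NOTIFIER_INIT plus the SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() wrappers replace the old "must call srcu_init_notifier_head() at runtime" rule, which is why the corresponding comments are trimmed. A sketch of defining and using such a chain with the new macros (chain, callback and event value are placeholders):

#include <linux/init.h>
#include <linux/notifier.h>

static SRCU_NOTIFIER_HEAD_STATIC(my_chain);     /* placeholder */

static int my_event_cb(struct notifier_block *nb, unsigned long event, void *data)
{
        /* called from process context; sleeping is allowed under SRCU */
        return NOTIFY_OK;
}

static struct notifier_block my_nb = {
        .notifier_call = my_event_cb,
};

static int __init my_notifier_init(void)
{
        srcu_notifier_chain_register(&my_chain, &my_nb);
        srcu_notifier_call_chain(&my_chain, 1, NULL);   /* event 1 is arbitrary */
        return 0;
}
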
13353 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu.h linux-4.14/include/linux/percpu.h
13354 --- linux-4.14.orig/include/linux/percpu.h      2017-11-12 19:46:13.000000000 +0100
13355 +++ linux-4.14/include/linux/percpu.h   2018-09-05 11:05:07.000000000 +0200
13356 @@ -19,6 +19,35 @@
13357  #define PERCPU_MODULE_RESERVE          0
13358  #endif
13359  
13360 +#ifdef CONFIG_PREEMPT_RT_FULL
13361 +
13362 +#define get_local_var(var) (*({        \
13363 +       migrate_disable();      \
13364 +       this_cpu_ptr(&var);     }))
13365 +
13366 +#define put_local_var(var) do {        \
13367 +       (void)&(var);           \
13368 +       migrate_enable();       \
13369 +} while (0)
13370 +
13371 +# define get_local_ptr(var) ({ \
13372 +       migrate_disable();      \
13373 +       this_cpu_ptr(var);      })
13374 +
13375 +# define put_local_ptr(var) do {       \
13376 +       (void)(var);                    \
13377 +       migrate_enable();               \
13378 +} while (0)
13379 +
13380 +#else
13381 +
13382 +#define get_local_var(var)     get_cpu_var(var)
13383 +#define put_local_var(var)     put_cpu_var(var)
13384 +#define get_local_ptr(var)     get_cpu_ptr(var)
13385 +#define put_local_ptr(var)     put_cpu_ptr(var)
13386 +
13387 +#endif
13388 +
13389  /* minimum unit size, also is the maximum supported allocation size */
13390  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13391  
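
Usage sketch (illustrative, hypothetical per-CPU variable): get_local_var()/put_local_var() and get_local_ptr()/put_local_ptr() are drop-in replacements for the get_cpu_*()/put_cpu_*() accessors in code that must stay preemptible on PREEMPT_RT_FULL; they pin the task with migrate_disable() instead of disabling preemption, and fall back to the plain per-CPU accessors on !RT.

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_hits);     /* hypothetical */

static void example_count_hit(void)
{
        /*
         * On RT this only prevents migration, so another task on the same
         * CPU could still preempt us; any such concurrency must be handled
         * separately (e.g. with a local lock).
         */
        get_local_var(example_hits)++;
        put_local_var(example_hits);
}
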
13392 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/percpu-rwsem.h linux-4.14/include/linux/percpu-rwsem.h
13393 --- linux-4.14.orig/include/linux/percpu-rwsem.h        2018-09-05 11:03:22.000000000 +0200
13394 +++ linux-4.14/include/linux/percpu-rwsem.h     2018-09-05 11:05:07.000000000 +0200
13395 @@ -29,7 +29,7 @@
13396  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13397  extern void __percpu_up_read(struct percpu_rw_semaphore *);
13398  
13399 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13400 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13401  {
13402         might_sleep();
13403  
13404 @@ -47,16 +47,10 @@
13405         __this_cpu_inc(*sem->read_count);
13406         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13407                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13408 -       barrier();
13409         /*
13410 -        * The barrier() prevents the compiler from
13411 +        * The preempt_enable() prevents the compiler from
13412          * bleeding the critical section out.
13413          */
13414 -}
13415 -
13416 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13417 -{
13418 -       percpu_down_read_preempt_disable(sem);
13419         preempt_enable();
13420  }
13421  
13422 @@ -83,13 +77,9 @@
13423         return ret;
13424  }
13425  
13426 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13427 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13428  {
13429 -       /*
13430 -        * The barrier() prevents the compiler from
13431 -        * bleeding the critical section out.
13432 -        */
13433 -       barrier();
13434 +       preempt_disable();
13435         /*
13436          * Same as in percpu_down_read().
13437          */
13438 @@ -102,12 +92,6 @@
13439         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
13440  }
13441  
13442 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13443 -{
13444 -       preempt_disable();
13445 -       percpu_up_read_preempt_enable(sem);
13446 -}
13447 -
13448  extern void percpu_down_write(struct percpu_rw_semaphore *);
13449  extern void percpu_up_write(struct percpu_rw_semaphore *);
13450  
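
Usage sketch (illustrative; DEFINE_STATIC_PERCPU_RWSEM is assumed from the mainline header, the other names are hypothetical): the hunk folds the _preempt_disable/_preempt_enable variants back into percpu_down_read()/percpu_up_read(), so the read side no longer spans a preempt-disabled region; callers keep the usual pattern.

#include <linux/percpu-rwsem.h>

DEFINE_STATIC_PERCPU_RWSEM(example_psem);

static void example_psem_reader(void)
{
        percpu_down_read(&example_psem);
        /* read-side section: may sleep, no longer preempt-disabled */
        percpu_up_read(&example_psem);
}

static void example_psem_writer(void)
{
        percpu_down_write(&example_psem);
        /* exclusive section */
        percpu_up_write(&example_psem);
}
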
13451 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/pid.h linux-4.14/include/linux/pid.h
13452 --- linux-4.14.orig/include/linux/pid.h 2017-11-12 19:46:13.000000000 +0100
13453 +++ linux-4.14/include/linux/pid.h      2018-09-05 11:05:07.000000000 +0200
13454 @@ -3,6 +3,7 @@
13455  #define _LINUX_PID_H
13456  
13457  #include <linux/rculist.h>
13458 +#include <linux/atomic.h>
13459  
13460  enum pid_type
13461  {
13462 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/posix-timers.h linux-4.14/include/linux/posix-timers.h
13463 --- linux-4.14.orig/include/linux/posix-timers.h        2017-11-12 19:46:13.000000000 +0100
13464 +++ linux-4.14/include/linux/posix-timers.h     2018-09-05 11:05:07.000000000 +0200
13465 @@ -101,8 +101,8 @@
13466                 struct {
13467                         struct alarm    alarmtimer;
13468                 } alarm;
13469 -               struct rcu_head         rcu;
13470         } it;
13471 +       struct rcu_head         rcu;
13472  };
13473  
13474  void run_posix_cpu_timers(struct task_struct *task);
13475 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/preempt.h linux-4.14/include/linux/preempt.h
13476 --- linux-4.14.orig/include/linux/preempt.h     2017-11-12 19:46:13.000000000 +0100
13477 +++ linux-4.14/include/linux/preempt.h  2018-09-05 11:05:07.000000000 +0200
13478 @@ -51,7 +51,11 @@
13479  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13480  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13481  
13482 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13483 +#ifndef CONFIG_PREEMPT_RT_FULL
13484 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13485 +#else
13486 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13487 +#endif
13488  
13489  /* We use the MSB mostly because its available */
13490  #define PREEMPT_NEED_RESCHED   0x80000000
13491 @@ -81,9 +85,15 @@
13492  #include <asm/preempt.h>
13493  
13494  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13495 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13496  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13497                                  | NMI_MASK))
13498 +#ifndef CONFIG_PREEMPT_RT_FULL
13499 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13500 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13501 +#else
13502 +# define softirq_count()       (0UL)
13503 +extern int in_serving_softirq(void);
13504 +#endif
13505  
13506  /*
13507   * Are we doing bottom half or hardware interrupt processing?
13508 @@ -101,7 +111,6 @@
13509  #define in_irq()               (hardirq_count())
13510  #define in_softirq()           (softirq_count())
13511  #define in_interrupt()         (irq_count())
13512 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13513  #define in_nmi()               (preempt_count() & NMI_MASK)
13514  #define in_task()              (!(preempt_count() & \
13515                                    (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
13516 @@ -118,7 +127,11 @@
13517  /*
13518   * The preempt_count offset after spin_lock()
13519   */
13520 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13521  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13522 +#else
13523 +#define PREEMPT_LOCK_OFFSET    0
13524 +#endif
13525  
13526  /*
13527   * The preempt_count offset needed for things like:
13528 @@ -167,6 +180,20 @@
13529  #define preempt_count_inc() preempt_count_add(1)
13530  #define preempt_count_dec() preempt_count_sub(1)
13531  
13532 +#ifdef CONFIG_PREEMPT_LAZY
13533 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13534 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13535 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13536 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13537 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13538 +#else
13539 +#define add_preempt_lazy_count(val)    do { } while (0)
13540 +#define sub_preempt_lazy_count(val)    do { } while (0)
13541 +#define inc_preempt_lazy_count()       do { } while (0)
13542 +#define dec_preempt_lazy_count()       do { } while (0)
13543 +#define preempt_lazy_count()           (0)
13544 +#endif
13545 +
13546  #ifdef CONFIG_PREEMPT_COUNT
13547  
13548  #define preempt_disable() \
13549 @@ -175,16 +202,53 @@
13550         barrier(); \
13551  } while (0)
13552  
13553 +#define preempt_lazy_disable() \
13554 +do { \
13555 +       inc_preempt_lazy_count(); \
13556 +       barrier(); \
13557 +} while (0)
13558 +
13559  #define sched_preempt_enable_no_resched() \
13560  do { \
13561         barrier(); \
13562         preempt_count_dec(); \
13563  } while (0)
13564  
13565 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13566 +#ifdef CONFIG_PREEMPT_RT_BASE
13567 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13568 +# define preempt_check_resched_rt() preempt_check_resched()
13569 +#else
13570 +# define preempt_enable_no_resched() preempt_enable()
13571 +# define preempt_check_resched_rt() barrier();
13572 +#endif
13573  
13574  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13575  
13576 +#ifdef CONFIG_SMP
13577 +
13578 +extern void migrate_disable(void);
13579 +extern void migrate_enable(void);
13580 +
13581 +int __migrate_disabled(struct task_struct *p);
13582 +
13583 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
13584 +
13585 +extern void migrate_disable(void);
13586 +extern void migrate_enable(void);
13587 +static inline int __migrate_disabled(struct task_struct *p)
13588 +{
13589 +       return 0;
13590 +}
13591 +
13592 +#else
13593 +#define migrate_disable()              barrier()
13594 +#define migrate_enable()               barrier()
13595 +static inline int __migrate_disabled(struct task_struct *p)
13596 +{
13597 +       return 0;
13598 +}
13599 +#endif
13600 +
13601  #ifdef CONFIG_PREEMPT
13602  #define preempt_enable() \
13603  do { \
13604 @@ -206,6 +270,13 @@
13605                 __preempt_schedule(); \
13606  } while (0)
13607  
13608 +#define preempt_lazy_enable() \
13609 +do { \
13610 +       dec_preempt_lazy_count(); \
13611 +       barrier(); \
13612 +       preempt_check_resched(); \
13613 +} while (0)
13614 +
13615  #else /* !CONFIG_PREEMPT */
13616  #define preempt_enable() \
13617  do { \
13618 @@ -213,6 +284,12 @@
13619         preempt_count_dec(); \
13620  } while (0)
13621  
13622 +#define preempt_lazy_enable() \
13623 +do { \
13624 +       dec_preempt_lazy_count(); \
13625 +       barrier(); \
13626 +} while (0)
13627 +
13628  #define preempt_enable_notrace() \
13629  do { \
13630         barrier(); \
13631 @@ -251,8 +328,16 @@
13632  #define preempt_disable_notrace()              barrier()
13633  #define preempt_enable_no_resched_notrace()    barrier()
13634  #define preempt_enable_notrace()               barrier()
13635 +#define preempt_check_resched_rt()             barrier()
13636  #define preemptible()                          0
13637  
13638 +#define migrate_disable()                      barrier()
13639 +#define migrate_enable()                       barrier()
13640 +
13641 +static inline int __migrate_disabled(struct task_struct *p)
13642 +{
13643 +       return 0;
13644 +}
13645  #endif /* CONFIG_PREEMPT_COUNT */
13646  
13647  #ifdef MODULE
13648 @@ -271,10 +356,22 @@
13649  } while (0)
13650  #define preempt_fold_need_resched() \
13651  do { \
13652 -       if (tif_need_resched()) \
13653 +       if (tif_need_resched_now()) \
13654                 set_preempt_need_resched(); \
13655  } while (0)
13656  
13657 +#ifdef CONFIG_PREEMPT_RT_FULL
13658 +# define preempt_disable_rt()          preempt_disable()
13659 +# define preempt_enable_rt()           preempt_enable()
13660 +# define preempt_disable_nort()                barrier()
13661 +# define preempt_enable_nort()         barrier()
13662 +#else
13663 +# define preempt_disable_rt()          barrier()
13664 +# define preempt_enable_rt()           barrier()
13665 +# define preempt_disable_nort()                preempt_disable()
13666 +# define preempt_enable_nort()         preempt_enable()
13667 +#endif
13668 +
13669  #ifdef CONFIG_PREEMPT_NOTIFIERS
13670  
13671  struct preempt_notifier;
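
Usage sketch (illustrative, hypothetical functions): migrate_disable()/migrate_enable() keep a task on its current CPU while leaving it preemptible, and the *_rt()/*_nort() helpers let shared code disable preemption only on the kernel flavor where that is still appropriate.

#include <linux/preempt.h>
#include <linux/smp.h>

static void example_stay_on_cpu(void)
{
        int cpu;

        migrate_disable();              /* may still be preempted on RT ... */
        cpu = raw_smp_processor_id();   /* ... but the CPU cannot change */
        (void)cpu;                      /* per-CPU work for 'cpu' goes here */
        migrate_enable();
}

static void example_nort_section(void)
{
        /* disables preemption on !RT only; a plain barrier on RT */
        preempt_disable_nort();
        /* short, non-sleeping work on !RT */
        preempt_enable_nort();
}
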
13672 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/printk.h linux-4.14/include/linux/printk.h
13673 --- linux-4.14.orig/include/linux/printk.h      2017-11-12 19:46:13.000000000 +0100
13674 +++ linux-4.14/include/linux/printk.h   2018-09-05 11:05:07.000000000 +0200
13675 @@ -142,9 +142,11 @@
13676  #ifdef CONFIG_EARLY_PRINTK
13677  extern asmlinkage __printf(1, 2)
13678  void early_printk(const char *fmt, ...);
13679 +extern void printk_kill(void);
13680  #else
13681  static inline __printf(1, 2) __cold
13682  void early_printk(const char *s, ...) { }
13683 +static inline void printk_kill(void) { }
13684  #endif
13685  
13686  #ifdef CONFIG_PRINTK_NMI
13687 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/radix-tree.h linux-4.14/include/linux/radix-tree.h
13688 --- linux-4.14.orig/include/linux/radix-tree.h  2017-11-12 19:46:13.000000000 +0100
13689 +++ linux-4.14/include/linux/radix-tree.h       2018-09-05 11:05:07.000000000 +0200
13690 @@ -328,6 +328,8 @@
13691  int radix_tree_preload(gfp_t gfp_mask);
13692  int radix_tree_maybe_preload(gfp_t gfp_mask);
13693  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
13694 +void radix_tree_preload_end(void);
13695 +
13696  void radix_tree_init(void);
13697  void *radix_tree_tag_set(struct radix_tree_root *,
13698                         unsigned long index, unsigned int tag);
13699 @@ -347,11 +349,6 @@
13700                 unsigned int max_items, unsigned int tag);
13701  int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
13702  
13703 -static inline void radix_tree_preload_end(void)
13704 -{
13705 -       preempt_enable();
13706 -}
13707 -
13708  int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
13709  int radix_tree_split(struct radix_tree_root *, unsigned long index,
13710                         unsigned new_order);
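
Usage sketch (illustrative, hypothetical tree and lock): radix_tree_preload_end() becomes an out-of-line function here (so the RT implementation can do more than a bare preempt_enable()), but the calling convention is unchanged: preload outside the lock, insert under it, then end the preload section.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static RADIX_TREE(example_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(example_tree_lock);

static int example_insert(unsigned long index, void *item)
{
        int err;

        err = radix_tree_preload(GFP_KERNEL);   /* may sleep */
        if (err)
                return err;

        spin_lock(&example_tree_lock);
        err = radix_tree_insert(&example_tree, index, item);
        spin_unlock(&example_tree_lock);

        radix_tree_preload_end();
        return err;
}
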
13711 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/random.h linux-4.14/include/linux/random.h
13712 --- linux-4.14.orig/include/linux/random.h      2017-11-12 19:46:13.000000000 +0100
13713 +++ linux-4.14/include/linux/random.h   2018-09-05 11:05:07.000000000 +0200
13714 @@ -32,7 +32,7 @@
13715  
13716  extern void add_input_randomness(unsigned int type, unsigned int code,
13717                                  unsigned int value) __latent_entropy;
13718 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
13719 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
13720  
13721  extern void get_random_bytes(void *buf, int nbytes);
13722  extern int wait_for_random_bytes(void);
13723 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_augmented.h linux-4.14/include/linux/rbtree_augmented.h
13724 --- linux-4.14.orig/include/linux/rbtree_augmented.h    2017-11-12 19:46:13.000000000 +0100
13725 +++ linux-4.14/include/linux/rbtree_augmented.h 2018-09-05 11:05:07.000000000 +0200
13726 @@ -26,6 +26,7 @@
13727  
13728  #include <linux/compiler.h>
13729  #include <linux/rbtree.h>
13730 +#include <linux/rcupdate.h>
13731  
13732  /*
13733   * Please note - only struct rb_augment_callbacks and the prototypes for
13734 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree.h linux-4.14/include/linux/rbtree.h
13735 --- linux-4.14.orig/include/linux/rbtree.h      2017-11-12 19:46:13.000000000 +0100
13736 +++ linux-4.14/include/linux/rbtree.h   2018-09-05 11:05:07.000000000 +0200
13737 @@ -31,7 +31,7 @@
13738  
13739  #include <linux/kernel.h>
13740  #include <linux/stddef.h>
13741 -#include <linux/rcupdate.h>
13742 +#include <linux/rcu_assign_pointer.h>
13743  
13744  struct rb_node {
13745         unsigned long  __rb_parent_color;
13746 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rbtree_latch.h linux-4.14/include/linux/rbtree_latch.h
13747 --- linux-4.14.orig/include/linux/rbtree_latch.h        2017-11-12 19:46:13.000000000 +0100
13748 +++ linux-4.14/include/linux/rbtree_latch.h     2018-09-05 11:05:07.000000000 +0200
13749 @@ -35,6 +35,7 @@
13750  
13751  #include <linux/rbtree.h>
13752  #include <linux/seqlock.h>
13753 +#include <linux/rcupdate.h>
13754  
13755  struct latch_tree_node {
13756         struct rb_node node[2];
13757 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcu_assign_pointer.h linux-4.14/include/linux/rcu_assign_pointer.h
13758 --- linux-4.14.orig/include/linux/rcu_assign_pointer.h  1970-01-01 01:00:00.000000000 +0100
13759 +++ linux-4.14/include/linux/rcu_assign_pointer.h       2018-09-05 11:05:07.000000000 +0200
13760 @@ -0,0 +1,54 @@
13761 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
13762 +#define __LINUX_RCU_ASSIGN_POINTER_H__
13763 +#include <linux/compiler.h>
13764 +#include <asm/barrier.h>
13765 +
13766 +/**
13767 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13768 + * @v: The value to statically initialize with.
13769 + */
13770 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13771 +
13772 +/**
13773 + * rcu_assign_pointer() - assign to RCU-protected pointer
13774 + * @p: pointer to assign to
13775 + * @v: value to assign (publish)
13776 + *
13777 + * Assigns the specified value to the specified RCU-protected
13778 + * pointer, ensuring that any concurrent RCU readers will see
13779 + * any prior initialization.
13780 + *
13781 + * Inserts memory barriers on architectures that require them
13782 + * (which is most of them), and also prevents the compiler from
13783 + * reordering the code that initializes the structure after the pointer
13784 + * assignment.  More importantly, this call documents which pointers
13785 + * will be dereferenced by RCU read-side code.
13786 + *
13787 + * In some special cases, you may use RCU_INIT_POINTER() instead
13788 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
13789 + * to the fact that it does not constrain either the CPU or the compiler.
13790 + * That said, using RCU_INIT_POINTER() when you should have used
13791 + * rcu_assign_pointer() is a very bad thing that results in
13792 + * impossible-to-diagnose memory corruption.  So please be careful.
13793 + * See the RCU_INIT_POINTER() comment header for details.
13794 + *
13795 + * Note that rcu_assign_pointer() evaluates each of its arguments only
13796 + * once, appearances notwithstanding.  One of the "extra" evaluations
13797 + * is in typeof() and the other visible only to sparse (__CHECKER__),
13798 + * neither of which actually execute the argument.  As with most cpp
13799 + * macros, this execute-arguments-only-once property is important, so
13800 + * please be careful when making changes to rcu_assign_pointer() and the
13801 + * other macros that it invokes.
13802 + */
13803 +#define rcu_assign_pointer(p, v)                                             \
13804 +({                                                                           \
13805 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
13806 +                                                                             \
13807 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
13808 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
13809 +       else                                                                  \
13810 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
13811 +       _r_a_p__v;                                                            \
13812 +})
13813 +
13814 +#endif
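
Usage sketch (illustrative, hypothetical struct and names): the new header only relocates RCU_INITIALIZER() and rcu_assign_pointer() so that rbtree.h can use them without pulling in all of rcupdate.h; the publish/read pattern itself is unchanged.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct example_cfg {
        int threshold;
};

static struct example_cfg __rcu *example_cfg_ptr;

static int example_update(int threshold)
{
        struct example_cfg *new, *old;

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->threshold = threshold;

        old = rcu_dereference_protected(example_cfg_ptr, 1);
        rcu_assign_pointer(example_cfg_ptr, new);       /* publish */
        synchronize_rcu();                              /* wait out readers */
        kfree(old);
        return 0;
}

static int example_read_threshold(void)
{
        struct example_cfg *cfg;
        int val = 0;

        rcu_read_lock();
        cfg = rcu_dereference(example_cfg_ptr);
        if (cfg)
                val = cfg->threshold;
        rcu_read_unlock();
        return val;
}
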
13815 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcupdate.h linux-4.14/include/linux/rcupdate.h
13816 --- linux-4.14.orig/include/linux/rcupdate.h    2018-09-05 11:03:22.000000000 +0200
13817 +++ linux-4.14/include/linux/rcupdate.h 2018-09-05 11:05:07.000000000 +0200
13818 @@ -42,6 +42,7 @@
13819  #include <linux/lockdep.h>
13820  #include <asm/processor.h>
13821  #include <linux/cpumask.h>
13822 +#include <linux/rcu_assign_pointer.h>
13823  
13824  #define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
13825  #define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
13826 @@ -55,7 +56,11 @@
13827  #define        call_rcu        call_rcu_sched
13828  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13829  
13830 +#ifdef CONFIG_PREEMPT_RT_FULL
13831 +#define call_rcu_bh    call_rcu
13832 +#else
13833  void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
13834 +#endif
13835  void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
13836  void synchronize_sched(void);
13837  void rcu_barrier_tasks(void);
13838 @@ -74,6 +79,11 @@
13839   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13840   */
13841  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13842 +#ifndef CONFIG_PREEMPT_RT_FULL
13843 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13844 +#else
13845 +static inline int sched_rcu_preempt_depth(void) { return 0; }
13846 +#endif
13847  
13848  #else /* #ifdef CONFIG_PREEMPT_RCU */
13849  
13850 @@ -99,6 +109,8 @@
13851         return 0;
13852  }
13853  
13854 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13855 +
13856  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13857  
13858  /* Internal to kernel */
13859 @@ -255,7 +267,14 @@
13860  extern struct lockdep_map rcu_callback_map;
13861  int debug_lockdep_rcu_enabled(void);
13862  int rcu_read_lock_held(void);
13863 +#ifdef CONFIG_PREEMPT_RT_FULL
13864 +static inline int rcu_read_lock_bh_held(void)
13865 +{
13866 +       return rcu_read_lock_held();
13867 +}
13868 +#else
13869  int rcu_read_lock_bh_held(void);
13870 +#endif
13871  int rcu_read_lock_sched_held(void);
13872  
13873  #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
13874 @@ -365,54 +384,6 @@
13875  })
13876  
13877  /**
13878 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
13879 - * @v: The value to statically initialize with.
13880 - */
13881 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
13882 -
13883 -/**
13884 - * rcu_assign_pointer() - assign to RCU-protected pointer
13885 - * @p: pointer to assign to
13886 - * @v: value to assign (publish)
13887 - *
13888 - * Assigns the specified value to the specified RCU-protected
13889 - * pointer, ensuring that any concurrent RCU readers will see
13890 - * any prior initialization.
13891 - *
13892 - * Inserts memory barriers on architectures that require them
13893 - * (which is most of them), and also prevents the compiler from
13894 - * reordering the code that initializes the structure after the pointer
13895 - * assignment.  More importantly, this call documents which pointers
13896 - * will be dereferenced by RCU read-side code.
13897 - *
13898 - * In some special cases, you may use RCU_INIT_POINTER() instead
13899 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
13900 - * to the fact that it does not constrain either the CPU or the compiler.
13901 - * That said, using RCU_INIT_POINTER() when you should have used
13902 - * rcu_assign_pointer() is a very bad thing that results in
13903 - * impossible-to-diagnose memory corruption.  So please be careful.
13904 - * See the RCU_INIT_POINTER() comment header for details.
13905 - *
13906 - * Note that rcu_assign_pointer() evaluates each of its arguments only
13907 - * once, appearances notwithstanding.  One of the "extra" evaluations
13908 - * is in typeof() and the other visible only to sparse (__CHECKER__),
13909 - * neither of which actually execute the argument.  As with most cpp
13910 - * macros, this execute-arguments-only-once property is important, so
13911 - * please be careful when making changes to rcu_assign_pointer() and the
13912 - * other macros that it invokes.
13913 - */
13914 -#define rcu_assign_pointer(p, v)                                             \
13915 -({                                                                           \
13916 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
13917 -                                                                             \
13918 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
13919 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
13920 -       else                                                                  \
13921 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
13922 -       _r_a_p__v;                                                            \
13923 -})
13924 -
13925 -/**
13926   * rcu_swap_protected() - swap an RCU and a regular pointer
13927   * @rcu_ptr: RCU pointer
13928   * @ptr: regular pointer
13929 @@ -707,10 +678,14 @@
13930  static inline void rcu_read_lock_bh(void)
13931  {
13932         local_bh_disable();
13933 +#ifdef CONFIG_PREEMPT_RT_FULL
13934 +       rcu_read_lock();
13935 +#else
13936         __acquire(RCU_BH);
13937         rcu_lock_acquire(&rcu_bh_lock_map);
13938         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13939                          "rcu_read_lock_bh() used illegally while idle");
13940 +#endif
13941  }
13942  
13943  /*
13944 @@ -720,10 +695,14 @@
13945   */
13946  static inline void rcu_read_unlock_bh(void)
13947  {
13948 +#ifdef CONFIG_PREEMPT_RT_FULL
13949 +       rcu_read_unlock();
13950 +#else
13951         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13952                          "rcu_read_unlock_bh() used illegally while idle");
13953         rcu_lock_release(&rcu_bh_lock_map);
13954         __release(RCU_BH);
13955 +#endif
13956         local_bh_enable();
13957  }
13958  
13959 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rcutree.h linux-4.14/include/linux/rcutree.h
13960 --- linux-4.14.orig/include/linux/rcutree.h     2017-11-12 19:46:13.000000000 +0100
13961 +++ linux-4.14/include/linux/rcutree.h  2018-09-05 11:05:07.000000000 +0200
13962 @@ -44,7 +44,11 @@
13963         rcu_note_context_switch(false);
13964  }
13965  
13966 +#ifdef CONFIG_PREEMPT_RT_FULL
13967 +# define synchronize_rcu_bh    synchronize_rcu
13968 +#else
13969  void synchronize_rcu_bh(void);
13970 +#endif
13971  void synchronize_sched_expedited(void);
13972  void synchronize_rcu_expedited(void);
13973  
13974 @@ -72,7 +76,11 @@
13975  }
13976  
13977  void rcu_barrier(void);
13978 +#ifdef CONFIG_PREEMPT_RT_FULL
13979 +# define rcu_barrier_bh                rcu_barrier
13980 +#else
13981  void rcu_barrier_bh(void);
13982 +#endif
13983  void rcu_barrier_sched(void);
13984  unsigned long get_state_synchronize_rcu(void);
13985  void cond_synchronize_rcu(unsigned long oldstate);
13986 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/ring_buffer.h linux-4.14/include/linux/ring_buffer.h
13987 --- linux-4.14.orig/include/linux/ring_buffer.h 2018-09-05 11:03:22.000000000 +0200
13988 +++ linux-4.14/include/linux/ring_buffer.h      2018-09-05 11:05:07.000000000 +0200
13989 @@ -34,10 +34,12 @@
13990   *                              array[0] = time delta (28 .. 59)
13991   *                              size = 8 bytes
13992   *
13993 - * @RINGBUF_TYPE_TIME_STAMP:   Sync time stamp with external clock
13994 - *                              array[0]    = tv_nsec
13995 - *                              array[1..2] = tv_sec
13996 - *                              size = 16 bytes
13997 + * @RINGBUF_TYPE_TIME_STAMP:   Absolute timestamp
13998 + *                              Same format as TIME_EXTEND except that the
13999 + *                              value is an absolute timestamp, not a delta
14000 + *                              event.time_delta contains bottom 27 bits
14001 + *                              array[0] = top (28 .. 59) bits
14002 + *                              size = 8 bytes
14003   *
14004   * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14005   *                             Data record
14006 @@ -54,12 +56,12 @@
14007         RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14008         RINGBUF_TYPE_PADDING,
14009         RINGBUF_TYPE_TIME_EXTEND,
14010 -       /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14011         RINGBUF_TYPE_TIME_STAMP,
14012  };
14013  
14014  unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14015  void *ring_buffer_event_data(struct ring_buffer_event *event);
14016 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
14017  
14018  /*
14019   * ring_buffer_discard_commit will remove an event that has not
14020 @@ -115,6 +117,9 @@
14021  int ring_buffer_write(struct ring_buffer *buffer,
14022                       unsigned long length, void *data);
14023  
14024 +void ring_buffer_nest_start(struct ring_buffer *buffer);
14025 +void ring_buffer_nest_end(struct ring_buffer *buffer);
14026 +
14027  struct ring_buffer_event *
14028  ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14029                  unsigned long *lost_events);
14030 @@ -179,6 +184,8 @@
14031                                       int cpu, u64 *ts);
14032  void ring_buffer_set_clock(struct ring_buffer *buffer,
14033                            u64 (*clock)(void));
14034 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14035 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14036  
14037  size_t ring_buffer_page_len(void *page);
14038  
14039 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rtmutex.h linux-4.14/include/linux/rtmutex.h
14040 --- linux-4.14.orig/include/linux/rtmutex.h     2017-11-12 19:46:13.000000000 +0100
14041 +++ linux-4.14/include/linux/rtmutex.h  2018-09-05 11:05:07.000000000 +0200
14042 @@ -14,11 +14,15 @@
14043  #define __LINUX_RT_MUTEX_H
14044  
14045  #include <linux/linkage.h>
14046 +#include <linux/spinlock_types_raw.h>
14047  #include <linux/rbtree.h>
14048 -#include <linux/spinlock_types.h>
14049  
14050  extern int max_lock_depth; /* for sysctl */
14051  
14052 +#ifdef CONFIG_DEBUG_MUTEXES
14053 +#include <linux/debug_locks.h>
14054 +#endif
14055 +
14056  /**
14057   * The rt_mutex structure
14058   *
14059 @@ -31,8 +35,8 @@
14060         raw_spinlock_t          wait_lock;
14061         struct rb_root_cached   waiters;
14062         struct task_struct      *owner;
14063 -#ifdef CONFIG_DEBUG_RT_MUTEXES
14064         int                     save_state;
14065 +#ifdef CONFIG_DEBUG_RT_MUTEXES
14066         const char              *name, *file;
14067         int                     line;
14068         void                    *magic;
14069 @@ -82,16 +86,23 @@
14070  #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14071  #endif
14072  
14073 -#define __RT_MUTEX_INITIALIZER(mutexname) \
14074 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14075 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14076 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14077         , .waiters = RB_ROOT_CACHED \
14078         , .owner = NULL \
14079         __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14080 -       __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14081 +       __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14082 +
14083 +#define __RT_MUTEX_INITIALIZER(mutexname) \
14084 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14085  
14086  #define DEFINE_RT_MUTEX(mutexname) \
14087         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14088  
14089 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14090 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
14091 +               , .save_state = 1 }
14092 +
14093  /**
14094   * rt_mutex_is_locked - is the mutex locked
14095   * @lock: the mutex to be queried
14096 @@ -108,6 +119,7 @@
14097  
14098  extern void rt_mutex_lock(struct rt_mutex *lock);
14099  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14100 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14101  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14102                                struct hrtimer_sleeper *timeout);
14103  
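
Usage sketch (illustrative, hypothetical lock): besides the rearranged initializers, the hunk exports rt_mutex_lock_killable(), which returns non-zero instead of blocking indefinitely when a fatal signal is delivered.

#include <linux/rtmutex.h>
#include <linux/errno.h>

static DEFINE_RT_MUTEX(example_lock);

static int example_do_locked(void)
{
        if (rt_mutex_lock_killable(&example_lock))
                return -EINTR;
        /* critical section; the owner is subject to priority inheritance */
        rt_mutex_unlock(&example_lock);
        return 0;
}
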
14104 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_rt.h linux-4.14/include/linux/rwlock_rt.h
14105 --- linux-4.14.orig/include/linux/rwlock_rt.h   1970-01-01 01:00:00.000000000 +0100
14106 +++ linux-4.14/include/linux/rwlock_rt.h        2018-09-05 11:05:07.000000000 +0200
14107 @@ -0,0 +1,119 @@
14108 +#ifndef __LINUX_RWLOCK_RT_H
14109 +#define __LINUX_RWLOCK_RT_H
14110 +
14111 +#ifndef __LINUX_SPINLOCK_H
14112 +#error Do not include directly. Use spinlock.h
14113 +#endif
14114 +
14115 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14116 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14117 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14118 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14119 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14120 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14121 +extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14122 +extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14123 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14124 +
14125 +#define read_can_lock(rwlock)          rt_read_can_lock(rwlock)
14126 +#define write_can_lock(rwlock)         rt_write_can_lock(rwlock)
14127 +
14128 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
14129 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
14130 +
14131 +static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14132 +{
14133 +       /* XXX ARCH_IRQ_ENABLED */
14134 +       *flags = 0;
14135 +       return rt_write_trylock(lock);
14136 +}
14137 +
14138 +#define write_trylock_irqsave(lock, flags)             \
14139 +       __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14140 +
14141 +#define read_lock_irqsave(lock, flags)                 \
14142 +       do {                                            \
14143 +               typecheck(unsigned long, flags);        \
14144 +               rt_read_lock(lock);                     \
14145 +               flags = 0;                              \
14146 +       } while (0)
14147 +
14148 +#define write_lock_irqsave(lock, flags)                        \
14149 +       do {                                            \
14150 +               typecheck(unsigned long, flags);        \
14151 +               rt_write_lock(lock);                    \
14152 +               flags = 0;                              \
14153 +       } while (0)
14154 +
14155 +#define read_lock(lock)                rt_read_lock(lock)
14156 +
14157 +#define read_lock_bh(lock)                             \
14158 +       do {                                            \
14159 +               local_bh_disable();                     \
14160 +               rt_read_lock(lock);                     \
14161 +       } while (0)
14162 +
14163 +#define read_lock_irq(lock)    read_lock(lock)
14164 +
14165 +#define write_lock(lock)       rt_write_lock(lock)
14166 +
14167 +#define write_lock_bh(lock)                            \
14168 +       do {                                            \
14169 +               local_bh_disable();                     \
14170 +               rt_write_lock(lock);                    \
14171 +       } while (0)
14172 +
14173 +#define write_lock_irq(lock)   write_lock(lock)
14174 +
14175 +#define read_unlock(lock)      rt_read_unlock(lock)
14176 +
14177 +#define read_unlock_bh(lock)                           \
14178 +       do {                                            \
14179 +               rt_read_unlock(lock);                   \
14180 +               local_bh_enable();                      \
14181 +       } while (0)
14182 +
14183 +#define read_unlock_irq(lock)  read_unlock(lock)
14184 +
14185 +#define write_unlock(lock)     rt_write_unlock(lock)
14186 +
14187 +#define write_unlock_bh(lock)                          \
14188 +       do {                                            \
14189 +               rt_write_unlock(lock);                  \
14190 +               local_bh_enable();                      \
14191 +       } while (0)
14192 +
14193 +#define write_unlock_irq(lock) write_unlock(lock)
14194 +
14195 +#define read_unlock_irqrestore(lock, flags)            \
14196 +       do {                                            \
14197 +               typecheck(unsigned long, flags);        \
14198 +               (void) flags;                           \
14199 +               rt_read_unlock(lock);                   \
14200 +       } while (0)
14201 +
14202 +#define write_unlock_irqrestore(lock, flags) \
14203 +       do {                                            \
14204 +               typecheck(unsigned long, flags);        \
14205 +               (void) flags;                           \
14206 +               rt_write_unlock(lock);                  \
14207 +       } while (0)
14208 +
14209 +#define rwlock_init(rwl)                               \
14210 +do {                                                   \
14211 +       static struct lock_class_key __key;             \
14212 +                                                       \
14213 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
14214 +} while (0)
14215 +
14216 +/*
14217 + * Internal functions made global for CPU pinning
14218 + */
14219 +void __read_rt_lock(struct rt_rw_lock *lock);
14220 +int __read_rt_trylock(struct rt_rw_lock *lock);
14221 +void __write_rt_lock(struct rt_rw_lock *lock);
14222 +int __write_rt_trylock(struct rt_rw_lock *lock);
14223 +void __read_rt_unlock(struct rt_rw_lock *lock);
14224 +void __write_rt_unlock(struct rt_rw_lock *lock);
14225 +
14226 +#endif
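
Usage sketch (illustrative, hypothetical lock and data): on PREEMPT_RT_FULL this header maps the ordinary rwlock API onto the sleeping rt_read_lock()/rt_write_lock() primitives, so existing callers compile unchanged; note that the _irqsave variants only set flags to 0 and do not actually disable interrupts on RT.

#include <linux/spinlock.h>     /* pulls in rwlock_rt.h on RT */

static DEFINE_RWLOCK(example_rwlock);
static int example_value;

static int example_rwlock_read(void)
{
        int v;

        read_lock(&example_rwlock);     /* sleeping, reader-biased lock on RT */
        v = example_value;
        read_unlock(&example_rwlock);
        return v;
}

static void example_rwlock_write(int v)
{
        unsigned long flags;

        write_lock_irqsave(&example_rwlock, flags);     /* flags is just 0 on RT */
        example_value = v;
        write_unlock_irqrestore(&example_rwlock, flags);
}
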
14227 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types.h linux-4.14/include/linux/rwlock_types.h
14228 --- linux-4.14.orig/include/linux/rwlock_types.h        2017-11-12 19:46:13.000000000 +0100
14229 +++ linux-4.14/include/linux/rwlock_types.h     2018-09-05 11:05:07.000000000 +0200
14230 @@ -1,6 +1,10 @@
14231  #ifndef __LINUX_RWLOCK_TYPES_H
14232  #define __LINUX_RWLOCK_TYPES_H
14233  
14234 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14235 +# error "Do not include directly, include spinlock_types.h"
14236 +#endif
14237 +
14238  /*
14239   * include/linux/rwlock_types.h - generic rwlock type definitions
14240   *                               and initializers
14241 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwlock_types_rt.h linux-4.14/include/linux/rwlock_types_rt.h
14242 --- linux-4.14.orig/include/linux/rwlock_types_rt.h     1970-01-01 01:00:00.000000000 +0100
14243 +++ linux-4.14/include/linux/rwlock_types_rt.h  2018-09-05 11:05:07.000000000 +0200
14244 @@ -0,0 +1,55 @@
14245 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14246 +#define __LINUX_RWLOCK_TYPES_RT_H
14247 +
14248 +#ifndef __LINUX_SPINLOCK_TYPES_H
14249 +#error "Do not include directly. Include spinlock_types.h instead"
14250 +#endif
14251 +
14252 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14253 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14254 +#else
14255 +# define RW_DEP_MAP_INIT(lockname)
14256 +#endif
14257 +
14258 +typedef struct rt_rw_lock rwlock_t;
14259 +
14260 +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14261 +
14262 +#define DEFINE_RWLOCK(name) \
14263 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
14264 +
14265 +/*
14266 + * A reader biased implementation primarily for CPU pinning.
14267 + *
14268 + * Can be selected as general replacement for the single reader RT rwlock
14269 + * variant
14270 + */
14271 +struct rt_rw_lock {
14272 +       struct rt_mutex         rtmutex;
14273 +       atomic_t                readers;
14274 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14275 +       struct lockdep_map      dep_map;
14276 +#endif
14277 +};
14278 +
14279 +#define READER_BIAS    (1U << 31)
14280 +#define WRITER_BIAS    (1U << 30)
14281 +
14282 +#define __RWLOCK_RT_INITIALIZER(name)                                  \
14283 +{                                                                      \
14284 +       .readers = ATOMIC_INIT(READER_BIAS),                            \
14285 +       .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex),     \
14286 +       RW_DEP_MAP_INIT(name)                                           \
14287 +}
14288 +
14289 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14290 +                            struct lock_class_key *key);
14291 +
14292 +#define rwlock_biased_rt_init(rwlock)                                  \
14293 +       do {                                                            \
14294 +               static struct lock_class_key __key;                     \
14295 +                                                                       \
14296 +               __rwlock_biased_rt_init((rwlock), #rwlock, &__key);     \
14297 +       } while (0)
14298 +
14299 +#endif
14300 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem.h linux-4.14/include/linux/rwsem.h
14301 --- linux-4.14.orig/include/linux/rwsem.h       2018-09-05 11:03:22.000000000 +0200
14302 +++ linux-4.14/include/linux/rwsem.h    2018-09-05 11:05:07.000000000 +0200
14303 @@ -20,6 +20,10 @@
14304  #include <linux/osq_lock.h>
14305  #endif
14306  
14307 +#ifdef CONFIG_PREEMPT_RT_FULL
14308 +#include <linux/rwsem_rt.h>
14309 +#else /* PREEMPT_RT_FULL */
14310 +
14311  struct rw_semaphore;
14312  
14313  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14314 @@ -114,6 +118,13 @@
14315         return !list_empty(&sem->wait_list);
14316  }
14317  
14318 +#endif /* !PREEMPT_RT_FULL */
14319 +
14320 +/*
14321 + * The functions below are the same for all rwsem implementations including
14322 + * the RT specific variant.
14323 + */
14324 +
14325  /*
14326   * lock for reading
14327   */
14328 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/rwsem_rt.h linux-4.14/include/linux/rwsem_rt.h
14329 --- linux-4.14.orig/include/linux/rwsem_rt.h    1970-01-01 01:00:00.000000000 +0100
14330 +++ linux-4.14/include/linux/rwsem_rt.h 2018-09-05 11:05:07.000000000 +0200
14331 @@ -0,0 +1,67 @@
14332 +#ifndef _LINUX_RWSEM_RT_H
14333 +#define _LINUX_RWSEM_RT_H
14334 +
14335 +#ifndef _LINUX_RWSEM_H
14336 +#error "Include rwsem.h"
14337 +#endif
14338 +
14339 +#include <linux/rtmutex.h>
14340 +#include <linux/swait.h>
14341 +
14342 +#define READER_BIAS            (1U << 31)
14343 +#define WRITER_BIAS            (1U << 30)
14344 +
14345 +struct rw_semaphore {
14346 +       atomic_t                readers;
14347 +       struct rt_mutex         rtmutex;
14348 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14349 +       struct lockdep_map      dep_map;
14350 +#endif
14351 +};
14352 +
14353 +#define __RWSEM_INITIALIZER(name)                              \
14354 +{                                                              \
14355 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
14356 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
14357 +       RW_DEP_MAP_INIT(name)                                   \
14358 +}
14359 +
14360 +#define DECLARE_RWSEM(lockname) \
14361 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14362 +
14363 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14364 +                         struct lock_class_key *key);
14365 +
14366 +#define __init_rwsem(sem, name, key)                   \
14367 +do {                                                   \
14368 +               rt_mutex_init(&(sem)->rtmutex);         \
14369 +               __rwsem_init((sem), (name), (key));     \
14370 +} while (0)
14371 +
14372 +#define init_rwsem(sem)                                        \
14373 +do {                                                   \
14374 +       static struct lock_class_key __key;             \
14375 +                                                       \
14376 +       __init_rwsem((sem), #sem, &__key);              \
14377 +} while (0)
14378 +
14379 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
14380 +{
14381 +       return atomic_read(&sem->readers) != READER_BIAS;
14382 +}
14383 +
14384 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14385 +{
14386 +       return atomic_read(&sem->readers) > 0;
14387 +}
14388 +
14389 +extern void __down_read(struct rw_semaphore *sem);
14390 +extern int __down_read_trylock(struct rw_semaphore *sem);
14391 +extern void __down_write(struct rw_semaphore *sem);
14392 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14393 +extern int __down_write_trylock(struct rw_semaphore *sem);
14394 +extern void __up_read(struct rw_semaphore *sem);
14395 +extern void __up_write(struct rw_semaphore *sem);
14396 +extern void __downgrade_write(struct rw_semaphore *sem);
14397 +
14398 +#endif
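
Usage sketch (illustrative, hypothetical semaphore): with CONFIG_PREEMPT_RT_FULL, struct rw_semaphore is reimplemented on top of an rt_mutex plus a reader count, but as the comment added to rwsem.h notes, the public API is shared, so normal callers are unaffected.

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_rwsem2);

static void example_rwsem_reader(void)
{
        down_read(&example_rwsem2);     /* shared; may sleep */
        up_read(&example_rwsem2);
}

static void example_rwsem_writer(void)
{
        down_write(&example_rwsem2);    /* exclusive; backed by the rtmutex on RT */
        up_write(&example_rwsem2);
}
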
14399 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/mm.h linux-4.14/include/linux/sched/mm.h
14400 --- linux-4.14.orig/include/linux/sched/mm.h    2017-11-12 19:46:13.000000000 +0100
14401 +++ linux-4.14/include/linux/sched/mm.h 2018-09-05 11:05:07.000000000 +0200
14402 @@ -43,6 +43,17 @@
14403                 __mmdrop(mm);
14404  }
14405  
14406 +#ifdef CONFIG_PREEMPT_RT_BASE
14407 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14408 +static inline void mmdrop_delayed(struct mm_struct *mm)
14409 +{
14410 +       if (atomic_dec_and_test(&mm->mm_count))
14411 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14412 +}
14413 +#else
14414 +# define mmdrop_delayed(mm)    mmdrop(mm)
14415 +#endif
14416 +
14417  static inline void mmdrop_async_fn(struct work_struct *work)
14418  {
14419         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
14420 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/task.h linux-4.14/include/linux/sched/task.h
14421 --- linux-4.14.orig/include/linux/sched/task.h  2018-09-05 11:03:22.000000000 +0200
14422 +++ linux-4.14/include/linux/sched/task.h       2018-09-05 11:05:07.000000000 +0200
14423 @@ -88,6 +88,15 @@
14424  
14425  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14426  
14427 +#ifdef CONFIG_PREEMPT_RT_BASE
14428 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14429 +
14430 +static inline void put_task_struct(struct task_struct *t)
14431 +{
14432 +       if (atomic_dec_and_test(&t->usage))
14433 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14434 +}
14435 +#else
14436  extern void __put_task_struct(struct task_struct *t);
14437  
14438  static inline void put_task_struct(struct task_struct *t)
14439 @@ -95,7 +104,7 @@
14440         if (atomic_dec_and_test(&t->usage))
14441                 __put_task_struct(t);
14442  }
14443 -
14444 +#endif
14445  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
14446  
14447  #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
14448 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched/wake_q.h linux-4.14/include/linux/sched/wake_q.h
14449 --- linux-4.14.orig/include/linux/sched/wake_q.h        2017-11-12 19:46:13.000000000 +0100
14450 +++ linux-4.14/include/linux/sched/wake_q.h     2018-09-05 11:05:07.000000000 +0200
14451 @@ -47,8 +47,29 @@
14452         head->lastp = &head->first;
14453  }
14454  
14455 -extern void wake_q_add(struct wake_q_head *head,
14456 -                      struct task_struct *task);
14457 -extern void wake_up_q(struct wake_q_head *head);
14458 +extern void __wake_q_add(struct wake_q_head *head,
14459 +                        struct task_struct *task, bool sleeper);
14460 +static inline void wake_q_add(struct wake_q_head *head,
14461 +                             struct task_struct *task)
14462 +{
14463 +       __wake_q_add(head, task, false);
14464 +}
14465 +
14466 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
14467 +                                     struct task_struct *task)
14468 +{
14469 +       __wake_q_add(head, task, true);
14470 +}
14471 +
14472 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14473 +static inline void wake_up_q(struct wake_q_head *head)
14474 +{
14475 +       __wake_up_q(head, false);
14476 +}
14477 +
14478 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14479 +{
14480 +       __wake_up_q(head, true);
14481 +}
14482  
14483  #endif /* _LINUX_SCHED_WAKE_Q_H */
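
Usage sketch (illustrative, hypothetical caller): wake_q_add()/wake_up_q() keep their old behavior as inline wrappers around the new __wake_q_add()/__wake_up_q(); the extra "sleeper" variants exist so RT sleeping locks can queue their wakeups separately. The usual deferred-wakeup pattern stays as shown below.

#include <linux/sched.h>
#include <linux/sched/wake_q.h>

static void example_wake_waiter(struct task_struct *waiter)
{
        DEFINE_WAKE_Q(wake_q);

        /* typically done while still holding a lock: just queue the task */
        wake_q_add(&wake_q, waiter);

        /* after dropping the lock: perform the actual wakeup(s) */
        wake_up_q(&wake_q);
}
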
14484 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/sched.h linux-4.14/include/linux/sched.h
14485 --- linux-4.14.orig/include/linux/sched.h       2018-09-05 11:03:22.000000000 +0200
14486 +++ linux-4.14/include/linux/sched.h    2018-09-05 11:05:07.000000000 +0200
14487 @@ -27,6 +27,7 @@
14488  #include <linux/signal_types.h>
14489  #include <linux/mm_types_task.h>
14490  #include <linux/task_io_accounting.h>
14491 +#include <asm/kmap_types.h>
14492  
14493  /* task_struct member predeclarations (sorted alphabetically): */
14494  struct audit_context;
14495 @@ -93,7 +94,6 @@
14496  
14497  /* Convenience macros for the sake of wake_up(): */
14498  #define TASK_NORMAL                    (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14499 -#define TASK_ALL                       (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
14500  
14501  /* get_task_state(): */
14502  #define TASK_REPORT                    (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14503 @@ -101,12 +101,8 @@
14504                                          __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14505                                          TASK_PARKED)
14506  
14507 -#define task_is_traced(task)           ((task->state & __TASK_TRACED) != 0)
14508 -
14509  #define task_is_stopped(task)          ((task->state & __TASK_STOPPED) != 0)
14510  
14511 -#define task_is_stopped_or_traced(task)        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14512 -
14513  #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14514                                          (task->flags & PF_FROZEN) == 0 && \
14515                                          (task->state & TASK_NOLOAD) == 0)
14516 @@ -134,6 +130,11 @@
14517                 smp_store_mb(current->state, (state_value));    \
14518         } while (0)
14519  
14520 +#define __set_current_state_no_track(state_value)              \
14521 +               current->state = (state_value);
14522 +#define set_current_state_no_track(state_value)                        \
14523 +               smp_store_mb(current->state, (state_value));
14524 +
14525  #define set_special_state(state_value)                                 \
14526         do {                                                            \
14527                 unsigned long flags; /* may shadow */                   \
14528 @@ -187,6 +188,9 @@
14529  #define set_current_state(state_value)                                 \
14530         smp_store_mb(current->state, (state_value))
14531  
14532 +#define __set_current_state_no_track(state_value)      __set_current_state(state_value)
14533 +#define set_current_state_no_track(state_value)                set_current_state(state_value)
14534 +
14535  /*
14536   * set_special_state() should be used for those states when the blocking task
14537   * can not use the regular condition based wait-loop. In that case we must
14538 @@ -566,6 +570,8 @@
14539  #endif
14540         /* -1 unrunnable, 0 runnable, >0 stopped: */
14541         volatile long                   state;
14542 +       /* saved state for "spinlock sleepers" */
14543 +       volatile long                   saved_state;
14544  
14545         /*
14546          * This begins the randomizable portion of task_struct. Only
14547 @@ -618,7 +624,25 @@
14548  
14549         unsigned int                    policy;
14550         int                             nr_cpus_allowed;
14551 -       cpumask_t                       cpus_allowed;
14552 +       const cpumask_t                 *cpus_ptr;
14553 +       cpumask_t                       cpus_mask;
14554 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
14555 +       int                             migrate_disable;
14556 +       int                             migrate_disable_update;
14557 +       int                             pinned_on_cpu;
14558 +# ifdef CONFIG_SCHED_DEBUG
14559 +       int                             migrate_disable_atomic;
14560 +# endif
14561 +
14562 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14563 +       int                             migrate_disable;
14564 +# ifdef CONFIG_SCHED_DEBUG
14565 +       int                             migrate_disable_atomic;
14566 +# endif
14567 +#endif
14568 +#ifdef CONFIG_PREEMPT_RT_FULL
14569 +       int                             sleeping_lock;
14570 +#endif
14571  
14572  #ifdef CONFIG_PREEMPT_RCU
14573         int                             rcu_read_lock_nesting;
14574 @@ -777,6 +801,9 @@
14575  #ifdef CONFIG_POSIX_TIMERS
14576         struct task_cputime             cputime_expires;
14577         struct list_head                cpu_timers[3];
14578 +#ifdef CONFIG_PREEMPT_RT_BASE
14579 +       struct task_struct              *posix_timer_list;
14580 +#endif
14581  #endif
14582  
14583         /* Process credentials: */
14584 @@ -820,11 +847,17 @@
14585         /* Signal handlers: */
14586         struct signal_struct            *signal;
14587         struct sighand_struct           *sighand;
14588 +       struct sigqueue                 *sigqueue_cache;
14589 +
14590         sigset_t                        blocked;
14591         sigset_t                        real_blocked;
14592         /* Restored if set_restore_sigmask() was used: */
14593         sigset_t                        saved_sigmask;
14594         struct sigpending               pending;
14595 +#ifdef CONFIG_PREEMPT_RT_FULL
14596 +       /* TODO: move me into ->restart_block ? */
14597 +       struct                          siginfo forced_info;
14598 +#endif
14599         unsigned long                   sas_ss_sp;
14600         size_t                          sas_ss_size;
14601         unsigned int                    sas_ss_flags;
14602 @@ -849,6 +882,7 @@
14603         raw_spinlock_t                  pi_lock;
14604  
14605         struct wake_q_node              wake_q;
14606 +       struct wake_q_node              wake_q_sleeper;
14607  
14608  #ifdef CONFIG_RT_MUTEXES
14609         /* PI waiters blocked on a rt_mutex held by this task: */
14610 @@ -1116,9 +1150,23 @@
14611         unsigned int                    sequential_io;
14612         unsigned int                    sequential_io_avg;
14613  #endif
14614 +#ifdef CONFIG_PREEMPT_RT_BASE
14615 +       struct rcu_head                 put_rcu;
14616 +       int                             softirq_nestcnt;
14617 +       unsigned int                    softirqs_raised;
14618 +#endif
14619 +#ifdef CONFIG_PREEMPT_RT_FULL
14620 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14621 +       int                             kmap_idx;
14622 +       pte_t                           kmap_pte[KM_TYPE_NR];
14623 +# endif
14624 +#endif
14625  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14626         unsigned long                   task_state_change;
14627  #endif
14628 +#ifdef CONFIG_PREEMPT_RT_FULL
14629 +       int                             xmit_recursion;
14630 +#endif
14631         int                             pagefault_disabled;
14632  #ifdef CONFIG_MMU
14633         struct task_struct              *oom_reaper_list;
14634 @@ -1332,6 +1380,7 @@
14635  /*
14636   * Per process flags
14637   */
14638 +#define PF_IN_SOFTIRQ          0x00000001      /* Task is serving softirq */
14639  #define PF_IDLE                        0x00000002      /* I am an IDLE thread */
14640  #define PF_EXITING             0x00000004      /* Getting shut down */
14641  #define PF_EXITPIDONE          0x00000008      /* PI exit done on shut down */
14642 @@ -1355,7 +1404,7 @@
14643  #define PF_KTHREAD             0x00200000      /* I am a kernel thread */
14644  #define PF_RANDOMIZE           0x00400000      /* Randomize virtual address space */
14645  #define PF_SWAPWRITE           0x00800000      /* Allowed to write to swap */
14646 -#define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_allowed */
14647 +#define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_mask */
14648  #define PF_MCE_EARLY           0x08000000      /* Early kill for mce process policy */
14649  #define PF_MUTEX_TESTER                0x20000000      /* Thread belongs to the rt mutex tester */
14650  #define PF_FREEZER_SKIP                0x40000000      /* Freezer should not count it as freezable */
14651 @@ -1535,6 +1584,7 @@
14652  
14653  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14654  extern int wake_up_process(struct task_struct *tsk);
14655 +extern int wake_up_lock_sleeper(struct task_struct *tsk);
14656  extern void wake_up_new_task(struct task_struct *tsk);
14657  
14658  #ifdef CONFIG_SMP
14659 @@ -1611,6 +1661,89 @@
14660         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14661  }
14662  
14663 +#ifdef CONFIG_PREEMPT_LAZY
14664 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14665 +{
14666 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14667 +}
14668 +
14669 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14670 +{
14671 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14672 +}
14673 +
14674 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14675 +{
14676 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14677 +}
14678 +
14679 +static inline int need_resched_lazy(void)
14680 +{
14681 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14682 +}
14683 +
14684 +static inline int need_resched_now(void)
14685 +{
14686 +       return test_thread_flag(TIF_NEED_RESCHED);
14687 +}
14688 +
14689 +#else
14690 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14691 +static inline int need_resched_lazy(void) { return 0; }
14692 +
14693 +static inline int need_resched_now(void)
14694 +{
14695 +       return test_thread_flag(TIF_NEED_RESCHED);
14696 +}
14697 +
14698 +#endif
14699 +
14700 +
14701 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14702 +{
14703 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14704 +               return true;
14705 +#ifdef CONFIG_PREEMPT_RT_FULL
14706 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14707 +               return true;
14708 +#endif
14709 +       return false;
14710 +}
14711 +
14712 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14713 +{
14714 +       bool traced_stopped;
14715 +
14716 +#ifdef CONFIG_PREEMPT_RT_FULL
14717 +       unsigned long flags;
14718 +
14719 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
14720 +       traced_stopped = __task_is_stopped_or_traced(task);
14721 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14722 +#else
14723 +       traced_stopped = __task_is_stopped_or_traced(task);
14724 +#endif
14725 +       return traced_stopped;
14726 +}
14727 +
14728 +static inline bool task_is_traced(struct task_struct *task)
14729 +{
14730 +       bool traced = false;
14731 +
14732 +       if (task->state & __TASK_TRACED)
14733 +               return true;
14734 +#ifdef CONFIG_PREEMPT_RT_FULL
14735 +       /* in case the task is sleeping on tasklist_lock */
14736 +       raw_spin_lock_irq(&task->pi_lock);
14737 +       if (task->state & __TASK_TRACED)
14738 +               traced = true;
14739 +       else if (task->saved_state & __TASK_TRACED)
14740 +               traced = true;
14741 +       raw_spin_unlock_irq(&task->pi_lock);
14742 +#endif
14743 +       return traced;
14744 +}
14745 +
14746  /*
14747   * cond_resched() and cond_resched_lock(): latency reduction via
14748   * explicit rescheduling in places that are safe. The return
14749 @@ -1636,12 +1769,16 @@
14750         __cond_resched_lock(lock);                              \
14751  })
14752  
14753 +#ifndef CONFIG_PREEMPT_RT_FULL
14754  extern int __cond_resched_softirq(void);
14755  
14756  #define cond_resched_softirq() ({                                      \
14757         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
14758         __cond_resched_softirq();                                       \
14759  })
14760 +#else
14761 +# define cond_resched_softirq()                cond_resched()
14762 +#endif
14763  
14764  static inline void cond_resched_rcu(void)
14765  {
14766 @@ -1671,6 +1808,23 @@
14767         return unlikely(tif_need_resched());
14768  }
14769  
14770 +#ifdef CONFIG_PREEMPT_RT_FULL
14771 +static inline void sleeping_lock_inc(void)
14772 +{
14773 +       current->sleeping_lock++;
14774 +}
14775 +
14776 +static inline void sleeping_lock_dec(void)
14777 +{
14778 +       current->sleeping_lock--;
14779 +}
14780 +
14781 +#else
14782 +
14783 +static inline void sleeping_lock_inc(void) { }
14784 +static inline void sleeping_lock_dec(void) { }
14785 +#endif
14786 +
14787  /*
14788   * Wrappers for p->thread_info->cpu access. No-op on UP.
14789   */
14790 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/seqlock.h linux-4.14/include/linux/seqlock.h
14791 --- linux-4.14.orig/include/linux/seqlock.h     2017-11-12 19:46:13.000000000 +0100
14792 +++ linux-4.14/include/linux/seqlock.h  2018-09-05 11:05:07.000000000 +0200
14793 @@ -221,20 +221,30 @@
14794         return __read_seqcount_retry(s, start);
14795  }
14796  
14797 -
14798 -
14799 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14800 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14801  {
14802         s->sequence++;
14803         smp_wmb();
14804  }
14805  
14806 -static inline void raw_write_seqcount_end(seqcount_t *s)
14807 +static inline void raw_write_seqcount_begin(seqcount_t *s)
14808 +{
14809 +       preempt_disable_rt();
14810 +       __raw_write_seqcount_begin(s);
14811 +}
14812 +
14813 +static inline void __raw_write_seqcount_end(seqcount_t *s)
14814  {
14815         smp_wmb();
14816         s->sequence++;
14817  }
14818  
14819 +static inline void raw_write_seqcount_end(seqcount_t *s)
14820 +{
14821 +       __raw_write_seqcount_end(s);
14822 +       preempt_enable_rt();
14823 +}
14824 +
14825  /**
14826   * raw_write_seqcount_barrier - do a seq write barrier
14827   * @s: pointer to seqcount_t
14828 @@ -429,10 +439,32 @@
14829  /*
14830   * Read side functions for starting and finalizing a read side section.
14831   */
14832 +#ifndef CONFIG_PREEMPT_RT_FULL
14833  static inline unsigned read_seqbegin(const seqlock_t *sl)
14834  {
14835         return read_seqcount_begin(&sl->seqcount);
14836  }
14837 +#else
14838 +/*
14839 + * Starvation safe read side for RT
14840 + */
14841 +static inline unsigned read_seqbegin(seqlock_t *sl)
14842 +{
14843 +       unsigned ret;
14844 +
14845 +repeat:
14846 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
14847 +       if (unlikely(ret & 1)) {
14848 +               /*
14849 +                * Take the lock and let the writer proceed (i.e. evtl
14850 +                * boost it), otherwise we could loop here forever.
14851 +                */
14852 +               spin_unlock_wait(&sl->lock);
14853 +               goto repeat;
14854 +       }
14855 +       return ret;
14856 +}
14857 +#endif
14858  
14859  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14860  {
14861 @@ -447,36 +479,45 @@
14862  static inline void write_seqlock(seqlock_t *sl)
14863  {
14864         spin_lock(&sl->lock);
14865 -       write_seqcount_begin(&sl->seqcount);
14866 +       __raw_write_seqcount_begin(&sl->seqcount);
14867 +}
14868 +
14869 +static inline int try_write_seqlock(seqlock_t *sl)
14870 +{
14871 +       if (spin_trylock(&sl->lock)) {
14872 +               __raw_write_seqcount_begin(&sl->seqcount);
14873 +               return 1;
14874 +       }
14875 +       return 0;
14876  }
14877  
14878  static inline void write_sequnlock(seqlock_t *sl)
14879  {
14880 -       write_seqcount_end(&sl->seqcount);
14881 +       __raw_write_seqcount_end(&sl->seqcount);
14882         spin_unlock(&sl->lock);
14883  }
14884  
14885  static inline void write_seqlock_bh(seqlock_t *sl)
14886  {
14887         spin_lock_bh(&sl->lock);
14888 -       write_seqcount_begin(&sl->seqcount);
14889 +       __raw_write_seqcount_begin(&sl->seqcount);
14890  }
14891  
14892  static inline void write_sequnlock_bh(seqlock_t *sl)
14893  {
14894 -       write_seqcount_end(&sl->seqcount);
14895 +       __raw_write_seqcount_end(&sl->seqcount);
14896         spin_unlock_bh(&sl->lock);
14897  }
14898  
14899  static inline void write_seqlock_irq(seqlock_t *sl)
14900  {
14901         spin_lock_irq(&sl->lock);
14902 -       write_seqcount_begin(&sl->seqcount);
14903 +       __raw_write_seqcount_begin(&sl->seqcount);
14904  }
14905  
14906  static inline void write_sequnlock_irq(seqlock_t *sl)
14907  {
14908 -       write_seqcount_end(&sl->seqcount);
14909 +       __raw_write_seqcount_end(&sl->seqcount);
14910         spin_unlock_irq(&sl->lock);
14911  }
14912  
14913 @@ -485,7 +526,7 @@
14914         unsigned long flags;
14915  
14916         spin_lock_irqsave(&sl->lock, flags);
14917 -       write_seqcount_begin(&sl->seqcount);
14918 +       __raw_write_seqcount_begin(&sl->seqcount);
14919         return flags;
14920  }
14921  
14922 @@ -495,7 +536,7 @@
14923  static inline void
14924  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
14925  {
14926 -       write_seqcount_end(&sl->seqcount);
14927 +       __raw_write_seqcount_end(&sl->seqcount);
14928         spin_unlock_irqrestore(&sl->lock, flags);
14929  }
14930  
14931 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/signal.h linux-4.14/include/linux/signal.h
14932 --- linux-4.14.orig/include/linux/signal.h      2017-11-12 19:46:13.000000000 +0100
14933 +++ linux-4.14/include/linux/signal.h   2018-09-05 11:05:07.000000000 +0200
14934 @@ -243,6 +243,7 @@
14935  }
14936  
14937  extern void flush_sigqueue(struct sigpending *queue);
14938 +extern void flush_task_sigqueue(struct task_struct *tsk);
14939  
14940  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14941  static inline int valid_signal(unsigned long sig)
14942 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/skbuff.h linux-4.14/include/linux/skbuff.h
14943 --- linux-4.14.orig/include/linux/skbuff.h      2018-09-05 11:03:22.000000000 +0200
14944 +++ linux-4.14/include/linux/skbuff.h   2018-09-05 11:05:07.000000000 +0200
14945 @@ -287,6 +287,7 @@
14946  
14947         __u32           qlen;
14948         spinlock_t      lock;
14949 +       raw_spinlock_t  raw_lock;
14950  };
14951  
14952  struct sk_buff;
14953 @@ -1667,6 +1668,12 @@
14954         __skb_queue_head_init(list);
14955  }
14956  
14957 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14958 +{
14959 +       raw_spin_lock_init(&list->raw_lock);
14960 +       __skb_queue_head_init(list);
14961 +}
14962 +
14963  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14964                 struct lock_class_key *class)
14965  {
14966 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/smp.h linux-4.14/include/linux/smp.h
14967 --- linux-4.14.orig/include/linux/smp.h 2017-11-12 19:46:13.000000000 +0100
14968 +++ linux-4.14/include/linux/smp.h      2018-09-05 11:05:07.000000000 +0200
14969 @@ -202,6 +202,9 @@
14970  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
14971  #define put_cpu()              preempt_enable()
14972  
14973 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
14974 +#define put_cpu_light()                migrate_enable()
14975 +
14976  /*
14977   * Callback to arch code if there's nosmp or maxcpus=0 on the
14978   * boot command line:
14979 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_api_smp.h linux-4.14/include/linux/spinlock_api_smp.h
14980 --- linux-4.14.orig/include/linux/spinlock_api_smp.h    2017-11-12 19:46:13.000000000 +0100
14981 +++ linux-4.14/include/linux/spinlock_api_smp.h 2018-09-05 11:05:07.000000000 +0200
14982 @@ -187,6 +187,8 @@
14983         return 0;
14984  }
14985  
14986 -#include <linux/rwlock_api_smp.h>
14987 +#ifndef CONFIG_PREEMPT_RT_FULL
14988 +# include <linux/rwlock_api_smp.h>
14989 +#endif
14990  
14991  #endif /* __LINUX_SPINLOCK_API_SMP_H */
14992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock.h linux-4.14/include/linux/spinlock.h
14993 --- linux-4.14.orig/include/linux/spinlock.h    2017-11-12 19:46:13.000000000 +0100
14994 +++ linux-4.14/include/linux/spinlock.h 2018-09-05 11:05:07.000000000 +0200
14995 @@ -286,7 +286,11 @@
14996  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
14997  
14998  /* Include rwlock functions */
14999 -#include <linux/rwlock.h>
15000 +#ifdef CONFIG_PREEMPT_RT_FULL
15001 +# include <linux/rwlock_rt.h>
15002 +#else
15003 +# include <linux/rwlock.h>
15004 +#endif
15005  
15006  /*
15007   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15008 @@ -297,6 +301,10 @@
15009  # include <linux/spinlock_api_up.h>
15010  #endif
15011  
15012 +#ifdef CONFIG_PREEMPT_RT_FULL
15013 +# include <linux/spinlock_rt.h>
15014 +#else /* PREEMPT_RT_FULL */
15015 +
15016  /*
15017   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15018   */
15019 @@ -421,4 +429,6 @@
15020  #define atomic_dec_and_lock(atomic, lock) \
15021                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15022  
15023 +#endif /* !PREEMPT_RT_FULL */
15024 +
15025  #endif /* __LINUX_SPINLOCK_H */
15026 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_rt.h linux-4.14/include/linux/spinlock_rt.h
15027 --- linux-4.14.orig/include/linux/spinlock_rt.h 1970-01-01 01:00:00.000000000 +0100
15028 +++ linux-4.14/include/linux/spinlock_rt.h      2018-09-05 11:05:07.000000000 +0200
15029 @@ -0,0 +1,159 @@
15030 +#ifndef __LINUX_SPINLOCK_RT_H
15031 +#define __LINUX_SPINLOCK_RT_H
15032 +
15033 +#ifndef __LINUX_SPINLOCK_H
15034 +#error Do not include directly. Use spinlock.h
15035 +#endif
15036 +
15037 +#include <linux/bug.h>
15038 +
15039 +extern void
15040 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15041 +
15042 +#define spin_lock_init(slock)                          \
15043 +do {                                                   \
15044 +       static struct lock_class_key __key;             \
15045 +                                                       \
15046 +       rt_mutex_init(&(slock)->lock);                  \
15047 +       __rt_spin_lock_init(slock, #slock, &__key);     \
15048 +} while (0)
15049 +
15050 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15051 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15052 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15053 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15054 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15055 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15056 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15057 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15058 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15059 +
15060 +/*
15061 + * lockdep-less calls, for derived types like rwlock:
15062 + * (for trylock they can use rt_mutex_trylock() directly.
15063 + * Migrate disable handling must be done at the call site.
15064 + */
15065 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15066 +extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15067 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15068 +
15069 +#define spin_lock(lock)                        rt_spin_lock(lock)
15070 +
15071 +#define spin_lock_bh(lock)                     \
15072 +       do {                                    \
15073 +               local_bh_disable();             \
15074 +               rt_spin_lock(lock);             \
15075 +       } while (0)
15076 +
15077 +#define spin_lock_irq(lock)            spin_lock(lock)
15078 +
15079 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
15080 +
15081 +#define spin_trylock(lock)                     \
15082 +({                                             \
15083 +       int __locked;                           \
15084 +       __locked = spin_do_trylock(lock);       \
15085 +       __locked;                               \
15086 +})
15087 +
15088 +#ifdef CONFIG_LOCKDEP
15089 +# define spin_lock_nested(lock, subclass)              \
15090 +       do {                                            \
15091 +               rt_spin_lock_nested(lock, subclass);    \
15092 +       } while (0)
15093 +
15094 +#define spin_lock_bh_nested(lock, subclass)            \
15095 +       do {                                            \
15096 +               local_bh_disable();                     \
15097 +               rt_spin_lock_nested(lock, subclass);    \
15098 +       } while (0)
15099 +
15100 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15101 +       do {                                             \
15102 +               typecheck(unsigned long, flags);         \
15103 +               flags = 0;                               \
15104 +               rt_spin_lock_nested(lock, subclass);     \
15105 +       } while (0)
15106 +#else
15107 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
15108 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
15109 +
15110 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15111 +       do {                                             \
15112 +               typecheck(unsigned long, flags);         \
15113 +               flags = 0;                               \
15114 +               spin_lock(lock);                         \
15115 +       } while (0)
15116 +#endif
15117 +
15118 +#define spin_lock_irqsave(lock, flags)                  \
15119 +       do {                                             \
15120 +               typecheck(unsigned long, flags);         \
15121 +               flags = 0;                               \
15122 +               spin_lock(lock);                         \
15123 +       } while (0)
15124 +
15125 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15126 +{
15127 +       unsigned long flags = 0;
15128 +#ifdef CONFIG_TRACE_IRQFLAGS
15129 +       flags = rt_spin_lock_trace_flags(lock);
15130 +#else
15131 +       spin_lock(lock); /* lock_local */
15132 +#endif
15133 +       return flags;
15134 +}
15135 +
15136 +/* FIXME: we need rt_spin_lock_nest_lock */
15137 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15138 +
15139 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15140 +
15141 +#define spin_unlock_bh(lock)                           \
15142 +       do {                                            \
15143 +               rt_spin_unlock(lock);                   \
15144 +               local_bh_enable();                      \
15145 +       } while (0)
15146 +
15147 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15148 +
15149 +#define spin_unlock_irqrestore(lock, flags)            \
15150 +       do {                                            \
15151 +               typecheck(unsigned long, flags);        \
15152 +               (void) flags;                           \
15153 +               spin_unlock(lock);                      \
15154 +       } while (0)
15155 +
15156 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15157 +#define spin_trylock_irq(lock) spin_trylock(lock)
15158 +
15159 +#define spin_trylock_irqsave(lock, flags)      \
15160 +       rt_spin_trylock_irqsave(lock, &(flags))
15161 +
15162 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15163 +
15164 +#ifdef CONFIG_GENERIC_LOCKBREAK
15165 +# define spin_is_contended(lock)       ((lock)->break_lock)
15166 +#else
15167 +# define spin_is_contended(lock)       (((void)(lock), 0))
15168 +#endif
15169 +
15170 +static inline int spin_can_lock(spinlock_t *lock)
15171 +{
15172 +       return !rt_mutex_is_locked(&lock->lock);
15173 +}
15174 +
15175 +static inline int spin_is_locked(spinlock_t *lock)
15176 +{
15177 +       return rt_mutex_is_locked(&lock->lock);
15178 +}
15179 +
15180 +static inline void assert_spin_locked(spinlock_t *lock)
15181 +{
15182 +       BUG_ON(!spin_is_locked(lock));
15183 +}
15184 +
15185 +#define atomic_dec_and_lock(atomic, lock) \
15186 +       atomic_dec_and_spin_lock(atomic, lock)
15187 +
15188 +#endif
15189 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types.h linux-4.14/include/linux/spinlock_types.h
15190 --- linux-4.14.orig/include/linux/spinlock_types.h      2017-11-12 19:46:13.000000000 +0100
15191 +++ linux-4.14/include/linux/spinlock_types.h   2018-09-05 11:05:07.000000000 +0200
15192 @@ -9,80 +9,15 @@
15193   * Released under the General Public License (GPL).
15194   */
15195  
15196 -#if defined(CONFIG_SMP)
15197 -# include <asm/spinlock_types.h>
15198 -#else
15199 -# include <linux/spinlock_types_up.h>
15200 -#endif
15201 -
15202 -#include <linux/lockdep.h>
15203 -
15204 -typedef struct raw_spinlock {
15205 -       arch_spinlock_t raw_lock;
15206 -#ifdef CONFIG_GENERIC_LOCKBREAK
15207 -       unsigned int break_lock;
15208 -#endif
15209 -#ifdef CONFIG_DEBUG_SPINLOCK
15210 -       unsigned int magic, owner_cpu;
15211 -       void *owner;
15212 -#endif
15213 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15214 -       struct lockdep_map dep_map;
15215 -#endif
15216 -} raw_spinlock_t;
15217 -
15218 -#define SPINLOCK_MAGIC         0xdead4ead
15219 -
15220 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15221 -
15222 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15223 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15224 -#else
15225 -# define SPIN_DEP_MAP_INIT(lockname)
15226 -#endif
15227 +#include <linux/spinlock_types_raw.h>
15228  
15229 -#ifdef CONFIG_DEBUG_SPINLOCK
15230 -# define SPIN_DEBUG_INIT(lockname)             \
15231 -       .magic = SPINLOCK_MAGIC,                \
15232 -       .owner_cpu = -1,                        \
15233 -       .owner = SPINLOCK_OWNER_INIT,
15234 +#ifndef CONFIG_PREEMPT_RT_FULL
15235 +# include <linux/spinlock_types_nort.h>
15236 +# include <linux/rwlock_types.h>
15237  #else
15238 -# define SPIN_DEBUG_INIT(lockname)
15239 -#endif
15240 -
15241 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15242 -       {                                       \
15243 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15244 -       SPIN_DEBUG_INIT(lockname)               \
15245 -       SPIN_DEP_MAP_INIT(lockname) }
15246 -
15247 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15248 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15249 -
15250 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15251 -
15252 -typedef struct spinlock {
15253 -       union {
15254 -               struct raw_spinlock rlock;
15255 -
15256 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15257 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15258 -               struct {
15259 -                       u8 __padding[LOCK_PADSIZE];
15260 -                       struct lockdep_map dep_map;
15261 -               };
15262 +# include <linux/rtmutex.h>
15263 +# include <linux/spinlock_types_rt.h>
15264 +# include <linux/rwlock_types_rt.h>
15265  #endif
15266 -       };
15267 -} spinlock_t;
15268 -
15269 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15270 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15271 -
15272 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15273 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15274 -
15275 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15276 -
15277 -#include <linux/rwlock_types.h>
15278  
15279  #endif /* __LINUX_SPINLOCK_TYPES_H */
15280 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_nort.h linux-4.14/include/linux/spinlock_types_nort.h
15281 --- linux-4.14.orig/include/linux/spinlock_types_nort.h 1970-01-01 01:00:00.000000000 +0100
15282 +++ linux-4.14/include/linux/spinlock_types_nort.h      2018-09-05 11:05:07.000000000 +0200
15283 @@ -0,0 +1,33 @@
15284 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15285 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15286 +
15287 +#ifndef __LINUX_SPINLOCK_TYPES_H
15288 +#error "Do not include directly. Include spinlock_types.h instead"
15289 +#endif
15290 +
15291 +/*
15292 + * The non RT version maps spinlocks to raw_spinlocks
15293 + */
15294 +typedef struct spinlock {
15295 +       union {
15296 +               struct raw_spinlock rlock;
15297 +
15298 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15299 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15300 +               struct {
15301 +                       u8 __padding[LOCK_PADSIZE];
15302 +                       struct lockdep_map dep_map;
15303 +               };
15304 +#endif
15305 +       };
15306 +} spinlock_t;
15307 +
15308 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15309 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15310 +
15311 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15312 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15313 +
15314 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15315 +
15316 +#endif
15317 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_raw.h linux-4.14/include/linux/spinlock_types_raw.h
15318 --- linux-4.14.orig/include/linux/spinlock_types_raw.h  1970-01-01 01:00:00.000000000 +0100
15319 +++ linux-4.14/include/linux/spinlock_types_raw.h       2018-09-05 11:05:07.000000000 +0200
15320 @@ -0,0 +1,58 @@
15321 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15322 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15323 +
15324 +#include <linux/types.h>
15325 +
15326 +#if defined(CONFIG_SMP)
15327 +# include <asm/spinlock_types.h>
15328 +#else
15329 +# include <linux/spinlock_types_up.h>
15330 +#endif
15331 +
15332 +#include <linux/lockdep.h>
15333 +
15334 +typedef struct raw_spinlock {
15335 +       arch_spinlock_t raw_lock;
15336 +#ifdef CONFIG_GENERIC_LOCKBREAK
15337 +       unsigned int break_lock;
15338 +#endif
15339 +#ifdef CONFIG_DEBUG_SPINLOCK
15340 +       unsigned int magic, owner_cpu;
15341 +       void *owner;
15342 +#endif
15343 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15344 +       struct lockdep_map dep_map;
15345 +#endif
15346 +} raw_spinlock_t;
15347 +
15348 +#define SPINLOCK_MAGIC         0xdead4ead
15349 +
15350 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15351 +
15352 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15353 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15354 +#else
15355 +# define SPIN_DEP_MAP_INIT(lockname)
15356 +#endif
15357 +
15358 +#ifdef CONFIG_DEBUG_SPINLOCK
15359 +# define SPIN_DEBUG_INIT(lockname)             \
15360 +       .magic = SPINLOCK_MAGIC,                \
15361 +       .owner_cpu = -1,                        \
15362 +       .owner = SPINLOCK_OWNER_INIT,
15363 +#else
15364 +# define SPIN_DEBUG_INIT(lockname)
15365 +#endif
15366 +
15367 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15368 +       {                                       \
15369 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15370 +       SPIN_DEBUG_INIT(lockname)               \
15371 +       SPIN_DEP_MAP_INIT(lockname) }
15372 +
15373 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15374 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15375 +
15376 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15377 +
15378 +#endif
15379 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_rt.h linux-4.14/include/linux/spinlock_types_rt.h
15380 --- linux-4.14.orig/include/linux/spinlock_types_rt.h   1970-01-01 01:00:00.000000000 +0100
15381 +++ linux-4.14/include/linux/spinlock_types_rt.h        2018-09-05 11:05:07.000000000 +0200
15382 @@ -0,0 +1,48 @@
15383 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15384 +#define __LINUX_SPINLOCK_TYPES_RT_H
15385 +
15386 +#ifndef __LINUX_SPINLOCK_TYPES_H
15387 +#error "Do not include directly. Include spinlock_types.h instead"
15388 +#endif
15389 +
15390 +#include <linux/cache.h>
15391 +
15392 +/*
15393 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15394 + */
15395 +typedef struct spinlock {
15396 +       struct rt_mutex         lock;
15397 +       unsigned int            break_lock;
15398 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15399 +       struct lockdep_map      dep_map;
15400 +#endif
15401 +} spinlock_t;
15402 +
15403 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15404 +# define __RT_SPIN_INITIALIZER(name) \
15405 +       { \
15406 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15407 +       .save_state = 1, \
15408 +       .file = __FILE__, \
15409 +       .line = __LINE__ , \
15410 +       }
15411 +#else
15412 +# define __RT_SPIN_INITIALIZER(name) \
15413 +       {                                                               \
15414 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15415 +       .save_state = 1, \
15416 +       }
15417 +#endif
15418 +
15419 +/*
15420 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15421 +*/
15422 +
15423 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15424 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15425 +         SPIN_DEP_MAP_INIT(name) }
15426 +
15427 +#define DEFINE_SPINLOCK(name) \
15428 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15429 +
15430 +#endif
15431 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/spinlock_types_up.h linux-4.14/include/linux/spinlock_types_up.h
15432 --- linux-4.14.orig/include/linux/spinlock_types_up.h   2017-11-12 19:46:13.000000000 +0100
15433 +++ linux-4.14/include/linux/spinlock_types_up.h        2018-09-05 11:05:07.000000000 +0200
15434 @@ -1,10 +1,6 @@
15435  #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15436  #define __LINUX_SPINLOCK_TYPES_UP_H
15437  
15438 -#ifndef __LINUX_SPINLOCK_TYPES_H
15439 -# error "please don't include this file directly"
15440 -#endif
15441 -
15442  /*
15443   * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15444   *
15445 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutiny.h linux-4.14/include/linux/srcutiny.h
15446 --- linux-4.14.orig/include/linux/srcutiny.h    2017-11-12 19:46:13.000000000 +0100
15447 +++ linux-4.14/include/linux/srcutiny.h 2018-09-05 11:05:07.000000000 +0200
15448 @@ -43,7 +43,7 @@
15449  
15450  void srcu_drive_gp(struct work_struct *wp);
15451  
15452 -#define __SRCU_STRUCT_INIT(name)                                       \
15453 +#define __SRCU_STRUCT_INIT(name, __ignored)                            \
15454  {                                                                      \
15455         .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq),        \
15456         .srcu_cb_tail = &name.srcu_cb_head,                             \
15457 @@ -56,9 +56,9 @@
15458   * Tree SRCU, which needs some per-CPU data.
15459   */
15460  #define DEFINE_SRCU(name) \
15461 -       struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15462 +       struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15463  #define DEFINE_STATIC_SRCU(name) \
15464 -       static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15465 +       static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15466  
15467  void synchronize_srcu(struct srcu_struct *sp);
15468  
15469 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/srcutree.h linux-4.14/include/linux/srcutree.h
15470 --- linux-4.14.orig/include/linux/srcutree.h    2017-11-12 19:46:13.000000000 +0100
15471 +++ linux-4.14/include/linux/srcutree.h 2018-09-05 11:05:07.000000000 +0200
15472 @@ -40,7 +40,7 @@
15473         unsigned long srcu_unlock_count[2];     /* Unlocks per CPU. */
15474  
15475         /* Update-side state. */
15476 -       raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15477 +       spinlock_t __private lock ____cacheline_internodealigned_in_smp;
15478         struct rcu_segcblist srcu_cblist;       /* List of callbacks.*/
15479         unsigned long srcu_gp_seq_needed;       /* Furthest future GP needed. */
15480         unsigned long srcu_gp_seq_needed_exp;   /* Furthest future exp GP. */
15481 @@ -58,7 +58,7 @@
15482   * Node in SRCU combining tree, similar in function to rcu_data.
15483   */
15484  struct srcu_node {
15485 -       raw_spinlock_t __private lock;
15486 +       spinlock_t __private lock;
15487         unsigned long srcu_have_cbs[4];         /* GP seq for children */
15488                                                 /*  having CBs, but only */
15489                                                 /*  is > ->srcu_gq_seq. */
15490 @@ -78,7 +78,7 @@
15491         struct srcu_node *level[RCU_NUM_LVLS + 1];
15492                                                 /* First node at each level. */
15493         struct mutex srcu_cb_mutex;             /* Serialize CB preparation. */
15494 -       raw_spinlock_t __private lock;          /* Protect counters */
15495 +       spinlock_t __private lock;              /* Protect counters */
15496         struct mutex srcu_gp_mutex;             /* Serialize GP work. */
15497         unsigned int srcu_idx;                  /* Current rdr array element. */
15498         unsigned long srcu_gp_seq;              /* Grace-period seq #. */
15499 @@ -104,10 +104,10 @@
15500  #define SRCU_STATE_SCAN1       1
15501  #define SRCU_STATE_SCAN2       2
15502  
15503 -#define __SRCU_STRUCT_INIT(name)                                       \
15504 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15505         {                                                               \
15506 -               .sda = &name##_srcu_data,                               \
15507 -               .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),            \
15508 +               .sda = &pcpu_name,                                      \
15509 +               .lock = __SPIN_LOCK_UNLOCKED(name.lock),                \
15510                 .srcu_gp_seq_needed = 0 - 1,                            \
15511                 __SRCU_DEP_MAP_INIT(name)                               \
15512         }
15513 @@ -133,7 +133,7 @@
15514   */
15515  #define __DEFINE_SRCU(name, is_static)                                 \
15516         static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
15517 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15518 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
15519  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15520  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
15521  
15522 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/suspend.h linux-4.14/include/linux/suspend.h
15523 --- linux-4.14.orig/include/linux/suspend.h     2018-09-05 11:03:22.000000000 +0200
15524 +++ linux-4.14/include/linux/suspend.h  2018-09-05 11:05:07.000000000 +0200
15525 @@ -196,6 +196,12 @@
15526         void (*end)(void);
15527  };
15528  
15529 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15530 +extern bool pm_in_action;
15531 +#else
15532 +# define pm_in_action false
15533 +#endif
15534 +
15535  #ifdef CONFIG_SUSPEND
15536  extern suspend_state_t mem_sleep_current;
15537  extern suspend_state_t mem_sleep_default;
15538 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swait.h linux-4.14/include/linux/swait.h
15539 --- linux-4.14.orig/include/linux/swait.h       2017-11-12 19:46:13.000000000 +0100
15540 +++ linux-4.14/include/linux/swait.h    2018-09-05 11:05:07.000000000 +0200
15541 @@ -5,6 +5,7 @@
15542  #include <linux/list.h>
15543  #include <linux/stddef.h>
15544  #include <linux/spinlock.h>
15545 +#include <linux/wait.h>
15546  #include <asm/current.h>
15547  
15548  /*
15549 @@ -147,6 +148,7 @@
15550  extern void swake_up(struct swait_queue_head *q);
15551  extern void swake_up_all(struct swait_queue_head *q);
15552  extern void swake_up_locked(struct swait_queue_head *q);
15553 +extern void swake_up_all_locked(struct swait_queue_head *q);
15554  
15555  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15556  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15557 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swap.h linux-4.14/include/linux/swap.h
15558 --- linux-4.14.orig/include/linux/swap.h        2017-11-12 19:46:13.000000000 +0100
15559 +++ linux-4.14/include/linux/swap.h     2018-09-05 11:05:07.000000000 +0200
15560 @@ -12,6 +12,7 @@
15561  #include <linux/fs.h>
15562  #include <linux/atomic.h>
15563  #include <linux/page-flags.h>
15564 +#include <linux/locallock.h>
15565  #include <asm/page.h>
15566  
15567  struct notifier_block;
15568 @@ -297,7 +298,8 @@
15569  void *workingset_eviction(struct address_space *mapping, struct page *page);
15570  bool workingset_refault(void *shadow);
15571  void workingset_activation(struct page *page);
15572 -void workingset_update_node(struct radix_tree_node *node, void *private);
15573 +void __workingset_update_node(struct radix_tree_node *node, void *private);
15574 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
15575  
15576  /* linux/mm/page_alloc.c */
15577  extern unsigned long totalram_pages;
15578 @@ -310,6 +312,7 @@
15579  
15580  
15581  /* linux/mm/swap.c */
15582 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15583  extern void lru_cache_add(struct page *);
15584  extern void lru_cache_add_anon(struct page *page);
15585  extern void lru_cache_add_file(struct page *page);
15586 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/swork.h linux-4.14/include/linux/swork.h
15587 --- linux-4.14.orig/include/linux/swork.h       1970-01-01 01:00:00.000000000 +0100
15588 +++ linux-4.14/include/linux/swork.h    2018-09-05 11:05:07.000000000 +0200
15589 @@ -0,0 +1,24 @@
15590 +#ifndef _LINUX_SWORK_H
15591 +#define _LINUX_SWORK_H
15592 +
15593 +#include <linux/list.h>
15594 +
15595 +struct swork_event {
15596 +       struct list_head item;
15597 +       unsigned long flags;
15598 +       void (*func)(struct swork_event *);
15599 +};
15600 +
15601 +static inline void INIT_SWORK(struct swork_event *event,
15602 +                             void (*func)(struct swork_event *))
15603 +{
15604 +       event->flags = 0;
15605 +       event->func = func;
15606 +}
15607 +
15608 +bool swork_queue(struct swork_event *sev);
15609 +
15610 +int swork_get(void);
15611 +void swork_put(void);
15612 +
15613 +#endif /* _LINUX_SWORK_H */
15614 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/thread_info.h linux-4.14/include/linux/thread_info.h
15615 --- linux-4.14.orig/include/linux/thread_info.h 2018-09-05 11:03:22.000000000 +0200
15616 +++ linux-4.14/include/linux/thread_info.h      2018-09-05 11:05:07.000000000 +0200
15617 @@ -86,7 +86,17 @@
15618  #define test_thread_flag(flag) \
15619         test_ti_thread_flag(current_thread_info(), flag)
15620  
15621 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15622 +#ifdef CONFIG_PREEMPT_LAZY
15623 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
15624 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
15625 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15626 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
15627 +
15628 +#else
15629 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
15630 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15631 +#define tif_need_resched_lazy()        0
15632 +#endif
15633  
15634  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
15635  static inline int arch_within_stack_frames(const void * const stack,
15636 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/timer.h linux-4.14/include/linux/timer.h
15637 --- linux-4.14.orig/include/linux/timer.h       2018-09-05 11:03:22.000000000 +0200
15638 +++ linux-4.14/include/linux/timer.h    2018-09-05 11:05:07.000000000 +0200
15639 @@ -213,7 +213,7 @@
15640  
15641  extern int try_to_del_timer_sync(struct timer_list *timer);
15642  
15643 -#ifdef CONFIG_SMP
15644 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15645    extern int del_timer_sync(struct timer_list *timer);
15646  #else
15647  # define del_timer_sync(t)             del_timer(t)
15648 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/trace_events.h linux-4.14/include/linux/trace_events.h
15649 --- linux-4.14.orig/include/linux/trace_events.h        2017-11-12 19:46:13.000000000 +0100
15650 +++ linux-4.14/include/linux/trace_events.h     2018-09-05 11:05:07.000000000 +0200
15651 @@ -62,6 +62,9 @@
15652         unsigned char           flags;
15653         unsigned char           preempt_count;
15654         int                     pid;
15655 +       unsigned short          migrate_disable;
15656 +       unsigned short          padding;
15657 +       unsigned char           preempt_lazy_count;
15658  };
15659  
15660  #define TRACE_EVENT_TYPE_MAX                                           \
15661 @@ -402,11 +405,13 @@
15662  
15663  extern int filter_match_preds(struct event_filter *filter, void *rec);
15664  
15665 -extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
15666 -                                                  void *rec);
15667 -extern void event_triggers_post_call(struct trace_event_file *file,
15668 -                                    enum event_trigger_type tt,
15669 -                                    void *rec);
15670 +extern enum event_trigger_type
15671 +event_triggers_call(struct trace_event_file *file, void *rec,
15672 +                   struct ring_buffer_event *event);
15673 +extern void
15674 +event_triggers_post_call(struct trace_event_file *file,
15675 +                        enum event_trigger_type tt,
15676 +                        void *rec, struct ring_buffer_event *event);
15677  
15678  bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
15679  
15680 @@ -426,7 +431,7 @@
15681  
15682         if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
15683                 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
15684 -                       event_triggers_call(file, NULL);
15685 +                       event_triggers_call(file, NULL, NULL);
15686                 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
15687                         return true;
15688                 if (eflags & EVENT_FILE_FL_PID_FILTER)
15689 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/uaccess.h linux-4.14/include/linux/uaccess.h
15690 --- linux-4.14.orig/include/linux/uaccess.h     2017-11-12 19:46:13.000000000 +0100
15691 +++ linux-4.14/include/linux/uaccess.h  2018-09-05 11:05:07.000000000 +0200
15692 @@ -185,6 +185,7 @@
15693   */
15694  static inline void pagefault_disable(void)
15695  {
15696 +       migrate_disable();
15697         pagefault_disabled_inc();
15698         /*
15699          * make sure to have issued the store before a pagefault
15700 @@ -201,6 +202,7 @@
15701          */
15702         barrier();
15703         pagefault_disabled_dec();
15704 +       migrate_enable();
15705  }
15706  
15707  /*
15708 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/vmstat.h linux-4.14/include/linux/vmstat.h
15709 --- linux-4.14.orig/include/linux/vmstat.h      2017-11-12 19:46:13.000000000 +0100
15710 +++ linux-4.14/include/linux/vmstat.h   2018-09-05 11:05:07.000000000 +0200
15711 @@ -33,7 +33,9 @@
15712   */
15713  static inline void __count_vm_event(enum vm_event_item item)
15714  {
15715 +       preempt_disable_rt();
15716         raw_cpu_inc(vm_event_states.event[item]);
15717 +       preempt_enable_rt();
15718  }
15719  
15720  static inline void count_vm_event(enum vm_event_item item)
15721 @@ -43,7 +45,9 @@
15722  
15723  static inline void __count_vm_events(enum vm_event_item item, long delta)
15724  {
15725 +       preempt_disable_rt();
15726         raw_cpu_add(vm_event_states.event[item], delta);
15727 +       preempt_enable_rt();
15728  }
15729  
15730  static inline void count_vm_events(enum vm_event_item item, long delta)
15731 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/linux/wait.h linux-4.14/include/linux/wait.h
15732 --- linux-4.14.orig/include/linux/wait.h        2017-11-12 19:46:13.000000000 +0100
15733 +++ linux-4.14/include/linux/wait.h     2018-09-05 11:05:07.000000000 +0200
15734 @@ -10,6 +10,7 @@
15735  
15736  #include <asm/current.h>
15737  #include <uapi/linux/wait.h>
15738 +#include <linux/atomic.h>
15739  
15740  typedef struct wait_queue_entry wait_queue_entry_t;
15741  
15742 @@ -486,8 +487,8 @@
15743         int __ret = 0;                                                          \
15744         struct hrtimer_sleeper __t;                                             \
15745                                                                                 \
15746 -       hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);   \
15747 -       hrtimer_init_sleeper(&__t, current);                                    \
15748 +       hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL,  \
15749 +                                     current);                                 \
15750         if ((timeout) != KTIME_MAX)                                             \
15751                 hrtimer_start_range_ns(&__t.timer, timeout,                     \
15752                                        current->timer_slack_ns,                 \
15753 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/gen_stats.h linux-4.14/include/net/gen_stats.h
15754 --- linux-4.14.orig/include/net/gen_stats.h     2017-11-12 19:46:13.000000000 +0100
15755 +++ linux-4.14/include/net/gen_stats.h  2018-09-05 11:05:07.000000000 +0200
15756 @@ -6,6 +6,7 @@
15757  #include <linux/socket.h>
15758  #include <linux/rtnetlink.h>
15759  #include <linux/pkt_sched.h>
15760 +#include <net/net_seq_lock.h>
15761  
15762  struct gnet_stats_basic_cpu {
15763         struct gnet_stats_basic_packed bstats;
15764 @@ -36,11 +37,11 @@
15765                                  spinlock_t *lock, struct gnet_dump *d,
15766                                  int padattr);
15767  
15768 -int gnet_stats_copy_basic(const seqcount_t *running,
15769 +int gnet_stats_copy_basic(net_seqlock_t *running,
15770                           struct gnet_dump *d,
15771                           struct gnet_stats_basic_cpu __percpu *cpu,
15772                           struct gnet_stats_basic_packed *b);
15773 -void __gnet_stats_copy_basic(const seqcount_t *running,
15774 +void __gnet_stats_copy_basic(net_seqlock_t *running,
15775                              struct gnet_stats_basic_packed *bstats,
15776                              struct gnet_stats_basic_cpu __percpu *cpu,
15777                              struct gnet_stats_basic_packed *b);
15778 @@ -57,13 +58,13 @@
15779                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15780                       struct net_rate_estimator __rcu **rate_est,
15781                       spinlock_t *stats_lock,
15782 -                     seqcount_t *running, struct nlattr *opt);
15783 +                     net_seqlock_t *running, struct nlattr *opt);
15784  void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
15785  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
15786                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
15787                           struct net_rate_estimator __rcu **ptr,
15788                           spinlock_t *stats_lock,
15789 -                         seqcount_t *running, struct nlattr *opt);
15790 +                         net_seqlock_t *running, struct nlattr *opt);
15791  bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
15792  bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
15793                         struct gnet_stats_rate_est64 *sample);
15794 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/neighbour.h linux-4.14/include/net/neighbour.h
15795 --- linux-4.14.orig/include/net/neighbour.h     2017-11-12 19:46:13.000000000 +0100
15796 +++ linux-4.14/include/net/neighbour.h  2018-09-05 11:05:07.000000000 +0200
15797 @@ -450,7 +450,7 @@
15798  }
15799  #endif
15800  
15801 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15802 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15803  {
15804         unsigned int seq;
15805         unsigned int hh_len;
15806 @@ -474,7 +474,7 @@
15807  
15808  static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
15809  {
15810 -       const struct hh_cache *hh = &n->hh;
15811 +       struct hh_cache *hh = &n->hh;
15812  
15813         if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
15814                 return neigh_hh_output(hh, skb);
15815 @@ -515,7 +515,7 @@
15816  
15817  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
15818  
15819 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15820 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15821                                      const struct net_device *dev)
15822  {
15823         unsigned int seq;
15824 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/net_seq_lock.h linux-4.14/include/net/net_seq_lock.h
15825 --- linux-4.14.orig/include/net/net_seq_lock.h  1970-01-01 01:00:00.000000000 +0100
15826 +++ linux-4.14/include/net/net_seq_lock.h       2018-09-05 11:05:07.000000000 +0200
15827 @@ -0,0 +1,15 @@
15828 +#ifndef __NET_NET_SEQ_LOCK_H__
15829 +#define __NET_NET_SEQ_LOCK_H__
15830 +
15831 +#ifdef CONFIG_PREEMPT_RT_BASE
15832 +# define net_seqlock_t                 seqlock_t
15833 +# define net_seq_begin(__r)            read_seqbegin(__r)
15834 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
15835 +
15836 +#else
15837 +# define net_seqlock_t                 seqcount_t
15838 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
15839 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
15840 +#endif
15841 +
15842 +#endif
15843 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/sch_generic.h linux-4.14/include/net/sch_generic.h
15844 --- linux-4.14.orig/include/net/sch_generic.h   2018-09-05 11:03:22.000000000 +0200
15845 +++ linux-4.14/include/net/sch_generic.h        2018-09-05 11:05:07.000000000 +0200
15846 @@ -10,6 +10,7 @@
15847  #include <linux/percpu.h>
15848  #include <linux/dynamic_queue_limits.h>
15849  #include <linux/list.h>
15850 +#include <net/net_seq_lock.h>
15851  #include <linux/refcount.h>
15852  #include <linux/workqueue.h>
15853  #include <net/gen_stats.h>
15854 @@ -90,7 +91,7 @@
15855         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
15856         struct qdisc_skb_head   q;
15857         struct gnet_stats_basic_packed bstats;
15858 -       seqcount_t              running;
15859 +       net_seqlock_t           running;
15860         struct gnet_stats_queue qstats;
15861         unsigned long           state;
15862         struct Qdisc            *next_sched;
15863 @@ -109,13 +110,22 @@
15864         refcount_inc(&qdisc->refcnt);
15865  }
15866  
15867 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
15868 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
15869  {
15870 +#ifdef CONFIG_PREEMPT_RT_BASE
15871 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
15872 +#else
15873         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
15874 +#endif
15875  }
15876  
15877  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
15878  {
15879 +#ifdef CONFIG_PREEMPT_RT_BASE
15880 +       if (try_write_seqlock(&qdisc->running))
15881 +               return true;
15882 +       return false;
15883 +#else
15884         if (qdisc_is_running(qdisc))
15885                 return false;
15886         /* Variant of write_seqcount_begin() telling lockdep a trylock
15887 @@ -124,11 +134,16 @@
15888         raw_write_seqcount_begin(&qdisc->running);
15889         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
15890         return true;
15891 +#endif
15892  }
15893  
15894  static inline void qdisc_run_end(struct Qdisc *qdisc)
15895  {
15896 +#ifdef CONFIG_PREEMPT_RT_BASE
15897 +       write_sequnlock(&qdisc->running);
15898 +#else
15899         write_seqcount_end(&qdisc->running);
15900 +#endif
15901  }
15902  
15903  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
15904 @@ -337,7 +352,7 @@
15905         return qdisc_lock(root);
15906  }
15907  
15908 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15909 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
15910  {
15911         struct Qdisc *root = qdisc_root_sleeping(qdisc);
15912  
15913 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/net/xfrm.h linux-4.14/include/net/xfrm.h
15914 --- linux-4.14.orig/include/net/xfrm.h  2018-09-05 11:03:22.000000000 +0200
15915 +++ linux-4.14/include/net/xfrm.h       2018-09-05 11:05:07.000000000 +0200
15916 @@ -217,7 +217,7 @@
15917         struct xfrm_stats       stats;
15918  
15919         struct xfrm_lifetime_cur curlft;
15920 -       struct tasklet_hrtimer  mtimer;
15921 +       struct hrtimer          mtimer;
15922  
15923         struct xfrm_state_offload xso;
15924  
15925 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/include/trace/events/timer.h linux-4.14/include/trace/events/timer.h
15926 --- linux-4.14.orig/include/trace/events/timer.h        2018-09-05 11:03:22.000000000 +0200
15927 +++ linux-4.14/include/trace/events/timer.h     2018-09-05 11:05:07.000000000 +0200
15928 @@ -148,7 +148,11 @@
15929                 { HRTIMER_MODE_ABS,             "ABS"           },      \
15930                 { HRTIMER_MODE_REL,             "REL"           },      \
15931                 { HRTIMER_MODE_ABS_PINNED,      "ABS|PINNED"    },      \
15932 -               { HRTIMER_MODE_REL_PINNED,      "REL|PINNED"    })
15933 +               { HRTIMER_MODE_REL_PINNED,      "REL|PINNED"    },      \
15934 +               { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"      },      \
15935 +               { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"      },      \
15936 +               { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" },    \
15937 +               { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
15938  
15939  /**
15940   * hrtimer_init - called when the hrtimer is initialized
15941 @@ -186,15 +190,16 @@
15942   */
15943  TRACE_EVENT(hrtimer_start,
15944  
15945 -       TP_PROTO(struct hrtimer *hrtimer),
15946 +       TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
15947  
15948 -       TP_ARGS(hrtimer),
15949 +       TP_ARGS(hrtimer, mode),
15950  
15951         TP_STRUCT__entry(
15952                 __field( void *,        hrtimer         )
15953                 __field( void *,        function        )
15954                 __field( s64,           expires         )
15955                 __field( s64,           softexpires     )
15956 +               __field( enum hrtimer_mode,     mode    )
15957         ),
15958  
15959         TP_fast_assign(
15960 @@ -202,12 +207,14 @@
15961                 __entry->function       = hrtimer->function;
15962                 __entry->expires        = hrtimer_get_expires(hrtimer);
15963                 __entry->softexpires    = hrtimer_get_softexpires(hrtimer);
15964 +               __entry->mode           = mode;
15965         ),
15966  
15967 -       TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
15968 -                 __entry->hrtimer, __entry->function,
15969 +       TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
15970 +                 "mode=%s", __entry->hrtimer, __entry->function,
15971                   (unsigned long long) __entry->expires,
15972 -                 (unsigned long long) __entry->softexpires)
15973 +                 (unsigned long long) __entry->softexpires,
15974 +                 decode_hrtimer_mode(__entry->mode))
15975  );
15976  
15977  /**
15978 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Kconfig linux-4.14/init/Kconfig
15979 --- linux-4.14.orig/init/Kconfig        2018-09-05 11:03:22.000000000 +0200
15980 +++ linux-4.14/init/Kconfig     2018-09-05 11:05:07.000000000 +0200
15981 @@ -744,6 +744,7 @@
15982  config RT_GROUP_SCHED
15983         bool "Group scheduling for SCHED_RR/FIFO"
15984         depends on CGROUP_SCHED
15985 +       depends on !PREEMPT_RT_FULL
15986         default n
15987         help
15988           This feature lets you explicitly allocate real CPU bandwidth
15989 @@ -1533,6 +1534,7 @@
15990  
15991  config SLAB
15992         bool "SLAB"
15993 +       depends on !PREEMPT_RT_FULL
15994         select HAVE_HARDENED_USERCOPY_ALLOCATOR
15995         help
15996           The regular slab allocator that is established and known to work
15997 @@ -1553,6 +1555,7 @@
15998  config SLOB
15999         depends on EXPERT
16000         bool "SLOB (Simple Allocator)"
16001 +       depends on !PREEMPT_RT_FULL
16002         help
16003            SLOB replaces the stock allocator with a drastically simpler
16004            allocator. SLOB is generally more space efficient but
16005 @@ -1594,7 +1597,7 @@
16006  
16007  config SLUB_CPU_PARTIAL
16008         default y
16009 -       depends on SLUB && SMP
16010 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16011         bool "SLUB per cpu partial cache"
16012         help
16013           Per cpu partial caches accellerate objects allocation and freeing
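Taken together, the init/Kconfig hunks above make RT_GROUP_SCHED, SLAB, SLOB and SLUB_CPU_PARTIAL unavailable once PREEMPT_RT_FULL is selected, leaving SLUB without per-CPU partial caches as the only allocator configuration. A sketch of the corresponding .config fragment for an RT build (illustrative, not a complete configuration):

	CONFIG_PREEMPT_RT_FULL=y
	CONFIG_SLUB=y
	# CONFIG_SLAB is not set
	# CONFIG_SLOB is not set
	# CONFIG_SLUB_CPU_PARTIAL is not set
	# CONFIG_RT_GROUP_SCHED is not set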
16014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/main.c linux-4.14/init/main.c
16015 --- linux-4.14.orig/init/main.c 2018-09-05 11:03:22.000000000 +0200
16016 +++ linux-4.14/init/main.c      2018-09-05 11:05:07.000000000 +0200
16017 @@ -543,6 +543,7 @@
16018         setup_command_line(command_line);
16019         setup_nr_cpu_ids();
16020         setup_per_cpu_areas();
16021 +       softirq_early_init();
16022         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16023         boot_cpu_hotplug_init();
16024  
16025 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/init/Makefile linux-4.14/init/Makefile
16026 --- linux-4.14.orig/init/Makefile       2017-11-12 19:46:13.000000000 +0100
16027 +++ linux-4.14/init/Makefile    2018-09-05 11:05:07.000000000 +0200
16028 @@ -36,4 +36,4 @@
16029  include/generated/compile.h: FORCE
16030         @$($(quiet)chk_compile.h)
16031         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16032 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16033 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16034 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cgroup.c linux-4.14/kernel/cgroup/cgroup.c
16035 --- linux-4.14.orig/kernel/cgroup/cgroup.c      2018-09-05 11:03:22.000000000 +0200
16036 +++ linux-4.14/kernel/cgroup/cgroup.c   2018-09-05 11:05:07.000000000 +0200
16037 @@ -4508,10 +4508,10 @@
16038         queue_work(cgroup_destroy_wq, &css->destroy_work);
16039  }
16040  
16041 -static void css_release_work_fn(struct work_struct *work)
16042 +static void css_release_work_fn(struct swork_event *sev)
16043  {
16044         struct cgroup_subsys_state *css =
16045 -               container_of(work, struct cgroup_subsys_state, destroy_work);
16046 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
16047         struct cgroup_subsys *ss = css->ss;
16048         struct cgroup *cgrp = css->cgroup;
16049  
16050 @@ -4562,8 +4562,8 @@
16051         struct cgroup_subsys_state *css =
16052                 container_of(ref, struct cgroup_subsys_state, refcnt);
16053  
16054 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
16055 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
16056 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16057 +       swork_queue(&css->destroy_swork);
16058  }
16059  
16060  static void init_and_link_css(struct cgroup_subsys_state *css,
16061 @@ -5269,6 +5269,7 @@
16062          */
16063         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16064         BUG_ON(!cgroup_destroy_wq);
16065 +       BUG_ON(swork_get());
16066         return 0;
16067  }
16068  core_initcall(cgroup_wq_init);
16069 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cgroup/cpuset.c linux-4.14/kernel/cgroup/cpuset.c
16070 --- linux-4.14.orig/kernel/cgroup/cpuset.c      2017-11-12 19:46:13.000000000 +0100
16071 +++ linux-4.14/kernel/cgroup/cpuset.c   2018-09-05 11:05:07.000000000 +0200
16072 @@ -288,7 +288,7 @@
16073   */
16074  
16075  static DEFINE_MUTEX(cpuset_mutex);
16076 -static DEFINE_SPINLOCK(callback_lock);
16077 +static DEFINE_RAW_SPINLOCK(callback_lock);
16078  
16079  static struct workqueue_struct *cpuset_migrate_mm_wq;
16080  
16081 @@ -926,9 +926,9 @@
16082                         continue;
16083                 rcu_read_unlock();
16084  
16085 -               spin_lock_irq(&callback_lock);
16086 +               raw_spin_lock_irq(&callback_lock);
16087                 cpumask_copy(cp->effective_cpus, new_cpus);
16088 -               spin_unlock_irq(&callback_lock);
16089 +               raw_spin_unlock_irq(&callback_lock);
16090  
16091                 WARN_ON(!is_in_v2_mode() &&
16092                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
16093 @@ -993,9 +993,9 @@
16094         if (retval < 0)
16095                 return retval;
16096  
16097 -       spin_lock_irq(&callback_lock);
16098 +       raw_spin_lock_irq(&callback_lock);
16099         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16100 -       spin_unlock_irq(&callback_lock);
16101 +       raw_spin_unlock_irq(&callback_lock);
16102  
16103         /* use trialcs->cpus_allowed as a temp variable */
16104         update_cpumasks_hier(cs, trialcs->cpus_allowed);
16105 @@ -1179,9 +1179,9 @@
16106                         continue;
16107                 rcu_read_unlock();
16108  
16109 -               spin_lock_irq(&callback_lock);
16110 +               raw_spin_lock_irq(&callback_lock);
16111                 cp->effective_mems = *new_mems;
16112 -               spin_unlock_irq(&callback_lock);
16113 +               raw_spin_unlock_irq(&callback_lock);
16114  
16115                 WARN_ON(!is_in_v2_mode() &&
16116                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
16117 @@ -1249,9 +1249,9 @@
16118         if (retval < 0)
16119                 goto done;
16120  
16121 -       spin_lock_irq(&callback_lock);
16122 +       raw_spin_lock_irq(&callback_lock);
16123         cs->mems_allowed = trialcs->mems_allowed;
16124 -       spin_unlock_irq(&callback_lock);
16125 +       raw_spin_unlock_irq(&callback_lock);
16126  
16127         /* use trialcs->mems_allowed as a temp variable */
16128         update_nodemasks_hier(cs, &trialcs->mems_allowed);
16129 @@ -1342,9 +1342,9 @@
16130         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16131                         || (is_spread_page(cs) != is_spread_page(trialcs)));
16132  
16133 -       spin_lock_irq(&callback_lock);
16134 +       raw_spin_lock_irq(&callback_lock);
16135         cs->flags = trialcs->flags;
16136 -       spin_unlock_irq(&callback_lock);
16137 +       raw_spin_unlock_irq(&callback_lock);
16138  
16139         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16140                 rebuild_sched_domains_locked();
16141 @@ -1759,7 +1759,7 @@
16142         cpuset_filetype_t type = seq_cft(sf)->private;
16143         int ret = 0;
16144  
16145 -       spin_lock_irq(&callback_lock);
16146 +       raw_spin_lock_irq(&callback_lock);
16147  
16148         switch (type) {
16149         case FILE_CPULIST:
16150 @@ -1778,7 +1778,7 @@
16151                 ret = -EINVAL;
16152         }
16153  
16154 -       spin_unlock_irq(&callback_lock);
16155 +       raw_spin_unlock_irq(&callback_lock);
16156         return ret;
16157  }
16158  
16159 @@ -1993,12 +1993,12 @@
16160  
16161         cpuset_inc();
16162  
16163 -       spin_lock_irq(&callback_lock);
16164 +       raw_spin_lock_irq(&callback_lock);
16165         if (is_in_v2_mode()) {
16166                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16167                 cs->effective_mems = parent->effective_mems;
16168         }
16169 -       spin_unlock_irq(&callback_lock);
16170 +       raw_spin_unlock_irq(&callback_lock);
16171  
16172         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16173                 goto out_unlock;
16174 @@ -2025,12 +2025,12 @@
16175         }
16176         rcu_read_unlock();
16177  
16178 -       spin_lock_irq(&callback_lock);
16179 +       raw_spin_lock_irq(&callback_lock);
16180         cs->mems_allowed = parent->mems_allowed;
16181         cs->effective_mems = parent->mems_allowed;
16182         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16183         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16184 -       spin_unlock_irq(&callback_lock);
16185 +       raw_spin_unlock_irq(&callback_lock);
16186  out_unlock:
16187         mutex_unlock(&cpuset_mutex);
16188         return 0;
16189 @@ -2069,7 +2069,7 @@
16190  static void cpuset_bind(struct cgroup_subsys_state *root_css)
16191  {
16192         mutex_lock(&cpuset_mutex);
16193 -       spin_lock_irq(&callback_lock);
16194 +       raw_spin_lock_irq(&callback_lock);
16195  
16196         if (is_in_v2_mode()) {
16197                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
16198 @@ -2080,7 +2080,7 @@
16199                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16200         }
16201  
16202 -       spin_unlock_irq(&callback_lock);
16203 +       raw_spin_unlock_irq(&callback_lock);
16204         mutex_unlock(&cpuset_mutex);
16205  }
16206  
16207 @@ -2094,7 +2094,7 @@
16208         if (task_css_is_root(task, cpuset_cgrp_id))
16209                 return;
16210  
16211 -       set_cpus_allowed_ptr(task, &current->cpus_allowed);
16212 +       set_cpus_allowed_ptr(task, current->cpus_ptr);
16213         task->mems_allowed = current->mems_allowed;
16214  }
16215  
16216 @@ -2178,12 +2178,12 @@
16217  {
16218         bool is_empty;
16219  
16220 -       spin_lock_irq(&callback_lock);
16221 +       raw_spin_lock_irq(&callback_lock);
16222         cpumask_copy(cs->cpus_allowed, new_cpus);
16223         cpumask_copy(cs->effective_cpus, new_cpus);
16224         cs->mems_allowed = *new_mems;
16225         cs->effective_mems = *new_mems;
16226 -       spin_unlock_irq(&callback_lock);
16227 +       raw_spin_unlock_irq(&callback_lock);
16228  
16229         /*
16230          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
16231 @@ -2220,10 +2220,10 @@
16232         if (nodes_empty(*new_mems))
16233                 *new_mems = parent_cs(cs)->effective_mems;
16234  
16235 -       spin_lock_irq(&callback_lock);
16236 +       raw_spin_lock_irq(&callback_lock);
16237         cpumask_copy(cs->effective_cpus, new_cpus);
16238         cs->effective_mems = *new_mems;
16239 -       spin_unlock_irq(&callback_lock);
16240 +       raw_spin_unlock_irq(&callback_lock);
16241  
16242         if (cpus_updated)
16243                 update_tasks_cpumask(cs);
16244 @@ -2316,21 +2316,21 @@
16245  
16246         /* synchronize cpus_allowed to cpu_active_mask */
16247         if (cpus_updated) {
16248 -               spin_lock_irq(&callback_lock);
16249 +               raw_spin_lock_irq(&callback_lock);
16250                 if (!on_dfl)
16251                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16252                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16253 -               spin_unlock_irq(&callback_lock);
16254 +               raw_spin_unlock_irq(&callback_lock);
16255                 /* we don't mess with cpumasks of tasks in top_cpuset */
16256         }
16257  
16258         /* synchronize mems_allowed to N_MEMORY */
16259         if (mems_updated) {
16260 -               spin_lock_irq(&callback_lock);
16261 +               raw_spin_lock_irq(&callback_lock);
16262                 if (!on_dfl)
16263                         top_cpuset.mems_allowed = new_mems;
16264                 top_cpuset.effective_mems = new_mems;
16265 -               spin_unlock_irq(&callback_lock);
16266 +               raw_spin_unlock_irq(&callback_lock);
16267                 update_tasks_nodemask(&top_cpuset);
16268         }
16269  
16270 @@ -2429,11 +2429,11 @@
16271  {
16272         unsigned long flags;
16273  
16274 -       spin_lock_irqsave(&callback_lock, flags);
16275 +       raw_spin_lock_irqsave(&callback_lock, flags);
16276         rcu_read_lock();
16277         guarantee_online_cpus(task_cs(tsk), pmask);
16278         rcu_read_unlock();
16279 -       spin_unlock_irqrestore(&callback_lock, flags);
16280 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
16281  }
16282  
16283  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
16284 @@ -2481,11 +2481,11 @@
16285         nodemask_t mask;
16286         unsigned long flags;
16287  
16288 -       spin_lock_irqsave(&callback_lock, flags);
16289 +       raw_spin_lock_irqsave(&callback_lock, flags);
16290         rcu_read_lock();
16291         guarantee_online_mems(task_cs(tsk), &mask);
16292         rcu_read_unlock();
16293 -       spin_unlock_irqrestore(&callback_lock, flags);
16294 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
16295  
16296         return mask;
16297  }
16298 @@ -2577,14 +2577,14 @@
16299                 return true;
16300  
16301         /* Not hardwall and node outside mems_allowed: scan up cpusets */
16302 -       spin_lock_irqsave(&callback_lock, flags);
16303 +       raw_spin_lock_irqsave(&callback_lock, flags);
16304  
16305         rcu_read_lock();
16306         cs = nearest_hardwall_ancestor(task_cs(current));
16307         allowed = node_isset(node, cs->mems_allowed);
16308         rcu_read_unlock();
16309  
16310 -       spin_unlock_irqrestore(&callback_lock, flags);
16311 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
16312         return allowed;
16313  }
16314  
16315 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/cpu.c linux-4.14/kernel/cpu.c
16316 --- linux-4.14.orig/kernel/cpu.c        2018-09-05 11:03:22.000000000 +0200
16317 +++ linux-4.14/kernel/cpu.c     2018-09-05 11:05:07.000000000 +0200
16318 @@ -74,6 +74,11 @@
16319         .fail = CPUHP_INVALID,
16320  };
16321  
16322 +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16323 +static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16324 +       __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
16325 +#endif
16326 +
16327  #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16328  static struct lockdep_map cpuhp_state_up_map =
16329         STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
16330 @@ -287,6 +292,55 @@
16331  
16332  #ifdef CONFIG_HOTPLUG_CPU
16333  
16334 +/**
16335 + * pin_current_cpu - Prevent the current cpu from being unplugged
16336 + */
16337 +void pin_current_cpu(void)
16338 +{
16339 +#ifdef CONFIG_PREEMPT_RT_FULL
16340 +       struct rt_rw_lock *cpuhp_pin;
16341 +       unsigned int cpu;
16342 +       int ret;
16343 +
16344 +again:
16345 +       cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16346 +       ret = __read_rt_trylock(cpuhp_pin);
16347 +       if (ret) {
16348 +               current->pinned_on_cpu = smp_processor_id();
16349 +               return;
16350 +       }
16351 +       cpu = smp_processor_id();
16352 +       preempt_lazy_enable();
16353 +       preempt_enable();
16354 +
16355 +       __read_rt_lock(cpuhp_pin);
16356 +
16357 +       preempt_disable();
16358 +       preempt_lazy_disable();
16359 +       if (cpu != smp_processor_id()) {
16360 +               __read_rt_unlock(cpuhp_pin);
16361 +               goto again;
16362 +       }
16363 +       current->pinned_on_cpu = cpu;
16364 +#endif
16365 +}
16366 +
16367 +/**
16368 + * unpin_current_cpu - Allow unplug of current cpu
16369 + */
16370 +void unpin_current_cpu(void)
16371 +{
16372 +#ifdef CONFIG_PREEMPT_RT_FULL
16373 +       struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16374 +
16375 +       if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
16376 +               cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
16377 +
16378 +       current->pinned_on_cpu = -1;
16379 +       __read_rt_unlock(cpuhp_pin);
16380 +#endif
16381 +}
16382 +
16383  DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
16384  
16385  void cpus_read_lock(void)
16386 @@ -843,6 +897,9 @@
16387  
16388  static int takedown_cpu(unsigned int cpu)
16389  {
16390 +#ifdef CONFIG_PREEMPT_RT_FULL
16391 +       struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
16392 +#endif
16393         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
16394         int err;
16395  
16396 @@ -855,11 +912,18 @@
16397          */
16398         irq_lock_sparse();
16399  
16400 +#ifdef CONFIG_PREEMPT_RT_FULL
16401 +       __write_rt_lock(cpuhp_pin);
16402 +#endif
16403 +
16404         /*
16405          * So now all preempt/rcu users must observe !cpu_active().
16406          */
16407         err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
16408         if (err) {
16409 +#ifdef CONFIG_PREEMPT_RT_FULL
16410 +               __write_rt_unlock(cpuhp_pin);
16411 +#endif
16412                 /* CPU refused to die */
16413                 irq_unlock_sparse();
16414                 /* Unpark the hotplug thread so we can rollback there */
16415 @@ -878,6 +942,9 @@
16416         wait_for_ap_thread(st, false);
16417         BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
16418  
16419 +#ifdef CONFIG_PREEMPT_RT_FULL
16420 +       __write_rt_unlock(cpuhp_pin);
16421 +#endif
16422         /* Interrupts are moved away from the dying cpu, reenable alloc/free */
16423         irq_unlock_sparse();
16424  
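The kernel/cpu.c changes above pair a per-CPU rt_rw_lock with CPU unplug: pin_current_cpu() takes the read side (which may sleep on RT) and takedown_cpu() takes the write side around stop_machine, so pinned tasks block the unplug instead of being migrated mid-critical-section. A rough sketch of the intended pairing; in this tree the real caller is the scheduler's migrate_disable()/migrate_enable() path, and the exact preempt/lazy ordering shown here is an assumption inferred from the function bodies above:

	preempt_disable();
	preempt_lazy_disable();
	pin_current_cpu();		/* read-lock this CPU's cpuhp_pin_lock */

	/* ... per-CPU work that must not race with takedown_cpu() ... */

	unpin_current_cpu();		/* drop the read side, CPU unplug may proceed */
	preempt_lazy_enable();
	preempt_enable();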
16425 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/debug/kdb/kdb_io.c linux-4.14/kernel/debug/kdb/kdb_io.c
16426 --- linux-4.14.orig/kernel/debug/kdb/kdb_io.c   2018-09-05 11:03:22.000000000 +0200
16427 +++ linux-4.14/kernel/debug/kdb/kdb_io.c        2018-09-05 11:05:07.000000000 +0200
16428 @@ -854,9 +854,11 @@
16429         va_list ap;
16430         int r;
16431  
16432 +       kdb_trap_printk++;
16433         va_start(ap, fmt);
16434         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
16435         va_end(ap);
16436 +       kdb_trap_printk--;
16437  
16438         return r;
16439  }
16440 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/events/core.c linux-4.14/kernel/events/core.c
16441 --- linux-4.14.orig/kernel/events/core.c        2018-09-05 11:03:22.000000000 +0200
16442 +++ linux-4.14/kernel/events/core.c     2018-09-05 11:05:07.000000000 +0200
16443 @@ -1065,7 +1065,7 @@
16444         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
16445  
16446         raw_spin_lock_init(&cpuctx->hrtimer_lock);
16447 -       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
16448 +       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
16449         timer->function = perf_mux_hrtimer_handler;
16450  }
16451  
16452 @@ -8750,7 +8750,7 @@
16453         if (!is_sampling_event(event))
16454                 return;
16455  
16456 -       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16457 +       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
16458         hwc->hrtimer.function = perf_swevent_hrtimer;
16459  
16460         /*
16461 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/exit.c linux-4.14/kernel/exit.c
16462 --- linux-4.14.orig/kernel/exit.c       2018-09-05 11:03:22.000000000 +0200
16463 +++ linux-4.14/kernel/exit.c    2018-09-05 11:05:07.000000000 +0200
16464 @@ -159,7 +159,7 @@
16465          * Do this under ->siglock, we can race with another thread
16466          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
16467          */
16468 -       flush_sigqueue(&tsk->pending);
16469 +       flush_task_sigqueue(tsk);
16470         tsk->sighand = NULL;
16471         spin_unlock(&sighand->siglock);
16472  
16473 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/fork.c linux-4.14/kernel/fork.c
16474 --- linux-4.14.orig/kernel/fork.c       2018-09-05 11:03:28.000000000 +0200
16475 +++ linux-4.14/kernel/fork.c    2018-09-05 11:05:07.000000000 +0200
16476 @@ -40,6 +40,7 @@
16477  #include <linux/hmm.h>
16478  #include <linux/fs.h>
16479  #include <linux/mm.h>
16480 +#include <linux/kprobes.h>
16481  #include <linux/vmacache.h>
16482  #include <linux/nsproxy.h>
16483  #include <linux/capability.h>
16484 @@ -407,13 +408,24 @@
16485         if (atomic_dec_and_test(&sig->sigcnt))
16486                 free_signal_struct(sig);
16487  }
16488 -
16489 +#ifdef CONFIG_PREEMPT_RT_BASE
16490 +static
16491 +#endif
16492  void __put_task_struct(struct task_struct *tsk)
16493  {
16494         WARN_ON(!tsk->exit_state);
16495         WARN_ON(atomic_read(&tsk->usage));
16496         WARN_ON(tsk == current);
16497  
16498 +       /*
16499 +        * Remove function-return probe instances associated with this
16500 +        * task and put them back on the free list.
16501 +        */
16502 +       kprobe_flush_task(tsk);
16503 +
16504 +       /* Task is done with its stack. */
16505 +       put_task_stack(tsk);
16506 +
16507         cgroup_free(tsk);
16508         task_numa_free(tsk);
16509         security_task_free(tsk);
16510 @@ -424,7 +436,18 @@
16511         if (!profile_handoff_task(tsk))
16512                 free_task(tsk);
16513  }
16514 +#ifndef CONFIG_PREEMPT_RT_BASE
16515  EXPORT_SYMBOL_GPL(__put_task_struct);
16516 +#else
16517 +void __put_task_struct_cb(struct rcu_head *rhp)
16518 +{
16519 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
16520 +
16521 +       __put_task_struct(tsk);
16522 +
16523 +}
16524 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
16525 +#endif
16526  
16527  void __init __weak arch_task_cache_init(void) { }
16528  
16529 @@ -563,7 +586,8 @@
16530  #ifdef CONFIG_CC_STACKPROTECTOR
16531         tsk->stack_canary = get_random_canary();
16532  #endif
16533 -
16534 +       if (orig->cpus_ptr == &orig->cpus_mask)
16535 +               tsk->cpus_ptr = &tsk->cpus_mask;
16536         /*
16537          * One for us, one for whoever does the "release_task()" (usually
16538          * parent)
16539 @@ -575,6 +599,7 @@
16540         tsk->splice_pipe = NULL;
16541         tsk->task_frag.page = NULL;
16542         tsk->wake_q.next = NULL;
16543 +       tsk->wake_q_sleeper.next = NULL;
16544  
16545         account_kernel_stack(tsk, 1);
16546  
16547 @@ -915,6 +940,19 @@
16548  }
16549  EXPORT_SYMBOL_GPL(__mmdrop);
16550  
16551 +#ifdef CONFIG_PREEMPT_RT_BASE
16552 +/*
16553 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
16554 + * want another facility to make this work.
16555 + */
16556 +void __mmdrop_delayed(struct rcu_head *rhp)
16557 +{
16558 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
16559 +
16560 +       __mmdrop(mm);
16561 +}
16562 +#endif
16563 +
16564  static inline void __mmput(struct mm_struct *mm)
16565  {
16566         VM_BUG_ON(atomic_read(&mm->mm_users));
16567 @@ -1494,6 +1532,9 @@
16568   */
16569  static void posix_cpu_timers_init(struct task_struct *tsk)
16570  {
16571 +#ifdef CONFIG_PREEMPT_RT_BASE
16572 +       tsk->posix_timer_list = NULL;
16573 +#endif
16574         tsk->cputime_expires.prof_exp = 0;
16575         tsk->cputime_expires.virt_exp = 0;
16576         tsk->cputime_expires.sched_exp = 0;
16577 @@ -1646,6 +1687,7 @@
16578         spin_lock_init(&p->alloc_lock);
16579  
16580         init_sigpending(&p->pending);
16581 +       p->sigqueue_cache = NULL;
16582  
16583         p->utime = p->stime = p->gtime = 0;
16584  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
16585 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/futex.c linux-4.14/kernel/futex.c
16586 --- linux-4.14.orig/kernel/futex.c      2018-09-05 11:03:22.000000000 +0200
16587 +++ linux-4.14/kernel/futex.c   2018-09-05 11:05:07.000000000 +0200
16588 @@ -936,7 +936,9 @@
16589                 if (head->next != next) {
16590                         /* retain curr->pi_lock for the loop invariant */
16591                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
16592 +                       raw_spin_unlock_irq(&curr->pi_lock);
16593                         spin_unlock(&hb->lock);
16594 +                       raw_spin_lock_irq(&curr->pi_lock);
16595                         put_pi_state(pi_state);
16596                         continue;
16597                 }
16598 @@ -1430,6 +1432,7 @@
16599         struct task_struct *new_owner;
16600         bool postunlock = false;
16601         DEFINE_WAKE_Q(wake_q);
16602 +       DEFINE_WAKE_Q(wake_sleeper_q);
16603         int ret = 0;
16604  
16605         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
16606 @@ -1491,13 +1494,13 @@
16607         pi_state->owner = new_owner;
16608         raw_spin_unlock(&new_owner->pi_lock);
16609  
16610 -       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
16611 -
16612 +       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
16613 +                                            &wake_sleeper_q);
16614  out_unlock:
16615         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
16616  
16617         if (postunlock)
16618 -               rt_mutex_postunlock(&wake_q);
16619 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
16620  
16621         return ret;
16622  }
16623 @@ -2104,6 +2107,16 @@
16624                                 requeue_pi_wake_futex(this, &key2, hb2);
16625                                 drop_count++;
16626                                 continue;
16627 +                       } else if (ret == -EAGAIN) {
16628 +                               /*
16629 +                                * Waiter was woken by timeout or
16630 +                                * signal and has set pi_blocked_on to
16631 +                                * PI_WAKEUP_INPROGRESS before we
16632 +                                * tried to enqueue it on the rtmutex.
16633 +                                */
16634 +                               this->pi_state = NULL;
16635 +                               put_pi_state(pi_state);
16636 +                               continue;
16637                         } else if (ret) {
16638                                 /*
16639                                  * rt_mutex_start_proxy_lock() detected a
16640 @@ -2642,10 +2655,9 @@
16641         if (abs_time) {
16642                 to = &timeout;
16643  
16644 -               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16645 -                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
16646 -                                     HRTIMER_MODE_ABS);
16647 -               hrtimer_init_sleeper(to, current);
16648 +               hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16649 +                                             CLOCK_REALTIME : CLOCK_MONOTONIC,
16650 +                                             HRTIMER_MODE_ABS, current);
16651                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16652                                              current->timer_slack_ns);
16653         }
16654 @@ -2744,9 +2756,8 @@
16655  
16656         if (time) {
16657                 to = &timeout;
16658 -               hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
16659 -                                     HRTIMER_MODE_ABS);
16660 -               hrtimer_init_sleeper(to, current);
16661 +               hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
16662 +                                             HRTIMER_MODE_ABS, current);
16663                 hrtimer_set_expires(&to->timer, *time);
16664         }
16665  
16666 @@ -2801,7 +2812,7 @@
16667                 goto no_block;
16668         }
16669  
16670 -       rt_mutex_init_waiter(&rt_waiter);
16671 +       rt_mutex_init_waiter(&rt_waiter, false);
16672  
16673         /*
16674          * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
16675 @@ -2816,9 +2827,18 @@
16676          * lock handoff sequence.
16677          */
16678         raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
16679 +       /*
16680 +        * the migrate_disable() here disables migration in the in_atomic() fast
16681 +        * path which is enabled again in the following spin_unlock(). We have
16682 +        * one migrate_disable() pending in the slow-path which is reversed
16683 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
16684 +        */
16685 +       migrate_disable();
16686 +
16687         spin_unlock(q.lock_ptr);
16688         ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
16689         raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
16690 +       migrate_enable();
16691  
16692         if (ret) {
16693                 if (ret == 1)
16694 @@ -2965,11 +2985,21 @@
16695                  * observed.
16696                  */
16697                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
16698 +               /*
16699 +                * Magic trickery for now to make the RT migrate disable
16700 +                * logic happy. The following spin_unlock() happens with
16701 +                * interrupts disabled so the internal migrate_enable()
16702 +                * won't undo the migrate_disable() which was issued when
16703 +                * locking hb->lock.
16704 +                */
16705 +               migrate_disable();
16706                 spin_unlock(&hb->lock);
16707  
16708                 /* drops pi_state->pi_mutex.wait_lock */
16709                 ret = wake_futex_pi(uaddr, uval, pi_state);
16710  
16711 +               migrate_enable();
16712 +
16713                 put_pi_state(pi_state);
16714  
16715                 /*
16716 @@ -3127,7 +3157,7 @@
16717         struct hrtimer_sleeper timeout, *to = NULL;
16718         struct futex_pi_state *pi_state = NULL;
16719         struct rt_mutex_waiter rt_waiter;
16720 -       struct futex_hash_bucket *hb;
16721 +       struct futex_hash_bucket *hb, *hb2;
16722         union futex_key key2 = FUTEX_KEY_INIT;
16723         struct futex_q q = futex_q_init;
16724         int res, ret;
16725 @@ -3143,10 +3173,9 @@
16726  
16727         if (abs_time) {
16728                 to = &timeout;
16729 -               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
16730 -                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
16731 -                                     HRTIMER_MODE_ABS);
16732 -               hrtimer_init_sleeper(to, current);
16733 +               hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
16734 +                                             CLOCK_REALTIME : CLOCK_MONOTONIC,
16735 +                                             HRTIMER_MODE_ABS, current);
16736                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
16737                                              current->timer_slack_ns);
16738         }
16739 @@ -3155,7 +3184,7 @@
16740          * The waiter is allocated on our stack, manipulated by the requeue
16741          * code while we sleep on uaddr.
16742          */
16743 -       rt_mutex_init_waiter(&rt_waiter);
16744 +       rt_mutex_init_waiter(&rt_waiter, false);
16745  
16746         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
16747         if (unlikely(ret != 0))
16748 @@ -3186,20 +3215,55 @@
16749         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
16750         futex_wait_queue_me(hb, &q, to);
16751  
16752 -       spin_lock(&hb->lock);
16753 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16754 -       spin_unlock(&hb->lock);
16755 -       if (ret)
16756 -               goto out_put_keys;
16757 +       /*
16758 +        * On RT we must avoid races with requeue and trying to block
16759 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
16760 +        * serializing access to pi_blocked_on with pi_lock.
16761 +        */
16762 +       raw_spin_lock_irq(&current->pi_lock);
16763 +       if (current->pi_blocked_on) {
16764 +               /*
16765 +                * We have been requeued or are in the process of
16766 +                * being requeued.
16767 +                */
16768 +               raw_spin_unlock_irq(&current->pi_lock);
16769 +       } else {
16770 +               /*
16771 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
16772 +                * prevents a concurrent requeue from moving us to the
16773 +                * uaddr2 rtmutex. After that we can safely acquire
16774 +                * (and possibly block on) hb->lock.
16775 +                */
16776 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
16777 +               raw_spin_unlock_irq(&current->pi_lock);
16778 +
16779 +               spin_lock(&hb->lock);
16780 +
16781 +               /*
16782 +                * Clean up pi_blocked_on. We might leak it otherwise
16783 +                * when we succeeded with the hb->lock in the fast
16784 +                * path.
16785 +                */
16786 +               raw_spin_lock_irq(&current->pi_lock);
16787 +               current->pi_blocked_on = NULL;
16788 +               raw_spin_unlock_irq(&current->pi_lock);
16789 +
16790 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
16791 +               spin_unlock(&hb->lock);
16792 +               if (ret)
16793 +                       goto out_put_keys;
16794 +       }
16795  
16796         /*
16797 -        * In order for us to be here, we know our q.key == key2, and since
16798 -        * we took the hb->lock above, we also know that futex_requeue() has
16799 -        * completed and we no longer have to concern ourselves with a wakeup
16800 -        * race with the atomic proxy lock acquisition by the requeue code. The
16801 -        * futex_requeue dropped our key1 reference and incremented our key2
16802 -        * reference count.
16803 +        * In order to be here, we have either been requeued, are in
16804 +        * the process of being requeued, or requeue successfully
16805 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
16806 +        * non-null above, we may be racing with a requeue.  Do not
16807 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
16808 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
16809 +        * reference and incremented our key2 reference count.
16810          */
16811 +       hb2 = hash_futex(&key2);
16812  
16813         /* Check if the requeue code acquired the second futex for us. */
16814         if (!q.rt_waiter) {
16815 @@ -3208,7 +3272,8 @@
16816                  * did a lock-steal - fix up the PI-state in that case.
16817                  */
16818                 if (q.pi_state && (q.pi_state->owner != current)) {
16819 -                       spin_lock(q.lock_ptr);
16820 +                       spin_lock(&hb2->lock);
16821 +                       BUG_ON(&hb2->lock != q.lock_ptr);
16822                         ret = fixup_pi_state_owner(uaddr2, &q, current);
16823                         if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
16824                                 pi_state = q.pi_state;
16825 @@ -3219,7 +3284,7 @@
16826                          * the requeue_pi() code acquired for us.
16827                          */
16828                         put_pi_state(q.pi_state);
16829 -                       spin_unlock(q.lock_ptr);
16830 +                       spin_unlock(&hb2->lock);
16831                 }
16832         } else {
16833                 struct rt_mutex *pi_mutex;
16834 @@ -3233,7 +3298,8 @@
16835                 pi_mutex = &q.pi_state->pi_mutex;
16836                 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
16837  
16838 -               spin_lock(q.lock_ptr);
16839 +               spin_lock(&hb2->lock);
16840 +               BUG_ON(&hb2->lock != q.lock_ptr);
16841                 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
16842                         ret = 0;
16843  
16844 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/handle.c linux-4.14/kernel/irq/handle.c
16845 --- linux-4.14.orig/kernel/irq/handle.c 2017-11-12 19:46:13.000000000 +0100
16846 +++ linux-4.14/kernel/irq/handle.c      2018-09-05 11:05:07.000000000 +0200
16847 @@ -183,10 +183,16 @@
16848  {
16849         irqreturn_t retval;
16850         unsigned int flags = 0;
16851 +       struct pt_regs *regs = get_irq_regs();
16852 +       u64 ip = regs ? instruction_pointer(regs) : 0;
16853  
16854         retval = __handle_irq_event_percpu(desc, &flags);
16855  
16856 -       add_interrupt_randomness(desc->irq_data.irq, flags);
16857 +#ifdef CONFIG_PREEMPT_RT_FULL
16858 +       desc->random_ip = ip;
16859 +#else
16860 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
16861 +#endif
16862  
16863         if (!noirqdebug)
16864                 note_interrupt(desc, retval);
16865 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/manage.c linux-4.14/kernel/irq/manage.c
16866 --- linux-4.14.orig/kernel/irq/manage.c 2018-09-05 11:03:22.000000000 +0200
16867 +++ linux-4.14/kernel/irq/manage.c      2018-09-05 11:05:07.000000000 +0200
16868 @@ -24,6 +24,7 @@
16869  #include "internals.h"
16870  
16871  #ifdef CONFIG_IRQ_FORCED_THREADING
16872 +# ifndef CONFIG_PREEMPT_RT_BASE
16873  __read_mostly bool force_irqthreads;
16874  
16875  static int __init setup_forced_irqthreads(char *arg)
16876 @@ -32,6 +33,7 @@
16877         return 0;
16878  }
16879  early_param("threadirqs", setup_forced_irqthreads);
16880 +# endif
16881  #endif
16882  
16883  static void __synchronize_hardirq(struct irq_desc *desc)
16884 @@ -224,7 +226,12 @@
16885  
16886         if (desc->affinity_notify) {
16887                 kref_get(&desc->affinity_notify->kref);
16888 +
16889 +#ifdef CONFIG_PREEMPT_RT_BASE
16890 +               swork_queue(&desc->affinity_notify->swork);
16891 +#else
16892                 schedule_work(&desc->affinity_notify->work);
16893 +#endif
16894         }
16895         irqd_set(data, IRQD_AFFINITY_SET);
16896  
16897 @@ -262,10 +269,8 @@
16898  }
16899  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
16900  
16901 -static void irq_affinity_notify(struct work_struct *work)
16902 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
16903  {
16904 -       struct irq_affinity_notify *notify =
16905 -               container_of(work, struct irq_affinity_notify, work);
16906         struct irq_desc *desc = irq_to_desc(notify->irq);
16907         cpumask_var_t cpumask;
16908         unsigned long flags;
16909 @@ -287,6 +292,35 @@
16910         kref_put(&notify->kref, notify->release);
16911  }
16912  
16913 +#ifdef CONFIG_PREEMPT_RT_BASE
16914 +static void init_helper_thread(void)
16915 +{
16916 +       static int init_sworker_once;
16917 +
16918 +       if (init_sworker_once)
16919 +               return;
16920 +       if (WARN_ON(swork_get()))
16921 +               return;
16922 +       init_sworker_once = 1;
16923 +}
16924 +
16925 +static void irq_affinity_notify(struct swork_event *swork)
16926 +{
16927 +       struct irq_affinity_notify *notify =
16928 +               container_of(swork, struct irq_affinity_notify, swork);
16929 +       _irq_affinity_notify(notify);
16930 +}
16931 +
16932 +#else
16933 +
16934 +static void irq_affinity_notify(struct work_struct *work)
16935 +{
16936 +       struct irq_affinity_notify *notify =
16937 +               container_of(work, struct irq_affinity_notify, work);
16938 +       _irq_affinity_notify(notify);
16939 +}
16940 +#endif
16941 +
16942  /**
16943   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
16944   *     @irq:           Interrupt for which to enable/disable notification
16945 @@ -315,7 +349,12 @@
16946         if (notify) {
16947                 notify->irq = irq;
16948                 kref_init(&notify->kref);
16949 +#ifdef CONFIG_PREEMPT_RT_BASE
16950 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
16951 +               init_helper_thread();
16952 +#else
16953                 INIT_WORK(&notify->work, irq_affinity_notify);
16954 +#endif
16955         }
16956  
16957         raw_spin_lock_irqsave(&desc->lock, flags);
16958 @@ -883,7 +922,15 @@
16959         local_bh_disable();
16960         ret = action->thread_fn(action->irq, action->dev_id);
16961         irq_finalize_oneshot(desc, action);
16962 -       local_bh_enable();
16963 +       /*
16964 +        * Interrupts which have real time requirements can be set up
16965 +        * to avoid softirq processing in the thread handler. This is
16966 +        * safe as these interrupts do not raise soft interrupts.
16967 +        */
16968 +       if (irq_settings_no_softirq_call(desc))
16969 +               _local_bh_enable();
16970 +       else
16971 +               local_bh_enable();
16972         return ret;
16973  }
16974  
16975 @@ -980,6 +1027,12 @@
16976                 if (action_ret == IRQ_WAKE_THREAD)
16977                         irq_wake_secondary(desc, action);
16978  
16979 +#ifdef CONFIG_PREEMPT_RT_FULL
16980 +               migrate_disable();
16981 +               add_interrupt_randomness(action->irq, 0,
16982 +                                desc->random_ip ^ (unsigned long) action);
16983 +               migrate_enable();
16984 +#endif
16985                 wake_threads_waitq(desc);
16986         }
16987  
16988 @@ -1378,6 +1431,9 @@
16989                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
16990                 }
16991  
16992 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
16993 +                       irq_settings_set_no_softirq_call(desc);
16994 +
16995                 if (irq_settings_can_autoenable(desc)) {
16996                         irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
16997                 } else {
16998 @@ -2159,7 +2215,7 @@
16999   *     This call sets the internal irqchip state of an interrupt,
17000   *     depending on the value of @which.
17001   *
17002 - *     This function should be called with preemption disabled if the
17003 + *     This function should be called with migration disabled if the
17004   *     interrupt controller has per-cpu registers.
17005   */
17006  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
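The manage.c hunks add handling for an IRQF_NO_SOFTIRQ_CALL flag (wired into the descriptor bits in the settings.h hunk below): a threaded handler marked this way gets _local_bh_enable() instead of local_bh_enable(), so returning from the handler never drags softirq processing into an RT-critical interrupt thread. A hedged sketch of a driver opting in; the handler, device name and setup helper are assumptions:

	#include <linux/interrupt.h>

	static irqreturn_t my_rt_thread_fn(int irq, void *dev_id)
	{
		/* contract of IRQF_NO_SOFTIRQ_CALL: this handler raises no softirqs */
		return IRQ_HANDLED;
	}

	static int my_rt_irq_setup(unsigned int irq, void *dev)
	{
		return request_threaded_irq(irq, NULL, my_rt_thread_fn,
					    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
					    "my-rt-dev", dev);
	}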
17007 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/settings.h linux-4.14/kernel/irq/settings.h
17008 --- linux-4.14.orig/kernel/irq/settings.h       2017-11-12 19:46:13.000000000 +0100
17009 +++ linux-4.14/kernel/irq/settings.h    2018-09-05 11:05:07.000000000 +0200
17010 @@ -17,6 +17,7 @@
17011         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
17012         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
17013         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
17014 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
17015         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
17016  };
17017  
17018 @@ -31,6 +32,7 @@
17019  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
17020  #define IRQ_IS_POLLED          GOT_YOU_MORON
17021  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
17022 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
17023  #undef IRQF_MODIFY_MASK
17024  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
17025  
17026 @@ -41,6 +43,16 @@
17027         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17028  }
17029  
17030 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17031 +{
17032 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17033 +}
17034 +
17035 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17036 +{
17037 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17038 +}
17039 +
17040  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17041  {
17042         return desc->status_use_accessors & _IRQ_PER_CPU;
17043 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq/spurious.c linux-4.14/kernel/irq/spurious.c
17044 --- linux-4.14.orig/kernel/irq/spurious.c       2017-11-12 19:46:13.000000000 +0100
17045 +++ linux-4.14/kernel/irq/spurious.c    2018-09-05 11:05:07.000000000 +0200
17046 @@ -445,6 +445,10 @@
17047  
17048  static int __init irqfixup_setup(char *str)
17049  {
17050 +#ifdef CONFIG_PREEMPT_RT_BASE
17051 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17052 +       return 1;
17053 +#endif
17054         irqfixup = 1;
17055         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17056         printk(KERN_WARNING "This may impact system performance.\n");
17057 @@ -457,6 +461,10 @@
17058  
17059  static int __init irqpoll_setup(char *str)
17060  {
17061 +#ifdef CONFIG_PREEMPT_RT_BASE
17062 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17063 +       return 1;
17064 +#endif
17065         irqfixup = 2;
17066         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17067                                 "enabled\n");
17068 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/irq_work.c linux-4.14/kernel/irq_work.c
17069 --- linux-4.14.orig/kernel/irq_work.c   2017-11-12 19:46:13.000000000 +0100
17070 +++ linux-4.14/kernel/irq_work.c        2018-09-05 11:05:07.000000000 +0200
17071 @@ -17,6 +17,7 @@
17072  #include <linux/cpu.h>
17073  #include <linux/notifier.h>
17074  #include <linux/smp.h>
17075 +#include <linux/interrupt.h>
17076  #include <asm/processor.h>
17077  
17078  
17079 @@ -65,6 +66,8 @@
17080   */
17081  bool irq_work_queue_on(struct irq_work *work, int cpu)
17082  {
17083 +       struct llist_head *list;
17084 +
17085         /* All work should have been flushed before going offline */
17086         WARN_ON_ONCE(cpu_is_offline(cpu));
17087  
17088 @@ -75,7 +78,12 @@
17089         if (!irq_work_claim(work))
17090                 return false;
17091  
17092 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17093 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17094 +               list = &per_cpu(lazy_list, cpu);
17095 +       else
17096 +               list = &per_cpu(raised_list, cpu);
17097 +
17098 +       if (llist_add(&work->llnode, list))
17099                 arch_send_call_function_single_ipi(cpu);
17100  
17101         return true;
17102 @@ -86,6 +94,9 @@
17103  /* Enqueue the irq work @work on the current CPU */
17104  bool irq_work_queue(struct irq_work *work)
17105  {
17106 +       struct llist_head *list;
17107 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17108 +
17109         /* Only queue if not already pending */
17110         if (!irq_work_claim(work))
17111                 return false;
17112 @@ -93,13 +104,15 @@
17113         /* Queue the entry and raise the IPI if needed. */
17114         preempt_disable();
17115  
17116 -       /* If the work is "lazy", handle it from next tick if any */
17117 -       if (work->flags & IRQ_WORK_LAZY) {
17118 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17119 -                   tick_nohz_tick_stopped())
17120 -                       arch_irq_work_raise();
17121 -       } else {
17122 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17123 +       lazy_work = work->flags & IRQ_WORK_LAZY;
17124 +
17125 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17126 +               list = this_cpu_ptr(&lazy_list);
17127 +       else
17128 +               list = this_cpu_ptr(&raised_list);
17129 +
17130 +       if (llist_add(&work->llnode, list)) {
17131 +               if (!lazy_work || tick_nohz_tick_stopped())
17132                         arch_irq_work_raise();
17133         }
17134  
17135 @@ -116,9 +129,8 @@
17136         raised = this_cpu_ptr(&raised_list);
17137         lazy = this_cpu_ptr(&lazy_list);
17138  
17139 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
17140 -               if (llist_empty(lazy))
17141 -                       return false;
17142 +       if (llist_empty(raised) && llist_empty(lazy))
17143 +               return false;
17144  
17145         /* All work should have been flushed before going offline */
17146         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17147 @@ -132,7 +144,7 @@
17148         struct irq_work *work;
17149         struct llist_node *llnode;
17150  
17151 -       BUG_ON(!irqs_disabled());
17152 +       BUG_ON_NONRT(!irqs_disabled());
17153  
17154         if (llist_empty(list))
17155                 return;
17156 @@ -169,7 +181,16 @@
17157  void irq_work_run(void)
17158  {
17159         irq_work_run_list(this_cpu_ptr(&raised_list));
17160 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
17161 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17162 +               /*
17163 +                * NOTE: we raise softirq via IPI for safety,
17164 +                * and execute in irq_work_tick() to move the
17165 +                * overhead from hard to soft irq context.
17166 +                */
17167 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
17168 +                       raise_softirq(TIMER_SOFTIRQ);
17169 +       } else
17170 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17171  }
17172  EXPORT_SYMBOL_GPL(irq_work_run);
17173  
17174 @@ -179,8 +200,17 @@
17175  
17176         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17177                 irq_work_run_list(raised);
17178 +
17179 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17180 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17181 +}
17182 +
17183 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17184 +void irq_work_tick_soft(void)
17185 +{
17186         irq_work_run_list(this_cpu_ptr(&lazy_list));
17187  }
17188 +#endif
17189  
17190  /*
17191   * Synchronize against the irq_work @entry, ensures the entry is not
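With the irq_work changes above, PREEMPT_RT_FULL routes unflagged work to the per-CPU lazy_list and runs it from TIMER_SOFTIRQ via irq_work_tick_soft(); only work flagged IRQ_WORK_HARD_IRQ stays on raised_list and runs from the IPI/hardirq path. A sketch of a user that must keep hard-IRQ execution; the callback and the static initializer style are assumptions:

	#include <linux/irq_work.h>

	static void my_hard_cb(struct irq_work *work)
	{
		/* still runs in hard interrupt context on PREEMPT_RT_FULL */
	}

	static struct irq_work my_hard_work = {
		.flags	= IRQ_WORK_HARD_IRQ,
		.func	= my_hard_cb,
	};

	static void kick_my_hard_work(void)
	{
		irq_work_queue(&my_hard_work);	/* lands on raised_list, raises the IPI */
	}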
17192 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.locks linux-4.14/kernel/Kconfig.locks
17193 --- linux-4.14.orig/kernel/Kconfig.locks        2017-11-12 19:46:13.000000000 +0100
17194 +++ linux-4.14/kernel/Kconfig.locks     2018-09-05 11:05:07.000000000 +0200
17195 @@ -225,11 +225,11 @@
17196  
17197  config MUTEX_SPIN_ON_OWNER
17198         def_bool y
17199 -       depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
17200 +       depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17201  
17202  config RWSEM_SPIN_ON_OWNER
17203         def_bool y
17204 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
17205 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17206  
17207  config LOCK_SPIN_ON_OWNER
17208         def_bool y
17209 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/Kconfig.preempt linux-4.14/kernel/Kconfig.preempt
17210 --- linux-4.14.orig/kernel/Kconfig.preempt      2017-11-12 19:46:13.000000000 +0100
17211 +++ linux-4.14/kernel/Kconfig.preempt   2018-09-05 11:05:07.000000000 +0200
17212 @@ -1,3 +1,16 @@
17213 +config PREEMPT
17214 +       bool
17215 +       select PREEMPT_COUNT
17216 +
17217 +config PREEMPT_RT_BASE
17218 +       bool
17219 +       select PREEMPT
17220 +
17221 +config HAVE_PREEMPT_LAZY
17222 +       bool
17223 +
17224 +config PREEMPT_LAZY
17225 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
17226  
17227  choice
17228         prompt "Preemption Model"
17229 @@ -33,9 +46,9 @@
17230  
17231           Select this if you are building a kernel for a desktop system.
17232  
17233 -config PREEMPT
17234 +config PREEMPT__LL
17235         bool "Preemptible Kernel (Low-Latency Desktop)"
17236 -       select PREEMPT_COUNT
17237 +       select PREEMPT
17238         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
17239         help
17240           This option reduces the latency of the kernel by making
17241 @@ -52,6 +65,22 @@
17242           embedded system with latency requirements in the milliseconds
17243           range.
17244  
17245 +config PREEMPT_RTB
17246 +       bool "Preemptible Kernel (Basic RT)"
17247 +       select PREEMPT_RT_BASE
17248 +       help
17249 +         This option is basically the same as (Low-Latency Desktop) but
17250 +         enables changes which are preliminary for the full preemptible
17251 +         RT kernel.
17252 +
17253 +config PREEMPT_RT_FULL
17254 +       bool "Fully Preemptible Kernel (RT)"
17255 +       depends on IRQ_FORCED_THREADING
17256 +       select PREEMPT_RT_BASE
17257 +       select PREEMPT_RCU
17258 +       help
17259 +         All and everything
17260 +
17261  endchoice
17262  
17263  config PREEMPT_COUNT
17264 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ksysfs.c linux-4.14/kernel/ksysfs.c
17265 --- linux-4.14.orig/kernel/ksysfs.c     2017-11-12 19:46:13.000000000 +0100
17266 +++ linux-4.14/kernel/ksysfs.c  2018-09-05 11:05:07.000000000 +0200
17267 @@ -140,6 +140,15 @@
17268  
17269  #endif /* CONFIG_CRASH_CORE */
17270  
17271 +#if defined(CONFIG_PREEMPT_RT_FULL)
17272 +static ssize_t realtime_show(struct kobject *kobj,
17273 +                            struct kobj_attribute *attr, char *buf)
17274 +{
17275 +       return sprintf(buf, "%d\n", 1);
17276 +}
17277 +KERNEL_ATTR_RO(realtime);
17278 +#endif
17279 +
17280  /* whether file capabilities are enabled */
17281  static ssize_t fscaps_show(struct kobject *kobj,
17282                                   struct kobj_attribute *attr, char *buf)
17283 @@ -231,6 +240,9 @@
17284         &rcu_expedited_attr.attr,
17285         &rcu_normal_attr.attr,
17286  #endif
17287 +#ifdef CONFIG_PREEMPT_RT_FULL
17288 +       &realtime_attr.attr,
17289 +#endif
17290         NULL
17291  };
17292  
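The ksysfs.c hunk exposes a read-only /sys/kernel/realtime attribute that reports "1" on a PREEMPT_RT_FULL kernel and does not exist otherwise. A small userspace sketch of probing it; only the sysfs path comes from the hunk above, the rest is illustrative:

	#include <stdio.h>

	static int kernel_is_preempt_rt(void)
	{
		FILE *f = fopen("/sys/kernel/realtime", "r");
		int val = 0;

		if (!f)
			return 0;	/* attribute absent: not an RT_FULL kernel */
		if (fscanf(f, "%d", &val) != 1)
			val = 0;
		fclose(f);
		return val == 1;
	}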
17293 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/lockdep.c linux-4.14/kernel/locking/lockdep.c
17294 --- linux-4.14.orig/kernel/locking/lockdep.c    2018-09-05 11:03:29.000000000 +0200
17295 +++ linux-4.14/kernel/locking/lockdep.c 2018-09-05 11:05:07.000000000 +0200
17296 @@ -3916,6 +3916,7 @@
17297                 }
17298         }
17299  
17300 +#ifndef CONFIG_PREEMPT_RT_FULL
17301         /*
17302          * We dont accurately track softirq state in e.g.
17303          * hardirq contexts (such as on 4KSTACKS), so only
17304 @@ -3930,6 +3931,7 @@
17305                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17306                 }
17307         }
17308 +#endif
17309  
17310         if (!debug_locks)
17311                 print_irqtrace_events(current);
17312 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/locktorture.c linux-4.14/kernel/locking/locktorture.c
17313 --- linux-4.14.orig/kernel/locking/locktorture.c        2018-09-05 11:03:22.000000000 +0200
17314 +++ linux-4.14/kernel/locking/locktorture.c     2018-09-05 11:05:07.000000000 +0200
17315 @@ -26,7 +26,6 @@
17316  #include <linux/kthread.h>
17317  #include <linux/sched/rt.h>
17318  #include <linux/spinlock.h>
17319 -#include <linux/rwlock.h>
17320  #include <linux/mutex.h>
17321  #include <linux/rwsem.h>
17322  #include <linux/smp.h>
17323 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/Makefile linux-4.14/kernel/locking/Makefile
17324 --- linux-4.14.orig/kernel/locking/Makefile     2017-11-12 19:46:13.000000000 +0100
17325 +++ linux-4.14/kernel/locking/Makefile  2018-09-05 11:05:07.000000000 +0200
17326 @@ -3,7 +3,7 @@
17327  # and is generally not a function of system call inputs.
17328  KCOV_INSTRUMENT                := n
17329  
17330 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17331 +obj-y += semaphore.o percpu-rwsem.o
17332  
17333  ifdef CONFIG_FUNCTION_TRACER
17334  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17335 @@ -12,7 +12,11 @@
17336  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17337  endif
17338  
17339 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17340 +obj-y += mutex.o
17341  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17342 +endif
17343 +obj-y += rwsem.o
17344  obj-$(CONFIG_LOCKDEP) += lockdep.o
17345  ifeq ($(CONFIG_PROC_FS),y)
17346  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17347 @@ -25,8 +29,11 @@
17348  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17349  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17350  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17351 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17352  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17353  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17354 +endif
17355 +obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17356  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17357  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17358  obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17359 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/mutex-rt.c linux-4.14/kernel/locking/mutex-rt.c
17360 --- linux-4.14.orig/kernel/locking/mutex-rt.c   1970-01-01 01:00:00.000000000 +0100
17361 +++ linux-4.14/kernel/locking/mutex-rt.c        2018-09-05 11:05:07.000000000 +0200
17362 @@ -0,0 +1,223 @@
17363 +/*
17364 + * kernel/rt.c
17365 + *
17366 + * Real-Time Preemption Support
17367 + *
17368 + * started by Ingo Molnar:
17369 + *
17370 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17371 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17372 + *
17373 + * historic credit for proving that Linux spinlocks can be implemented via
17374 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17375 + * and others) who prototyped it on 2.4 and did lots of comparative
17376 + * research and analysis; TimeSys, for proving that you can implement a
17377 + * fully preemptible kernel via the use of IRQ threading and mutexes;
17378 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
17379 + * right one; and to MontaVista, who ported pmutexes to 2.6.
17380 + *
17381 + * This code is a from-scratch implementation and is not based on pmutexes,
17382 + * but the idea of converting spinlocks to mutexes is used here too.
17383 + *
17384 + * lock debugging, locking tree, deadlock detection:
17385 + *
17386 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17387 + *  Released under the General Public License (GPL).
17388 + *
17389 + * Includes portions of the generic R/W semaphore implementation from:
17390 + *
17391 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
17392 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17393 + *  - Derived also from comments by Linus
17394 + *
17395 + * Pending ownership of locks and ownership stealing:
17396 + *
17397 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17398 + *
17399 + *   (also by Steven Rostedt)
17400 + *    - Converted single pi_lock to individual task locks.
17401 + *
17402 + * By Esben Nielsen:
17403 + *    Doing priority inheritance with help of the scheduler.
17404 + *
17405 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17406 + *  - major rework based on Esben Nielsen's initial patch
17407 + *  - replaced thread_info references by task_struct refs
17408 + *  - removed task->pending_owner dependency
17409 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17410 + *    in the scheduler return path as discussed with Steven Rostedt
17411 + *
17412 + *  Copyright (C) 2006, Kihon Technologies Inc.
17413 + *    Steven Rostedt <rostedt@goodmis.org>
17414 + *  - debugged and patched Thomas Gleixner's rework.
17415 + *  - added back the cmpxchg to the rework.
17416 + *  - turned atomic require back on for SMP.
17417 + */
17418 +
17419 +#include <linux/spinlock.h>
17420 +#include <linux/rtmutex.h>
17421 +#include <linux/sched.h>
17422 +#include <linux/delay.h>
17423 +#include <linux/module.h>
17424 +#include <linux/kallsyms.h>
17425 +#include <linux/syscalls.h>
17426 +#include <linux/interrupt.h>
17427 +#include <linux/plist.h>
17428 +#include <linux/fs.h>
17429 +#include <linux/futex.h>
17430 +#include <linux/hrtimer.h>
17431 +
17432 +#include "rtmutex_common.h"
17433 +
17434 +/*
17435 + * struct mutex functions
17436 + */
17437 +void __mutex_do_init(struct mutex *mutex, const char *name,
17438 +                    struct lock_class_key *key)
17439 +{
17440 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17441 +       /*
17442 +        * Make sure we are not reinitializing a held lock:
17443 +        */
17444 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
17445 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
17446 +#endif
17447 +       mutex->lock.save_state = 0;
17448 +}
17449 +EXPORT_SYMBOL(__mutex_do_init);
17450 +
17451 +void __lockfunc _mutex_lock(struct mutex *lock)
17452 +{
17453 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17454 +       __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17455 +}
17456 +EXPORT_SYMBOL(_mutex_lock);
17457 +
17458 +void __lockfunc _mutex_lock_io(struct mutex *lock)
17459 +{
17460 +       int token;
17461 +
17462 +       token = io_schedule_prepare();
17463 +       _mutex_lock(lock);
17464 +       io_schedule_finish(token);
17465 +}
17466 +EXPORT_SYMBOL_GPL(_mutex_lock_io);
17467 +
17468 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
17469 +{
17470 +       int ret;
17471 +
17472 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17473 +       ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17474 +       if (ret)
17475 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
17476 +       return ret;
17477 +}
17478 +EXPORT_SYMBOL(_mutex_lock_interruptible);
17479 +
17480 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
17481 +{
17482 +       int ret;
17483 +
17484 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17485 +       ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17486 +       if (ret)
17487 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
17488 +       return ret;
17489 +}
17490 +EXPORT_SYMBOL(_mutex_lock_killable);
17491 +
17492 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17493 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
17494 +{
17495 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17496 +       __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17497 +}
17498 +EXPORT_SYMBOL(_mutex_lock_nested);
17499 +
17500 +void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
17501 +{
17502 +       int token;
17503 +
17504 +       token = io_schedule_prepare();
17505 +
17506 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17507 +       __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17508 +
17509 +       io_schedule_finish(token);
17510 +}
17511 +EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
17512 +
17513 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
17514 +{
17515 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
17516 +       __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
17517 +}
17518 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
17519 +
17520 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
17521 +{
17522 +       int ret;
17523 +
17524 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
17525 +       ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
17526 +       if (ret)
17527 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
17528 +       return ret;
17529 +}
17530 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
17531 +
17532 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
17533 +{
17534 +       int ret;
17535 +
17536 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17537 +       ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
17538 +       if (ret)
17539 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
17540 +       return ret;
17541 +}
17542 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
17543 +#endif
17544 +
17545 +int __lockfunc _mutex_trylock(struct mutex *lock)
17546 +{
17547 +       int ret = __rt_mutex_trylock(&lock->lock);
17548 +
17549 +       if (ret)
17550 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
17551 +
17552 +       return ret;
17553 +}
17554 +EXPORT_SYMBOL(_mutex_trylock);
17555 +
17556 +void __lockfunc _mutex_unlock(struct mutex *lock)
17557 +{
17558 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
17559 +       __rt_mutex_unlock(&lock->lock);
17560 +}
17561 +EXPORT_SYMBOL(_mutex_unlock);
17562 +
17563 +/**
17564 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
17565 + * @cnt: the atomic which we are to dec
17566 + * @lock: the mutex to return holding if we dec to 0
17567 + *
17568 + * return true and hold lock if we dec to 0, return false otherwise
17569 + */
17570 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
17571 +{
17572 +       /* dec if we can't possibly hit 0 */
17573 +       if (atomic_add_unless(cnt, -1, 1))
17574 +               return 0;
17575 +       /* we might hit 0, so take the lock */
17576 +       mutex_lock(lock);
17577 +       if (!atomic_dec_and_test(cnt)) {
17578 +               /* when we actually did the dec, we didn't hit 0 */
17579 +               mutex_unlock(lock);
17580 +               return 0;
17581 +       }
17582 +       /* we hit 0, and we hold the lock */
17583 +       return 1;
17584 +}
17585 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
17586 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex.c linux-4.14/kernel/locking/rtmutex.c
17587 --- linux-4.14.orig/kernel/locking/rtmutex.c    2018-09-05 11:03:22.000000000 +0200
17588 +++ linux-4.14/kernel/locking/rtmutex.c 2018-09-05 11:05:07.000000000 +0200
17589 @@ -7,6 +7,11 @@
17590   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17591   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
17592   *  Copyright (C) 2006 Esben Nielsen
17593 + *  Adaptive Spinlocks:
17594 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
17595 + *                                  and Peter Morreale,
17596 + * Adaptive Spinlocks simplification:
17597 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
17598   *
17599   *  See Documentation/locking/rt-mutex-design.txt for details.
17600   */
17601 @@ -18,6 +23,8 @@
17602  #include <linux/sched/wake_q.h>
17603  #include <linux/sched/debug.h>
17604  #include <linux/timer.h>
17605 +#include <linux/ww_mutex.h>
17606 +#include <linux/blkdev.h>
17607  
17608  #include "rtmutex_common.h"
17609  
17610 @@ -135,6 +142,12 @@
17611                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
17612  }
17613  
17614 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
17615 +{
17616 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
17617 +               waiter != PI_REQUEUE_INPROGRESS;
17618 +}
17619 +
17620  /*
17621   * We can speed up the acquire/release, if there's no debugging state to be
17622   * set up.
17623 @@ -228,7 +241,7 @@
17624   * Only use with rt_mutex_waiter_{less,equal}()
17625   */
17626  #define task_to_waiter(p)      \
17627 -       &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
17628 +       &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
17629  
17630  static inline int
17631  rt_mutex_waiter_less(struct rt_mutex_waiter *left,
17632 @@ -268,6 +281,27 @@
17633         return 1;
17634  }
17635  
17636 +#define STEAL_NORMAL  0
17637 +#define STEAL_LATERAL 1
17638 +
17639 +static inline int
17640 +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
17641 +{
17642 +       struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
17643 +
17644 +       if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
17645 +               return 1;
17646 +
17647 +       /*
17648 +        * Note that RT tasks are excluded from lateral-steals
17649 +        * to prevent the introduction of an unbounded latency.
17650 +        */
17651 +       if (mode == STEAL_NORMAL || rt_task(waiter->task))
17652 +               return 0;
17653 +
17654 +       return rt_mutex_waiter_equal(waiter, top_waiter);
17655 +}
17656 +
17657  static void
17658  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
17659  {
17660 @@ -372,6 +406,14 @@
17661         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
17662  }
17663  
17664 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
17665 +{
17666 +       if (waiter->savestate)
17667 +               wake_up_lock_sleeper(waiter->task);
17668 +       else
17669 +               wake_up_process(waiter->task);
17670 +}
17671 +
17672  /*
17673   * Max number of times we'll walk the boosting chain:
17674   */
17675 @@ -379,7 +421,8 @@
17676  
17677  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
17678  {
17679 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
17680 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
17681 +               p->pi_blocked_on->lock : NULL;
17682  }
17683  
17684  /*
17685 @@ -515,7 +558,7 @@
17686          * reached or the state of the chain has changed while we
17687          * dropped the locks.
17688          */
17689 -       if (!waiter)
17690 +       if (!rt_mutex_real_waiter(waiter))
17691                 goto out_unlock_pi;
17692  
17693         /*
17694 @@ -696,13 +739,16 @@
17695          * follow here. This is the end of the chain we are walking.
17696          */
17697         if (!rt_mutex_owner(lock)) {
17698 +               struct rt_mutex_waiter *lock_top_waiter;
17699 +
17700                 /*
17701                  * If the requeue [7] above changed the top waiter,
17702                  * then we need to wake the new top waiter up to try
17703                  * to get the lock.
17704                  */
17705 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
17706 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
17707 +               lock_top_waiter = rt_mutex_top_waiter(lock);
17708 +               if (prerequeue_top_waiter != lock_top_waiter)
17709 +                       rt_mutex_wake_waiter(lock_top_waiter);
17710                 raw_spin_unlock_irq(&lock->wait_lock);
17711                 return 0;
17712         }
17713 @@ -804,9 +850,11 @@
17714   * @task:   The task which wants to acquire the lock
17715   * @waiter: The waiter that is queued to the lock's wait tree if the
17716   *         callsite called task_blocked_on_lock(), otherwise NULL
17717 + * @mode:   Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
17718   */
17719 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
17720 -                               struct rt_mutex_waiter *waiter)
17721 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
17722 +                                 struct task_struct *task,
17723 +                                 struct rt_mutex_waiter *waiter, int mode)
17724  {
17725         lockdep_assert_held(&lock->wait_lock);
17726  
17727 @@ -842,12 +890,11 @@
17728          */
17729         if (waiter) {
17730                 /*
17731 -                * If waiter is not the highest priority waiter of
17732 -                * @lock, give up.
17733 +                * If waiter is not the highest priority waiter of @lock,
17734 +                * or its peer when lateral steal is allowed, give up.
17735                  */
17736 -               if (waiter != rt_mutex_top_waiter(lock))
17737 +               if (!rt_mutex_steal(lock, waiter, mode))
17738                         return 0;
17739 -
17740                 /*
17741                  * We can acquire the lock. Remove the waiter from the
17742                  * lock waiters tree.
17743 @@ -865,14 +912,12 @@
17744                  */
17745                 if (rt_mutex_has_waiters(lock)) {
17746                         /*
17747 -                        * If @task->prio is greater than or equal to
17748 -                        * the top waiter priority (kernel view),
17749 -                        * @task lost.
17750 +                        * If @task->prio is greater than the top waiter
17751 +                        * priority (kernel view), or equal to it when a
17752 +                        * lateral steal is forbidden, @task lost.
17753                          */
17754 -                       if (!rt_mutex_waiter_less(task_to_waiter(task),
17755 -                                                 rt_mutex_top_waiter(lock)))
17756 +                       if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
17757                                 return 0;
17758 -
17759                         /*
17760                          * The current top waiter stays enqueued. We
17761                          * don't have to change anything in the lock
17762 @@ -919,6 +964,351 @@
17763         return 1;
17764  }
17765  
17766 +#ifdef CONFIG_PREEMPT_RT_FULL
17767 +/*
17768 + * preemptible spin_lock functions:
17769 + */
17770 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
17771 +                                        void  (*slowfn)(struct rt_mutex *lock))
17772 +{
17773 +       might_sleep_no_state_check();
17774 +
17775 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
17776 +               return;
17777 +       else
17778 +               slowfn(lock);
17779 +}
17780 +
17781 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
17782 +                                          void  (*slowfn)(struct rt_mutex *lock))
17783 +{
17784 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
17785 +               return;
17786 +       else
17787 +               slowfn(lock);
17788 +}
17789 +#ifdef CONFIG_SMP
17790 +/*
17791 + * Note that owner is a speculative pointer and dereferencing relies
17792 + * on rcu_read_lock() and the check against the lock owner.
17793 + */
17794 +static int adaptive_wait(struct rt_mutex *lock,
17795 +                        struct task_struct *owner)
17796 +{
17797 +       int res = 0;
17798 +
17799 +       rcu_read_lock();
17800 +       for (;;) {
17801 +               if (owner != rt_mutex_owner(lock))
17802 +                       break;
17803 +               /*
17804 +                * Ensure that owner->on_cpu is dereferenced _after_
17805 +                * checking the above to be valid.
17806 +                */
17807 +               barrier();
17808 +               if (!owner->on_cpu) {
17809 +                       res = 1;
17810 +                       break;
17811 +               }
17812 +               cpu_relax();
17813 +       }
17814 +       rcu_read_unlock();
17815 +       return res;
17816 +}
17817 +#else
17818 +static int adaptive_wait(struct rt_mutex *lock,
17819 +                        struct task_struct *orig_owner)
17820 +{
17821 +       return 1;
17822 +}
17823 +#endif
17824 +
17825 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
17826 +                                  struct rt_mutex_waiter *waiter,
17827 +                                  struct task_struct *task,
17828 +                                  enum rtmutex_chainwalk chwalk);
17829 +/*
17830 + * Slow path lock function spin_lock style: this variant is very
17831 + * careful not to miss any non-lock wakeups.
17832 + *
17833 + * We store the current state under p->pi_lock in p->saved_state and
17834 + * the try_to_wake_up() code handles this accordingly.
17835 + */
17836 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
17837 +                                         struct rt_mutex_waiter *waiter,
17838 +                                         unsigned long flags)
17839 +{
17840 +       struct task_struct *lock_owner, *self = current;
17841 +       struct rt_mutex_waiter *top_waiter;
17842 +       int ret;
17843 +
17844 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
17845 +               return;
17846 +
17847 +       BUG_ON(rt_mutex_owner(lock) == self);
17848 +
17849 +       /*
17850 +        * We save whatever state the task is in and we'll restore it
17851 +        * after acquiring the lock taking real wakeups into account
17852 +        * as well. We are serialized via pi_lock against wakeups. See
17853 +        * try_to_wake_up().
17854 +        */
17855 +       raw_spin_lock(&self->pi_lock);
17856 +       self->saved_state = self->state;
17857 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17858 +       raw_spin_unlock(&self->pi_lock);
17859 +
17860 +       ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
17861 +       BUG_ON(ret);
17862 +
17863 +       for (;;) {
17864 +               /* Try to acquire the lock again. */
17865 +               if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
17866 +                       break;
17867 +
17868 +               top_waiter = rt_mutex_top_waiter(lock);
17869 +               lock_owner = rt_mutex_owner(lock);
17870 +
17871 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17872 +
17873 +               debug_rt_mutex_print_deadlock(waiter);
17874 +
17875 +               if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
17876 +                       schedule();
17877 +
17878 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
17879 +
17880 +               raw_spin_lock(&self->pi_lock);
17881 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
17882 +               raw_spin_unlock(&self->pi_lock);
17883 +       }
17884 +
17885 +       /*
17886 +        * Restore the task state to current->saved_state. We set it
17887 +        * to the original state above and the try_to_wake_up() code
17888 +        * has possibly updated it when a real (non-rtmutex) wakeup
17889 +        * happened while we were blocked. Clear saved_state so
17890 +        * try_to_wake_up() does not get confused.
17891 +        */
17892 +       raw_spin_lock(&self->pi_lock);
17893 +       __set_current_state_no_track(self->saved_state);
17894 +       self->saved_state = TASK_RUNNING;
17895 +       raw_spin_unlock(&self->pi_lock);
17896 +
17897 +       /*
17898 +        * try_to_take_rt_mutex() sets the waiter bit
17899 +        * unconditionally. We might have to fix that up:
17900 +        */
17901 +       fixup_rt_mutex_waiters(lock);
17902 +
17903 +       BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
17904 +       BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
17905 +}
17906 +
17907 +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
17908 +{
17909 +       struct rt_mutex_waiter waiter;
17910 +       unsigned long flags;
17911 +
17912 +       rt_mutex_init_waiter(&waiter, true);
17913 +
17914 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
17915 +       rt_spin_lock_slowlock_locked(lock, &waiter, flags);
17916 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17917 +       debug_rt_mutex_free_waiter(&waiter);
17918 +}
17919 +
17920 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
17921 +                                            struct wake_q_head *wake_q,
17922 +                                            struct wake_q_head *wq_sleeper);
17923 +/*
17924 + * Slow path to release a rt_mutex spin_lock style
17925 + */
17926 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
17927 +{
17928 +       unsigned long flags;
17929 +       DEFINE_WAKE_Q(wake_q);
17930 +       DEFINE_WAKE_Q(wake_sleeper_q);
17931 +       bool postunlock;
17932 +
17933 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
17934 +       postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
17935 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
17936 +
17937 +       if (postunlock)
17938 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17939 +}
17940 +
17941 +void __lockfunc rt_spin_lock(spinlock_t *lock)
17942 +{
17943 +       sleeping_lock_inc();
17944 +       migrate_disable();
17945 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
17946 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17947 +}
17948 +EXPORT_SYMBOL(rt_spin_lock);
17949 +
17950 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
17951 +{
17952 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
17953 +}
17954 +
17955 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
17956 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
17957 +{
17958 +       sleeping_lock_inc();
17959 +       migrate_disable();
17960 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
17961 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
17962 +}
17963 +EXPORT_SYMBOL(rt_spin_lock_nested);
17964 +#endif
17965 +
17966 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
17967 +{
17968 +       /* NOTE: we always pass in '1' for nested, for simplicity */
17969 +       spin_release(&lock->dep_map, 1, _RET_IP_);
17970 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
17971 +       migrate_enable();
17972 +       sleeping_lock_dec();
17973 +}
17974 +EXPORT_SYMBOL(rt_spin_unlock);
17975 +
17976 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
17977 +{
17978 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
17979 +}
17980 +EXPORT_SYMBOL(__rt_spin_unlock);
17981 +
17982 +/*
17983 + * Wait for the lock to get unlocked: instead of polling for an unlock
17984 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
17985 + * schedule if there's contention:
17986 + */
17987 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
17988 +{
17989 +       spin_lock(lock);
17990 +       spin_unlock(lock);
17991 +}
17992 +EXPORT_SYMBOL(rt_spin_unlock_wait);
17993 +
17994 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
17995 +{
17996 +       int ret;
17997 +
17998 +       sleeping_lock_inc();
17999 +       migrate_disable();
18000 +       ret = __rt_mutex_trylock(&lock->lock);
18001 +       if (ret) {
18002 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18003 +       } else {
18004 +               migrate_enable();
18005 +               sleeping_lock_dec();
18006 +       }
18007 +       return ret;
18008 +}
18009 +EXPORT_SYMBOL(rt_spin_trylock);
18010 +
18011 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18012 +{
18013 +       int ret;
18014 +
18015 +       local_bh_disable();
18016 +       ret = __rt_mutex_trylock(&lock->lock);
18017 +       if (ret) {
18018 +               sleeping_lock_inc();
18019 +               migrate_disable();
18020 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18021 +       } else
18022 +               local_bh_enable();
18023 +       return ret;
18024 +}
18025 +EXPORT_SYMBOL(rt_spin_trylock_bh);
18026 +
18027 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18028 +{
18029 +       int ret;
18030 +
18031 +       *flags = 0;
18032 +       ret = __rt_mutex_trylock(&lock->lock);
18033 +       if (ret) {
18034 +               sleeping_lock_inc();
18035 +               migrate_disable();
18036 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18037 +       }
18038 +       return ret;
18039 +}
18040 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
18041 +
18042 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18043 +{
18044 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
18045 +       if (atomic_add_unless(atomic, -1, 1))
18046 +               return 0;
18047 +       rt_spin_lock(lock);
18048 +       if (atomic_dec_and_test(atomic))
18049 +               return 1;
18050 +       rt_spin_unlock(lock);
18051 +       return 0;
18052 +}
18053 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
18054 +
18055 +void
18056 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18057 +{
18058 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18059 +       /*
18060 +        * Make sure we are not reinitializing a held lock:
18061 +        */
18062 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18063 +       lockdep_init_map(&lock->dep_map, name, key, 0);
18064 +#endif
18065 +}
18066 +EXPORT_SYMBOL(__rt_spin_lock_init);
18067 +
18068 +#endif /* PREEMPT_RT_FULL */
18069 +
18070 +#ifdef CONFIG_PREEMPT_RT_FULL
18071 +       static inline int __sched
18072 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18073 +{
18074 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18075 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
18076 +
18077 +       if (!hold_ctx)
18078 +               return 0;
18079 +
18080 +       if (unlikely(ctx == hold_ctx))
18081 +               return -EALREADY;
18082 +
18083 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18084 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18085 +#ifdef CONFIG_DEBUG_MUTEXES
18086 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18087 +               ctx->contending_lock = ww;
18088 +#endif
18089 +               return -EDEADLK;
18090 +       }
18091 +
18092 +       return 0;
18093 +}
18094 +#else
18095 +       static inline int __sched
18096 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18097 +{
18098 +       BUG();
18099 +       return 0;
18100 +}
18101 +
18102 +#endif
18103 +
18104 +static inline int
18105 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18106 +                    struct rt_mutex_waiter *waiter)
18107 +{
18108 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18109 +}
18110 +
18111  /*
18112   * Task blocks on lock.
18113   *
18114 @@ -951,6 +1341,22 @@
18115                 return -EDEADLK;
18116  
18117         raw_spin_lock(&task->pi_lock);
18118 +       /*
18119 +        * In the case of futex requeue PI, this will be a proxy
18120 +        * lock. The task will wake unaware that it is enqueued on
18121 +        * this lock. Avoid blocking on two locks and corrupting
18122 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18123 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
18124 +        * before requeue (due to a signal or timeout). Do not enqueue
18125 +        * the task if PI_WAKEUP_INPROGRESS is set.
18126 +        */
18127 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18128 +               raw_spin_unlock(&task->pi_lock);
18129 +               return -EAGAIN;
18130 +       }
18131 +
18132 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
18133 +
18134         waiter->task = task;
18135         waiter->lock = lock;
18136         waiter->prio = task->prio;
18137 @@ -974,7 +1380,7 @@
18138                 rt_mutex_enqueue_pi(owner, waiter);
18139  
18140                 rt_mutex_adjust_prio(owner);
18141 -               if (owner->pi_blocked_on)
18142 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
18143                         chain_walk = 1;
18144         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18145                 chain_walk = 1;
18146 @@ -1016,6 +1422,7 @@
18147   * Called with lock->wait_lock held and interrupts disabled.
18148   */
18149  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18150 +                                   struct wake_q_head *wake_sleeper_q,
18151                                     struct rt_mutex *lock)
18152  {
18153         struct rt_mutex_waiter *waiter;
18154 @@ -1055,7 +1462,10 @@
18155          * Pairs with preempt_enable() in rt_mutex_postunlock();
18156          */
18157         preempt_disable();
18158 -       wake_q_add(wake_q, waiter->task);
18159 +       if (waiter->savestate)
18160 +               wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18161 +       else
18162 +               wake_q_add(wake_q, waiter->task);
18163         raw_spin_unlock(&current->pi_lock);
18164  }
18165  
18166 @@ -1070,7 +1480,7 @@
18167  {
18168         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18169         struct task_struct *owner = rt_mutex_owner(lock);
18170 -       struct rt_mutex *next_lock;
18171 +       struct rt_mutex *next_lock = NULL;
18172  
18173         lockdep_assert_held(&lock->wait_lock);
18174  
18175 @@ -1096,7 +1506,8 @@
18176         rt_mutex_adjust_prio(owner);
18177  
18178         /* Store the lock on which owner is blocked or NULL */
18179 -       next_lock = task_blocked_on_lock(owner);
18180 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
18181 +               next_lock = task_blocked_on_lock(owner);
18182  
18183         raw_spin_unlock(&owner->pi_lock);
18184  
18185 @@ -1132,26 +1543,28 @@
18186         raw_spin_lock_irqsave(&task->pi_lock, flags);
18187  
18188         waiter = task->pi_blocked_on;
18189 -       if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18190 +       if (!rt_mutex_real_waiter(waiter) ||
18191 +           rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18192                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18193                 return;
18194         }
18195         next_lock = waiter->lock;
18196 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18197  
18198         /* gets dropped in rt_mutex_adjust_prio_chain()! */
18199         get_task_struct(task);
18200  
18201 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18202         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18203                                    next_lock, NULL, task);
18204  }
18205  
18206 -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18207 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18208  {
18209         debug_rt_mutex_init_waiter(waiter);
18210         RB_CLEAR_NODE(&waiter->pi_tree_entry);
18211         RB_CLEAR_NODE(&waiter->tree_entry);
18212         waiter->task = NULL;
18213 +       waiter->savestate = savestate;
18214  }
18215  
18216  /**
18217 @@ -1167,7 +1580,8 @@
18218  static int __sched
18219  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18220                     struct hrtimer_sleeper *timeout,
18221 -                   struct rt_mutex_waiter *waiter)
18222 +                   struct rt_mutex_waiter *waiter,
18223 +                   struct ww_acquire_ctx *ww_ctx)
18224  {
18225         int ret = 0;
18226  
18227 @@ -1176,16 +1590,17 @@
18228                 if (try_to_take_rt_mutex(lock, current, waiter))
18229                         break;
18230  
18231 -               /*
18232 -                * TASK_INTERRUPTIBLE checks for signals and
18233 -                * timeout. Ignored otherwise.
18234 -                */
18235 -               if (likely(state == TASK_INTERRUPTIBLE)) {
18236 -                       /* Signal pending? */
18237 -                       if (signal_pending(current))
18238 -                               ret = -EINTR;
18239 -                       if (timeout && !timeout->task)
18240 -                               ret = -ETIMEDOUT;
18241 +               if (timeout && !timeout->task) {
18242 +                       ret = -ETIMEDOUT;
18243 +                       break;
18244 +               }
18245 +               if (signal_pending_state(state, current)) {
18246 +                       ret = -EINTR;
18247 +                       break;
18248 +               }
18249 +
18250 +               if (ww_ctx && ww_ctx->acquired > 0) {
18251 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
18252                         if (ret)
18253                                 break;
18254                 }
18255 @@ -1224,33 +1639,104 @@
18256         }
18257  }
18258  
18259 -/*
18260 - * Slow path lock function:
18261 - */
18262 -static int __sched
18263 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
18264 -                 struct hrtimer_sleeper *timeout,
18265 -                 enum rtmutex_chainwalk chwalk)
18266 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18267 +                                                  struct ww_acquire_ctx *ww_ctx)
18268  {
18269 -       struct rt_mutex_waiter waiter;
18270 -       unsigned long flags;
18271 -       int ret = 0;
18272 +#ifdef CONFIG_DEBUG_MUTEXES
18273 +       /*
18274 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18275 +        * but released with a normal mutex_unlock in this call.
18276 +        *
18277 +        * This should never happen, always use ww_mutex_unlock.
18278 +        */
18279 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
18280  
18281 -       rt_mutex_init_waiter(&waiter);
18282 +       /*
18283 +        * Not quite done after calling ww_acquire_done() ?
18284 +        */
18285 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
18286 +
18287 +       if (ww_ctx->contending_lock) {
18288 +               /*
18289 +                * After -EDEADLK you tried to
18290 +                * acquire a different ww_mutex? Bad!
18291 +                */
18292 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
18293 +
18294 +               /*
18295 +                * You called ww_mutex_lock after receiving -EDEADLK,
18296 +                * but 'forgot' to unlock everything else first?
18297 +                */
18298 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18299 +               ww_ctx->contending_lock = NULL;
18300 +       }
18301  
18302         /*
18303 -        * Technically we could use raw_spin_[un]lock_irq() here, but this can
18304 -        * be called in early boot if the cmpxchg() fast path is disabled
18305 -        * (debug, no architecture support). In this case we will acquire the
18306 -        * rtmutex with lock->wait_lock held. But we cannot unconditionally
18307 -        * enable interrupts in that early boot case. So we need to use the
18308 -        * irqsave/restore variants.
18309 +        * Naughty, using a different class will lead to undefined behavior!
18310          */
18311 -       raw_spin_lock_irqsave(&lock->wait_lock, flags);
18312 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18313 +#endif
18314 +       ww_ctx->acquired++;
18315 +}
18316 +
18317 +#ifdef CONFIG_PREEMPT_RT_FULL
18318 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18319 +                                 struct ww_acquire_ctx *ww_ctx)
18320 +{
18321 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18322 +       struct rt_mutex_waiter *waiter, *n;
18323 +
18324 +       /*
18325 +        * This branch gets optimized out for the common case,
18326 +        * and is only important for ww_mutex_lock.
18327 +        */
18328 +       ww_mutex_lock_acquired(ww, ww_ctx);
18329 +       ww->ctx = ww_ctx;
18330 +
18331 +       /*
18332 +        * Give any possible sleeping processes the chance to wake up,
18333 +        * so they can recheck if they have to back off.
18334 +        */
18335 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18336 +                                            tree_entry) {
18337 +               /* XXX debug rt mutex waiter wakeup */
18338 +
18339 +               BUG_ON(waiter->lock != lock);
18340 +               rt_mutex_wake_waiter(waiter);
18341 +       }
18342 +}
18343 +
18344 +#else
18345 +
18346 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18347 +                                 struct ww_acquire_ctx *ww_ctx)
18348 +{
18349 +       BUG();
18350 +}
18351 +#endif
18352 +
18353 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18354 +                                    struct hrtimer_sleeper *timeout,
18355 +                                    enum rtmutex_chainwalk chwalk,
18356 +                                    struct ww_acquire_ctx *ww_ctx,
18357 +                                    struct rt_mutex_waiter *waiter)
18358 +{
18359 +       int ret;
18360 +
18361 +#ifdef CONFIG_PREEMPT_RT_FULL
18362 +       if (ww_ctx) {
18363 +               struct ww_mutex *ww;
18364 +
18365 +               ww = container_of(lock, struct ww_mutex, base.lock);
18366 +               if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18367 +                       return -EALREADY;
18368 +       }
18369 +#endif
18370  
18371         /* Try to acquire the lock again: */
18372         if (try_to_take_rt_mutex(lock, current, NULL)) {
18373 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18374 +               if (ww_ctx)
18375 +                       ww_mutex_account_lock(lock, ww_ctx);
18376                 return 0;
18377         }
18378  
18379 @@ -1260,17 +1746,27 @@
18380         if (unlikely(timeout))
18381                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18382  
18383 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18384 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18385  
18386 -       if (likely(!ret))
18387 +       if (likely(!ret)) {
18388                 /* sleep on the mutex */
18389 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18390 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18391 +                                         ww_ctx);
18392 +       } else if (ww_ctx) {
18393 +               /* ww_mutex received EDEADLK, let it become EALREADY */
18394 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
18395 +               BUG_ON(!ret);
18396 +       }
18397  
18398         if (unlikely(ret)) {
18399                 __set_current_state(TASK_RUNNING);
18400                 if (rt_mutex_has_waiters(lock))
18401 -                       remove_waiter(lock, &waiter);
18402 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18403 +                       remove_waiter(lock, waiter);
18404 +               /* ww_mutex want to report EDEADLK/EALREADY, let them */
18405 +               if (!ww_ctx)
18406 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
18407 +       } else if (ww_ctx) {
18408 +               ww_mutex_account_lock(lock, ww_ctx);
18409         }
18410  
18411         /*
18412 @@ -1278,6 +1774,36 @@
18413          * unconditionally. We might have to fix that up.
18414          */
18415         fixup_rt_mutex_waiters(lock);
18416 +       return ret;
18417 +}
18418 +
18419 +/*
18420 + * Slow path lock function:
18421 + */
18422 +static int __sched
18423 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
18424 +                 struct hrtimer_sleeper *timeout,
18425 +                 enum rtmutex_chainwalk chwalk,
18426 +                 struct ww_acquire_ctx *ww_ctx)
18427 +{
18428 +       struct rt_mutex_waiter waiter;
18429 +       unsigned long flags;
18430 +       int ret = 0;
18431 +
18432 +       rt_mutex_init_waiter(&waiter, false);
18433 +
18434 +       /*
18435 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
18436 +        * be called in early boot if the cmpxchg() fast path is disabled
18437 +        * (debug, no architecture support). In this case we will acquire the
18438 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
18439 +        * enable interrupts in that early boot case. So we need to use the
18440 +        * irqsave/restore variants.
18441 +        */
18442 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
18443 +
18444 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
18445 +                                      &waiter);
18446  
18447         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18448  
18449 @@ -1338,7 +1864,8 @@
18450   * Return whether the current task needs to call rt_mutex_postunlock().
18451   */
18452  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
18453 -                                       struct wake_q_head *wake_q)
18454 +                                       struct wake_q_head *wake_q,
18455 +                                       struct wake_q_head *wake_sleeper_q)
18456  {
18457         unsigned long flags;
18458  
18459 @@ -1392,7 +1919,7 @@
18460          *
18461          * Queue the next waiter for wakeup once we release the wait_lock.
18462          */
18463 -       mark_wakeup_next_waiter(wake_q, lock);
18464 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
18465         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18466  
18467         return true; /* call rt_mutex_postunlock() */
18468 @@ -1406,29 +1933,45 @@
18469   */
18470  static inline int
18471  rt_mutex_fastlock(struct rt_mutex *lock, int state,
18472 +                 struct ww_acquire_ctx *ww_ctx,
18473                   int (*slowfn)(struct rt_mutex *lock, int state,
18474                                 struct hrtimer_sleeper *timeout,
18475 -                               enum rtmutex_chainwalk chwalk))
18476 +                               enum rtmutex_chainwalk chwalk,
18477 +                               struct ww_acquire_ctx *ww_ctx))
18478  {
18479         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18480                 return 0;
18481  
18482 -       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
18483 +       /*
18484 +        * If rt_mutex blocks, the function sched_submit_work will not call
18485 +        * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
18486 +        * We must call blk_schedule_flush_plug here; if we don't call it,
18487 +        * a deadlock in device mapper may happen.
18488 +        */
18489 +       if (unlikely(blk_needs_flush_plug(current)))
18490 +               blk_schedule_flush_plug(current);
18491 +
18492 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
18493  }
18494  
18495  static inline int
18496  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
18497                         struct hrtimer_sleeper *timeout,
18498                         enum rtmutex_chainwalk chwalk,
18499 +                       struct ww_acquire_ctx *ww_ctx,
18500                         int (*slowfn)(struct rt_mutex *lock, int state,
18501                                       struct hrtimer_sleeper *timeout,
18502 -                                     enum rtmutex_chainwalk chwalk))
18503 +                                     enum rtmutex_chainwalk chwalk,
18504 +                                     struct ww_acquire_ctx *ww_ctx))
18505  {
18506         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
18507             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18508                 return 0;
18509  
18510 -       return slowfn(lock, state, timeout, chwalk);
18511 +       if (unlikely(blk_needs_flush_plug(current)))
18512 +               blk_schedule_flush_plug(current);
18513 +
18514 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
18515  }
18516  
18517  static inline int
18518 @@ -1444,9 +1987,11 @@
18519  /*
18520   * Performs the wakeup of the the top-waiter and re-enables preemption.
18521   */
18522 -void rt_mutex_postunlock(struct wake_q_head *wake_q)
18523 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
18524 +                        struct wake_q_head *wake_sleeper_q)
18525  {
18526         wake_up_q(wake_q);
18527 +       wake_up_q_sleeper(wake_sleeper_q);
18528  
18529         /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
18530         preempt_enable();
18531 @@ -1455,15 +2000,40 @@
18532  static inline void
18533  rt_mutex_fastunlock(struct rt_mutex *lock,
18534                     bool (*slowfn)(struct rt_mutex *lock,
18535 -                                  struct wake_q_head *wqh))
18536 +                                  struct wake_q_head *wqh,
18537 +                                  struct wake_q_head *wq_sleeper))
18538  {
18539         DEFINE_WAKE_Q(wake_q);
18540 +       DEFINE_WAKE_Q(wake_sleeper_q);
18541  
18542         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18543                 return;
18544  
18545 -       if (slowfn(lock, &wake_q))
18546 -               rt_mutex_postunlock(&wake_q);
18547 +       if (slowfn(lock, &wake_q, &wake_sleeper_q))
18548 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18549 +}
18550 +
18551 +int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
18552 +{
18553 +       might_sleep();
18554 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
18555 +}
18556 +
18557 +/**
18558 + * rt_mutex_lock_state - lock a rt_mutex with a given state
18559 + *
18560 + * @lock:      The rt_mutex to be locked
18561 + * @state:     The state to set when blocking on the rt_mutex
18562 + */
18563 +static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
18564 +{
18565 +       int ret;
18566 +
18567 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18568 +       ret = __rt_mutex_lock_state(lock, state);
18569 +       if (ret)
18570 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18571 +       return ret;
18572  }
18573  
18574  /**
18575 @@ -1473,10 +2043,7 @@
18576   */
18577  void __sched rt_mutex_lock(struct rt_mutex *lock)
18578  {
18579 -       might_sleep();
18580 -
18581 -       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18582 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
18583 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
18584  }
18585  EXPORT_SYMBOL_GPL(rt_mutex_lock);
18586  
18587 @@ -1491,16 +2058,7 @@
18588   */
18589  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
18590  {
18591 -       int ret;
18592 -
18593 -       might_sleep();
18594 -
18595 -       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18596 -       ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
18597 -       if (ret)
18598 -               mutex_release(&lock->dep_map, 1, _RET_IP_);
18599 -
18600 -       return ret;
18601 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
18602  }
18603  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
18604  
18605 @@ -1518,6 +2076,22 @@
18606  }
18607  
18608  /**
18609 + * rt_mutex_lock_killable - lock a rt_mutex killable
18610 + *
18611 + * @lock:              the rt_mutex to be locked
18612 + * @detect_deadlock:   deadlock detection on/off
18613 + *
18614 + * Returns:
18615 + *  0          on success
18616 + * -EINTR      when interrupted by a signal
18617 + */
18618 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
18619 +{
18620 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
18621 +}
18622 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
18623 +
18624 +/**
18625   * rt_mutex_timed_lock - lock a rt_mutex interruptible
18626   *                     the timeout structure is provided
18627   *                     by the caller
18628 @@ -1540,6 +2114,7 @@
18629         mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18630         ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
18631                                        RT_MUTEX_MIN_CHAINWALK,
18632 +                                      NULL,
18633                                        rt_mutex_slowlock);
18634         if (ret)
18635                 mutex_release(&lock->dep_map, 1, _RET_IP_);
18636 @@ -1548,6 +2123,18 @@
18637  }
18638  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
18639  
18640 +int __sched __rt_mutex_trylock(struct rt_mutex *lock)
18641 +{
18642 +#ifdef CONFIG_PREEMPT_RT_FULL
18643 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
18644 +#else
18645 +       if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18646 +#endif
18647 +               return 0;
18648 +
18649 +       return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
18650 +}
18651 +
18652  /**
18653   * rt_mutex_trylock - try to lock a rt_mutex
18654   *
18655 @@ -1563,10 +2150,7 @@
18656  {
18657         int ret;
18658  
18659 -       if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
18660 -               return 0;
18661 -
18662 -       ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
18663 +       ret = __rt_mutex_trylock(lock);
18664         if (ret)
18665                 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18666  
18667 @@ -1574,6 +2158,11 @@
18668  }
18669  EXPORT_SYMBOL_GPL(rt_mutex_trylock);
18670  
18671 +void __sched __rt_mutex_unlock(struct rt_mutex *lock)
18672 +{
18673 +       rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18674 +}
18675 +
18676  /**
18677   * rt_mutex_unlock - unlock a rt_mutex
18678   *
18679 @@ -1582,16 +2171,13 @@
18680  void __sched rt_mutex_unlock(struct rt_mutex *lock)
18681  {
18682         mutex_release(&lock->dep_map, 1, _RET_IP_);
18683 -       rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
18684 +       __rt_mutex_unlock(lock);
18685  }
18686  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
18687  
18688 -/**
18689 - * Futex variant, that since futex variants do not use the fast-path, can be
18690 - * simple and will not need to retry.
18691 - */
18692 -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18693 -                                   struct wake_q_head *wake_q)
18694 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18695 +                                            struct wake_q_head *wake_q,
18696 +                                            struct wake_q_head *wq_sleeper)
18697  {
18698         lockdep_assert_held(&lock->wait_lock);
18699  
18700 @@ -1608,22 +2194,35 @@
18701          * avoid inversion prior to the wakeup.  preempt_disable()
18702          * therein pairs with rt_mutex_postunlock().
18703          */
18704 -       mark_wakeup_next_waiter(wake_q, lock);
18705 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
18706  
18707         return true; /* call postunlock() */
18708  }
18709  
18710 +/**
18711 + * Futex variant, that since futex variants do not use the fast-path, can be
18712 + * simple and will not need to retry.
18713 + */
18714 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
18715 +                                   struct wake_q_head *wake_q,
18716 +                                   struct wake_q_head *wq_sleeper)
18717 +{
18718 +       return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
18719 +}
18720 +
18721  void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
18722  {
18723         DEFINE_WAKE_Q(wake_q);
18724 +       DEFINE_WAKE_Q(wake_sleeper_q);
18725 +       unsigned long flags;
18726         bool postunlock;
18727  
18728 -       raw_spin_lock_irq(&lock->wait_lock);
18729 -       postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
18730 -       raw_spin_unlock_irq(&lock->wait_lock);
18731 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
18732 +       postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
18733 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18734  
18735         if (postunlock)
18736 -               rt_mutex_postunlock(&wake_q);
18737 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18738  }
18739  
18740  /**
18741 @@ -1662,7 +2261,7 @@
18742         if (name && key)
18743                 debug_rt_mutex_init(lock, name, key);
18744  }
18745 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
18746 +EXPORT_SYMBOL(__rt_mutex_init);
18747  
18748  /**
18749   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
18750 @@ -1682,6 +2281,14 @@
18751                                 struct task_struct *proxy_owner)
18752  {
18753         __rt_mutex_init(lock, NULL, NULL);
18754 +#ifdef CONFIG_DEBUG_SPINLOCK
18755 +       /*
18756 +        * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI
18757 +        * hold the ->wait_lock of the proxy_lock while unlocking a sleeping
18758 +        * lock.
18759 +        */
18760 +       raw_spin_lock_init(&lock->wait_lock);
18761 +#endif
18762         debug_rt_mutex_proxy_lock(lock, proxy_owner);
18763         rt_mutex_set_owner(lock, proxy_owner);
18764  }
18765 @@ -1714,6 +2321,34 @@
18766         if (try_to_take_rt_mutex(lock, task, NULL))
18767                 return 1;
18768  
18769 +#ifdef CONFIG_PREEMPT_RT_FULL
18770 +       /*
18771 +        * In PREEMPT_RT there's an added race.
18772 +        * If the task that we are about to requeue times out,
18773 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
18774 +        * to skip this task. But right after the task sets
18775 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
18776 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
18777 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
18778 +        * lock that it blocks on. We *must not* place this task
18779 +        * on this proxy lock in that case.
18780 +        *
18781 +        * To prevent this race, we first take the task's pi_lock
18782 +        * and check if it has updated its pi_blocked_on. If it has,
18783 +        * we assume that it woke up and we return -EAGAIN.
18784 +        * Otherwise, we set the task's pi_blocked_on to
18785 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
18786 +        * it will know that we are in the process of requeuing it.
18787 +        */
18788 +       raw_spin_lock(&task->pi_lock);
18789 +       if (task->pi_blocked_on) {
18790 +               raw_spin_unlock(&task->pi_lock);
18791 +               return -EAGAIN;
18792 +       }
18793 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
18794 +       raw_spin_unlock(&task->pi_lock);
18795 +#endif
18796 +
18797         /* We enforce deadlock detection for futexes */
18798         ret = task_blocks_on_rt_mutex(lock, waiter, task,
18799                                       RT_MUTEX_FULL_CHAINWALK);
18800 @@ -1728,7 +2363,7 @@
18801                 ret = 0;
18802         }
18803  
18804 -       if (unlikely(ret))
18805 +       if (ret && rt_mutex_has_waiters(lock))
18806                 remove_waiter(lock, waiter);
18807  
18808         debug_rt_mutex_print_deadlock(waiter);
18809 @@ -1803,17 +2438,36 @@
18810                                struct hrtimer_sleeper *to,
18811                                struct rt_mutex_waiter *waiter)
18812  {
18813 +       struct task_struct *tsk = current;
18814         int ret;
18815  
18816         raw_spin_lock_irq(&lock->wait_lock);
18817         /* sleep on the mutex */
18818         set_current_state(TASK_INTERRUPTIBLE);
18819 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
18820 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
18821         /*
18822          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
18823          * have to fix that up.
18824          */
18825         fixup_rt_mutex_waiters(lock);
18826 +       /*
18827 +        * RT has a problem here when the wait was interrupted by a timeout
18828 +        * or a signal. task->pi_blocked_on is still set. The task must
18829 +        * acquire the hash bucket lock when returning from this function.
18830 +        *
18831 +        * If the hash bucket lock is contended then the
18832 +        * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
18833 +        * task_blocks_on_rt_mutex() will trigger. This can be avoided by
18834 +        * clearing task->pi_blocked_on which removes the task from the
18835 +        * boosting chain of the rtmutex. That's correct because the task
18836 +        * is no longer blocked on it.
18837 +        */
18838 +       if (ret) {
18839 +               raw_spin_lock(&tsk->pi_lock);
18840 +               tsk->pi_blocked_on = NULL;
18841 +               raw_spin_unlock(&tsk->pi_lock);
18842 +       }
18843 +
18844         raw_spin_unlock_irq(&lock->wait_lock);
18845  
18846         return ret;
18847 @@ -1874,3 +2528,99 @@
18848  
18849         return cleanup;
18850  }
18851 +
18852 +static inline int
18853 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18854 +{
18855 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
18856 +       unsigned tmp;
18857 +
18858 +       if (ctx->deadlock_inject_countdown-- == 0) {
18859 +               tmp = ctx->deadlock_inject_interval;
18860 +               if (tmp > UINT_MAX/4)
18861 +                       tmp = UINT_MAX;
18862 +               else
18863 +                       tmp = tmp*2 + tmp + tmp/2;
18864 +
18865 +               ctx->deadlock_inject_interval = tmp;
18866 +               ctx->deadlock_inject_countdown = tmp;
18867 +               ctx->contending_lock = lock;
18868 +
18869 +               ww_mutex_unlock(lock);
18870 +
18871 +               return -EDEADLK;
18872 +       }
18873 +#endif
18874 +
18875 +       return 0;
18876 +}
18877 +
18878 +#ifdef CONFIG_PREEMPT_RT_FULL
18879 +int __sched
18880 +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18881 +{
18882 +       int ret;
18883 +
18884 +       might_sleep();
18885 +
18886 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18887 +                          ctx ? &ctx->dep_map : NULL, _RET_IP_);
18888 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
18889 +                               ctx);
18890 +       if (ret)
18891 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18892 +       else if (!ret && ctx && ctx->acquired > 1)
18893 +               return ww_mutex_deadlock_injection(lock, ctx);
18894 +
18895 +       return ret;
18896 +}
18897 +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
18898 +
18899 +int __sched
18900 +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
18901 +{
18902 +       int ret;
18903 +
18904 +       might_sleep();
18905 +
18906 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0,
18907 +                          ctx ? &ctx->dep_map : NULL, _RET_IP_);
18908 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
18909 +                               ctx);
18910 +       if (ret)
18911 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
18912 +       else if (!ret && ctx && ctx->acquired > 1)
18913 +               return ww_mutex_deadlock_injection(lock, ctx);
18914 +
18915 +       return ret;
18916 +}
18917 +EXPORT_SYMBOL_GPL(ww_mutex_lock);
18918 +
18919 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
18920 +{
18921 +       int nest = !!lock->ctx;
18922 +
18923 +       /*
18924 +        * The unlocking fastpath is the 0->1 transition from 'locked'
18925 +        * into 'unlocked' state:
18926 +        */
18927 +       if (nest) {
18928 +#ifdef CONFIG_DEBUG_MUTEXES
18929 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
18930 +#endif
18931 +               if (lock->ctx->acquired > 0)
18932 +                       lock->ctx->acquired--;
18933 +               lock->ctx = NULL;
18934 +       }
18935 +
18936 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
18937 +       __rt_mutex_unlock(&lock->base.lock);
18938 +}
18939 +EXPORT_SYMBOL(ww_mutex_unlock);
18940 +
18941 +int __rt_mutex_owner_current(struct rt_mutex *lock)
18942 +{
18943 +       return rt_mutex_owner(lock) == current;
18944 +}
18945 +EXPORT_SYMBOL(__rt_mutex_owner_current);
18946 +#endif
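
The ww_mutex deadlock-injection helper added in the hunk above grows the injection interval geometrically: each time the countdown expires it multiplies the interval by roughly 3.5 (tmp*2 + tmp + tmp/2) and saturates at UINT_MAX once the interval exceeds UINT_MAX/4. A standalone userspace sketch of just that arithmetic, assuming an initial interval of 1 (the starting value is not part of this hunk):

#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned int interval = 1;      /* assumed initial deadlock_inject_interval */
        int i;

        for (i = 0; i < 8; i++) {
                printf("injection %d fires after %u acquisitions\n", i, interval);
                if (interval > UINT_MAX / 4)
                        interval = UINT_MAX;                    /* saturate, as in the patch */
                else
                        interval = interval * 2 + interval + interval / 2; /* ~3.5x growth */
        }
        return 0;
}
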
18947 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rtmutex_common.h linux-4.14/kernel/locking/rtmutex_common.h
18948 --- linux-4.14.orig/kernel/locking/rtmutex_common.h     2018-09-05 11:03:22.000000000 +0200
18949 +++ linux-4.14/kernel/locking/rtmutex_common.h  2018-09-05 11:05:07.000000000 +0200
18950 @@ -15,6 +15,7 @@
18951  
18952  #include <linux/rtmutex.h>
18953  #include <linux/sched/wake_q.h>
18954 +#include <linux/sched/debug.h>
18955  
18956  /*
18957   * This is the control structure for tasks blocked on a rt_mutex,
18958 @@ -29,6 +30,7 @@
18959         struct rb_node          pi_tree_entry;
18960         struct task_struct      *task;
18961         struct rt_mutex         *lock;
18962 +       bool                    savestate;
18963  #ifdef CONFIG_DEBUG_RT_MUTEXES
18964         unsigned long           ip;
18965         struct pid              *deadlock_task_pid;
18966 @@ -129,12 +131,15 @@
18967  /*
18968   * PI-futex support (proxy locking functions, etc.):
18969   */
18970 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
18971 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
18972 +
18973  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
18974  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
18975                                        struct task_struct *proxy_owner);
18976  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
18977                                   struct task_struct *proxy_owner);
18978 -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
18979 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
18980  extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
18981                                      struct rt_mutex_waiter *waiter,
18982                                      struct task_struct *task);
18983 @@ -152,9 +157,27 @@
18984  
18985  extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
18986  extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
18987 -                                struct wake_q_head *wqh);
18988 +                                struct wake_q_head *wqh,
18989 +                                struct wake_q_head *wq_sleeper);
18990  
18991 -extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
18992 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
18993 +                               struct wake_q_head *wake_sleeper_q);
18994 +
18995 +/* RW semaphore special interface */
18996 +struct ww_acquire_ctx;
18997 +
18998 +extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
18999 +extern int __rt_mutex_trylock(struct rt_mutex *lock);
19000 +extern void __rt_mutex_unlock(struct rt_mutex *lock);
19001 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19002 +                                    struct hrtimer_sleeper *timeout,
19003 +                                    enum rtmutex_chainwalk chwalk,
19004 +                                    struct ww_acquire_ctx *ww_ctx,
19005 +                                    struct rt_mutex_waiter *waiter);
19006 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19007 +                                         struct rt_mutex_waiter *waiter,
19008 +                                         unsigned long flags);
19009 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
19010  
19011  #ifdef CONFIG_DEBUG_RT_MUTEXES
19012  # include "rtmutex-debug.h"
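
The PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS markers defined above are sentinel pointer values (the integers 1 and 2 cast to struct rt_mutex_waiter *), which lets pi_blocked_on distinguish a real waiter from a wakeup or requeue that is in flight without adding a flag field. A minimal userspace sketch of the same pattern, with invented names (the kernel's own helper for this check is not shown in this hunk):

#include <stdio.h>

struct waiter;          /* opaque: only the pointer value is used */

#define WAKEUP_INPROGRESS       ((struct waiter *) 1)
#define REQUEUE_INPROGRESS      ((struct waiter *) 2)

/* Anything that is neither NULL nor a sentinel is a real waiter. */
static int is_real_waiter(struct waiter *w)
{
        return w != NULL && w != WAKEUP_INPROGRESS && w != REQUEUE_INPROGRESS;
}

int main(void)
{
        struct waiter *blocked_on = (struct waiter *) &blocked_on;      /* any real address */

        printf("real waiter:    %d\n", is_real_waiter(blocked_on));         /* 1 */
        printf("wakeup marker:  %d\n", is_real_waiter(WAKEUP_INPROGRESS));  /* 0 */
        printf("not blocked:    %d\n", is_real_waiter(NULL));               /* 0 */
        return 0;
}
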
19013 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwlock-rt.c linux-4.14/kernel/locking/rwlock-rt.c
19014 --- linux-4.14.orig/kernel/locking/rwlock-rt.c  1970-01-01 01:00:00.000000000 +0100
19015 +++ linux-4.14/kernel/locking/rwlock-rt.c       2018-09-05 11:05:07.000000000 +0200
19016 @@ -0,0 +1,378 @@
19017 +/*
19018 + */
19019 +#include <linux/sched/debug.h>
19020 +#include <linux/export.h>
19021 +
19022 +#include "rtmutex_common.h"
19023 +#include <linux/rwlock_types_rt.h>
19024 +
19025 +/*
19026 + * RT-specific reader/writer locks
19027 + *
19028 + * write_lock()
19029 + *  1) Lock lock->rtmutex
19030 + *  2) Remove the reader BIAS to force readers into the slow path
19031 + *  3) Wait until all readers have left the critical region
19032 + *  4) Mark it write locked
19033 + *
19034 + * write_unlock()
19035 + *  1) Remove the write locked marker
19036 + *  2) Set the reader BIAS so readers can use the fast path again
19037 + *  3) Unlock lock->rtmutex to release blocked readers
19038 + *
19039 + * read_lock()
19040 + *  1) Try fast path acquisition (reader BIAS is set)
19041 + *  2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19042 + *  3) If !writelocked, acquire it for read
19043 + *  4) If writelocked, block on lock->rtmutex
19044 + *  5) unlock lock->rtmutex, goto 1)
19045 + *
19046 + * read_unlock()
19047 + *  1) Try fast path release (reader count != 1)
19048 + *  2) Wake the writer waiting in write_lock()#3
19049 + *
19050 + * read_lock()#3 has the consequence that rw locks on RT are not writer
19051 + * fair, but writers, which should be avoided in RT tasks (think tasklist
19052 + * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19053 + *
19054 + * It's possible to make the rw locks writer fair by keeping a list of
19055 + * active readers. A blocked writer would force all newly incoming readers
19056 + * to block on the rtmutex, but the rtmutex would have to be proxy locked
19057 + * for one reader after the other. We can't use multi-reader inheritance
19058 + * because there is no way to support that with
19059 + * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19060 + * mechanism is a major surgery for a very dubious value.
19061 + *
19062 + * The risk of writer starvation is there, but the pathological use cases
19063 + * which trigger it are not necessarily the typical RT workloads.
19064 + */
19065 +
19066 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19067 +                            struct lock_class_key *key)
19068 +{
19069 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19070 +       /*
19071 +        * Make sure we are not reinitializing a held lock:
19072 +        */
19073 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19074 +       lockdep_init_map(&lock->dep_map, name, key, 0);
19075 +#endif
19076 +       atomic_set(&lock->readers, READER_BIAS);
19077 +       rt_mutex_init(&lock->rtmutex);
19078 +       lock->rtmutex.save_state = 1;
19079 +}
19080 +
19081 +int __read_rt_trylock(struct rt_rw_lock *lock)
19082 +{
19083 +       int r, old;
19084 +
19085 +       /*
19086 +        * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19087 +        * set.
19088 +        */
19089 +       for (r = atomic_read(&lock->readers); r < 0;) {
19090 +               old = atomic_cmpxchg(&lock->readers, r, r + 1);
19091 +               if (likely(old == r))
19092 +                       return 1;
19093 +               r = old;
19094 +       }
19095 +       return 0;
19096 +}
19097 +
19098 +void __sched __read_rt_lock(struct rt_rw_lock *lock)
19099 +{
19100 +       struct rt_mutex *m = &lock->rtmutex;
19101 +       struct rt_mutex_waiter waiter;
19102 +       unsigned long flags;
19103 +
19104 +       if (__read_rt_trylock(lock))
19105 +               return;
19106 +
19107 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19108 +       /*
19109 +        * Allow readers as long as the writer has not completely
19110 +        * acquired the semaphore for write.
19111 +        */
19112 +       if (atomic_read(&lock->readers) != WRITER_BIAS) {
19113 +               atomic_inc(&lock->readers);
19114 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19115 +               return;
19116 +       }
19117 +
19118 +       /*
19119 +        * Call into the slow lock path with the rtmutex->wait_lock
19120 +        * held, so this can't result in the following race:
19121 +        *
19122 +        * Reader1              Reader2         Writer
19123 +        *                      read_lock()
19124 +        *                                      write_lock()
19125 +        *                                      rtmutex_lock(m)
19126 +        *                                      swait()
19127 +        * read_lock()
19128 +        * unlock(m->wait_lock)
19129 +        *                      read_unlock()
19130 +        *                      swake()
19131 +        *                                      lock(m->wait_lock)
19132 +        *                                      lock->writelocked=true
19133 +        *                                      unlock(m->wait_lock)
19134 +        *
19135 +        *                                      write_unlock()
19136 +        *                                      lock->writelocked=false
19137 +        *                                      rtmutex_unlock(m)
19138 +        *                      read_lock()
19139 +        *                                      write_lock()
19140 +        *                                      rtmutex_lock(m)
19141 +        *                                      swait()
19142 +        * rtmutex_lock(m)
19143 +        *
19144 +        * That would put Reader1 behind the writer waiting on
19145 +        * Reader2 to call read_unlock() which might be unbound.
19146 +        */
19147 +       rt_mutex_init_waiter(&waiter, false);
19148 +       rt_spin_lock_slowlock_locked(m, &waiter, flags);
19149 +       /*
19150 +        * The slowlock() above is guaranteed to return with the rtmutex
19151 +        * now held, so there can't be a writer active. Increment the reader
19152 +        * count and immediately drop the rtmutex again.
19153 +        */
19154 +       atomic_inc(&lock->readers);
19155 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19156 +       rt_spin_lock_slowunlock(m);
19157 +
19158 +       debug_rt_mutex_free_waiter(&waiter);
19159 +}
19160 +
19161 +void __read_rt_unlock(struct rt_rw_lock *lock)
19162 +{
19163 +       struct rt_mutex *m = &lock->rtmutex;
19164 +       struct task_struct *tsk;
19165 +
19166 +       /*
19167 +        * lock->readers can only hit 0 when a writer is waiting for the
19168 +        * active readers to leave the critical region.
19169 +        */
19170 +       if (!atomic_dec_and_test(&lock->readers))
19171 +               return;
19172 +
19173 +       raw_spin_lock_irq(&m->wait_lock);
19174 +       /*
19175 +        * Wake the writer, i.e. the rtmutex owner. It might release the
19176 +        * rtmutex concurrently in the fast path, but to clean up the rw
19177 +        * lock it needs to acquire m->wait_lock. The worst case which can
19178 +        * happen is a spurious wakeup.
19179 +        */
19180 +       tsk = rt_mutex_owner(m);
19181 +       if (tsk)
19182 +               wake_up_process(tsk);
19183 +
19184 +       raw_spin_unlock_irq(&m->wait_lock);
19185 +}
19186 +
19187 +static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19188 +                                 unsigned long flags)
19189 +{
19190 +       struct rt_mutex *m = &lock->rtmutex;
19191 +
19192 +       atomic_add(READER_BIAS - bias, &lock->readers);
19193 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19194 +       rt_spin_lock_slowunlock(m);
19195 +}
19196 +
19197 +void __sched __write_rt_lock(struct rt_rw_lock *lock)
19198 +{
19199 +       struct rt_mutex *m = &lock->rtmutex;
19200 +       struct task_struct *self = current;
19201 +       unsigned long flags;
19202 +
19203 +       /* Take the rtmutex as a first step */
19204 +       __rt_spin_lock(m);
19205 +
19206 +       /* Force readers into slow path */
19207 +       atomic_sub(READER_BIAS, &lock->readers);
19208 +
19209 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19210 +
19211 +       raw_spin_lock(&self->pi_lock);
19212 +       self->saved_state = self->state;
19213 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19214 +       raw_spin_unlock(&self->pi_lock);
19215 +
19216 +       for (;;) {
19217 +               /* Have all readers left the critical region? */
19218 +               if (!atomic_read(&lock->readers)) {
19219 +                       atomic_set(&lock->readers, WRITER_BIAS);
19220 +                       raw_spin_lock(&self->pi_lock);
19221 +                       __set_current_state_no_track(self->saved_state);
19222 +                       self->saved_state = TASK_RUNNING;
19223 +                       raw_spin_unlock(&self->pi_lock);
19224 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19225 +                       return;
19226 +               }
19227 +
19228 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19229 +
19230 +               if (atomic_read(&lock->readers) != 0)
19231 +                       schedule();
19232 +
19233 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
19234 +
19235 +               raw_spin_lock(&self->pi_lock);
19236 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19237 +               raw_spin_unlock(&self->pi_lock);
19238 +       }
19239 +}
19240 +
19241 +int __write_rt_trylock(struct rt_rw_lock *lock)
19242 +{
19243 +       struct rt_mutex *m = &lock->rtmutex;
19244 +       unsigned long flags;
19245 +
19246 +       if (!__rt_mutex_trylock(m))
19247 +               return 0;
19248 +
19249 +       atomic_sub(READER_BIAS, &lock->readers);
19250 +
19251 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19252 +       if (!atomic_read(&lock->readers)) {
19253 +               atomic_set(&lock->readers, WRITER_BIAS);
19254 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19255 +               return 1;
19256 +       }
19257 +       __write_unlock_common(lock, 0, flags);
19258 +       return 0;
19259 +}
19260 +
19261 +void __write_rt_unlock(struct rt_rw_lock *lock)
19262 +{
19263 +       struct rt_mutex *m = &lock->rtmutex;
19264 +       unsigned long flags;
19265 +
19266 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19267 +       __write_unlock_common(lock, WRITER_BIAS, flags);
19268 +}
19269 +
19270 +/* Map the reader biased implementation */
19271 +static inline int do_read_rt_trylock(rwlock_t *rwlock)
19272 +{
19273 +       return __read_rt_trylock(rwlock);
19274 +}
19275 +
19276 +static inline int do_write_rt_trylock(rwlock_t *rwlock)
19277 +{
19278 +       return __write_rt_trylock(rwlock);
19279 +}
19280 +
19281 +static inline void do_read_rt_lock(rwlock_t *rwlock)
19282 +{
19283 +       __read_rt_lock(rwlock);
19284 +}
19285 +
19286 +static inline void do_write_rt_lock(rwlock_t *rwlock)
19287 +{
19288 +       __write_rt_lock(rwlock);
19289 +}
19290 +
19291 +static inline void do_read_rt_unlock(rwlock_t *rwlock)
19292 +{
19293 +       __read_rt_unlock(rwlock);
19294 +}
19295 +
19296 +static inline void do_write_rt_unlock(rwlock_t *rwlock)
19297 +{
19298 +       __write_rt_unlock(rwlock);
19299 +}
19300 +
19301 +static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19302 +                                    struct lock_class_key *key)
19303 +{
19304 +       __rwlock_biased_rt_init(rwlock, name, key);
19305 +}
19306 +
19307 +int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19308 +{
19309 +       return  atomic_read(&rwlock->readers) < 0;
19310 +}
19311 +
19312 +int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19313 +{
19314 +       return atomic_read(&rwlock->readers) == READER_BIAS;
19315 +}
19316 +
19317 +/*
19318 + * The common functions which get wrapped into the rwlock API.
19319 + */
19320 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19321 +{
19322 +       int ret;
19323 +
19324 +       sleeping_lock_inc();
19325 +       migrate_disable();
19326 +       ret = do_read_rt_trylock(rwlock);
19327 +       if (ret) {
19328 +               rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19329 +       } else {
19330 +               migrate_enable();
19331 +               sleeping_lock_dec();
19332 +       }
19333 +       return ret;
19334 +}
19335 +EXPORT_SYMBOL(rt_read_trylock);
19336 +
19337 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19338 +{
19339 +       int ret;
19340 +
19341 +       sleeping_lock_inc();
19342 +       migrate_disable();
19343 +       ret = do_write_rt_trylock(rwlock);
19344 +       if (ret) {
19345 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19346 +       } else {
19347 +               migrate_enable();
19348 +               sleeping_lock_dec();
19349 +       }
19350 +       return ret;
19351 +}
19352 +EXPORT_SYMBOL(rt_write_trylock);
19353 +
19354 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
19355 +{
19356 +       sleeping_lock_inc();
19357 +       migrate_disable();
19358 +       rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19359 +       do_read_rt_lock(rwlock);
19360 +}
19361 +EXPORT_SYMBOL(rt_read_lock);
19362 +
19363 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
19364 +{
19365 +       sleeping_lock_inc();
19366 +       migrate_disable();
19367 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19368 +       do_write_rt_lock(rwlock);
19369 +}
19370 +EXPORT_SYMBOL(rt_write_lock);
19371 +
19372 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19373 +{
19374 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19375 +       do_read_rt_unlock(rwlock);
19376 +       migrate_enable();
19377 +       sleeping_lock_dec();
19378 +}
19379 +EXPORT_SYMBOL(rt_read_unlock);
19380 +
19381 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19382 +{
19383 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19384 +       do_write_rt_unlock(rwlock);
19385 +       migrate_enable();
19386 +       sleeping_lock_dec();
19387 +}
19388 +EXPORT_SYMBOL(rt_write_unlock);
19389 +
19390 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19391 +{
19392 +       do_rwlock_rt_init(rwlock, name, key);
19393 +}
19394 +EXPORT_SYMBOL(__rt_rwlock_init);
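
The reader fast path in __read_rt_trylock() above relies on a single atomic counter: it starts out at a large negative READER_BIAS, readers CAS it upwards while it stays negative, and a writer removes the bias to push new readers into the slow path. The single-threaded userspace sketch below mimics only that counter protocol with C11 atomics; the bias value and helper names are illustrative, and the kernel serialises the slow-path checks under m->wait_lock, which the sketch does not model:

#include <stdatomic.h>
#include <stdio.h>

#define READER_BIAS     (-(1 << 28))    /* illustrative bias; the real value lives in the rt headers */

static atomic_int readers = READER_BIAS;

/* Reader fast path: succeed only while the (negative) bias is still present. */
static int read_trylock(void)
{
        int r = atomic_load(&readers);

        while (r < 0) {
                if (atomic_compare_exchange_weak(&readers, &r, r + 1))
                        return 1;       /* got a read hold */
        }
        return 0;                       /* bias gone: a writer is around, take the slow path */
}

static void read_unlock(void)
{
        atomic_fetch_sub(&readers, 1);
}

/* Writer trylock: remove the bias, then check whether any readers remain. */
static int write_trylock(void)
{
        int count = atomic_fetch_sub(&readers, READER_BIAS) - READER_BIAS;

        if (count == 0)
                return 1;                               /* no readers left: write locked */
        atomic_fetch_add(&readers, READER_BIAS);        /* back off and restore the bias */
        return 0;
}

int main(void)
{
        printf("reader fast path:            %d\n", read_trylock());   /* 1 */
        printf("writer with a reader active: %d\n", write_trylock());  /* 0 */
        read_unlock();
        printf("writer after read_unlock():  %d\n", write_trylock());  /* 1 */
        return 0;
}
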
19395 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/rwsem-rt.c linux-4.14/kernel/locking/rwsem-rt.c
19396 --- linux-4.14.orig/kernel/locking/rwsem-rt.c   1970-01-01 01:00:00.000000000 +0100
19397 +++ linux-4.14/kernel/locking/rwsem-rt.c        2018-09-05 11:05:07.000000000 +0200
19398 @@ -0,0 +1,269 @@
19399 +/*
19400 + */
19401 +#include <linux/rwsem.h>
19402 +#include <linux/sched/debug.h>
19403 +#include <linux/sched/signal.h>
19404 +#include <linux/export.h>
19405 +
19406 +#include "rtmutex_common.h"
19407 +
19408 +/*
19409 + * RT-specific reader/writer semaphores
19410 + *
19411 + * down_write()
19412 + *  1) Lock sem->rtmutex
19413 + *  2) Remove the reader BIAS to force readers into the slow path
19414 + *  3) Wait until all readers have left the critical region
19415 + *  4) Mark it write locked
19416 + *
19417 + * up_write()
19418 + *  1) Remove the write locked marker
19419 + *  2) Set the reader BIAS so readers can use the fast path again
19420 + *  3) Unlock sem->rtmutex to release blocked readers
19421 + *
19422 + * down_read()
19423 + *  1) Try fast path acquisition (reader BIAS is set)
19424 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
19425 + *  3) If !writelocked, acquire it for read
19426 + *  4) If writelocked, block on sem->rtmutex
19427 + *  5) unlock sem->rtmutex, goto 1)
19428 + *
19429 + * up_read()
19430 + *  1) Try fast path release (reader count != 1)
19431 + *  2) Wake the writer waiting in down_write()#3
19432 + *
19433 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
19434 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
19435 + * are subject to the rtmutex priority/DL inheritance mechanism.
19436 + *
19437 + * It's possible to make the rw semaphores writer fair by keeping a list of
19438 + * active readers. A blocked writer would force all newly incoming readers to
19439 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
19440 + * reader after the other. We can't use multi-reader inheritance because there
19441 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
19442 + * reader boosting/handover mechanism is a major surgery for a very dubious
19443 + * value.
19444 + *
19445 + * The risk of writer starvation is there, but the pathological use cases
19446 + * which trigger it are not necessarily the typical RT workloads.
19447 + */
19448 +
19449 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
19450 +                 struct lock_class_key *key)
19451 +{
19452 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19453 +       /*
19454 +        * Make sure we are not reinitializing a held semaphore:
19455 +        */
19456 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
19457 +       lockdep_init_map(&sem->dep_map, name, key, 0);
19458 +#endif
19459 +       atomic_set(&sem->readers, READER_BIAS);
19460 +}
19461 +EXPORT_SYMBOL(__rwsem_init);
19462 +
19463 +int __down_read_trylock(struct rw_semaphore *sem)
19464 +{
19465 +       int r, old;
19466 +
19467 +       /*
19468 +        * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
19469 +        * set.
19470 +        */
19471 +       for (r = atomic_read(&sem->readers); r < 0;) {
19472 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
19473 +               if (likely(old == r))
19474 +                       return 1;
19475 +               r = old;
19476 +       }
19477 +       return 0;
19478 +}
19479 +
19480 +void __sched __down_read(struct rw_semaphore *sem)
19481 +{
19482 +       struct rt_mutex *m = &sem->rtmutex;
19483 +       struct rt_mutex_waiter waiter;
19484 +
19485 +       if (__down_read_trylock(sem))
19486 +               return;
19487 +
19488 +       might_sleep();
19489 +       raw_spin_lock_irq(&m->wait_lock);
19490 +       /*
19491 +        * Allow readers as long as the writer has not completely
19492 +        * acquired the semaphore for write.
19493 +        */
19494 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
19495 +               atomic_inc(&sem->readers);
19496 +               raw_spin_unlock_irq(&m->wait_lock);
19497 +               return;
19498 +       }
19499 +
19500 +       /*
19501 +        * Call into the slow lock path with the rtmutex->wait_lock
19502 +        * held, so this can't result in the following race:
19503 +        *
19504 +        * Reader1              Reader2         Writer
19505 +        *                      down_read()
19506 +        *                                      down_write()
19507 +        *                                      rtmutex_lock(m)
19508 +        *                                      swait()
19509 +        * down_read()
19510 +        * unlock(m->wait_lock)
19511 +        *                      up_read()
19512 +        *                      swake()
19513 +        *                                      lock(m->wait_lock)
19514 +        *                                      sem->writelocked=true
19515 +        *                                      unlock(m->wait_lock)
19516 +        *
19517 +        *                                      up_write()
19518 +        *                                      sem->writelocked=false
19519 +        *                                      rtmutex_unlock(m)
19520 +        *                      down_read()
19521 +        *                                      down_write()
19522 +        *                                      rtmutex_lock(m)
19523 +        *                                      swait()
19524 +        * rtmutex_lock(m)
19525 +        *
19526 +        * That would put Reader1 behind the writer waiting on
19527 +        * Reader2 to call up_read() which might be unbound.
19528 +        */
19529 +       rt_mutex_init_waiter(&waiter, false);
19530 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
19531 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
19532 +                                &waiter);
19533 +       /*
19534 +        * The slowlock() above is guaranteed to return with the rtmutex
19535 +        * now held, so there can't be a writer active. Increment the reader
19536 +        * count and immediately drop the rtmutex again.
19537 +        */
19538 +       atomic_inc(&sem->readers);
19539 +       raw_spin_unlock_irq(&m->wait_lock);
19540 +       __rt_mutex_unlock(m);
19541 +
19542 +       debug_rt_mutex_free_waiter(&waiter);
19543 +}
19544 +
19545 +void __up_read(struct rw_semaphore *sem)
19546 +{
19547 +       struct rt_mutex *m = &sem->rtmutex;
19548 +       struct task_struct *tsk;
19549 +
19550 +       /*
19551 +        * sem->readers can only hit 0 when a writer is waiting for the
19552 +        * active readers to leave the critical region.
19553 +        */
19554 +       if (!atomic_dec_and_test(&sem->readers))
19555 +               return;
19556 +
19557 +       might_sleep();
19558 +       raw_spin_lock_irq(&m->wait_lock);
19559 +       /*
19560 +        * Wake the writer, i.e. the rtmutex owner. It might release the
19561 +        * rtmutex concurrently in the fast path (due to a signal), but to
19562 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
19563 +        * case which can happen is a spurious wakeup.
19564 +        */
19565 +       tsk = rt_mutex_owner(m);
19566 +       if (tsk)
19567 +               wake_up_process(tsk);
19568 +
19569 +       raw_spin_unlock_irq(&m->wait_lock);
19570 +}
19571 +
19572 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
19573 +                             unsigned long flags)
19574 +{
19575 +       struct rt_mutex *m = &sem->rtmutex;
19576 +
19577 +       atomic_add(READER_BIAS - bias, &sem->readers);
19578 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19579 +       __rt_mutex_unlock(m);
19580 +}
19581 +
19582 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
19583 +{
19584 +       struct rt_mutex *m = &sem->rtmutex;
19585 +       unsigned long flags;
19586 +
19587 +       /* Take the rtmutex as a first step */
19588 +       if (__rt_mutex_lock_state(m, state))
19589 +               return -EINTR;
19590 +
19591 +       /* Force readers into slow path */
19592 +       atomic_sub(READER_BIAS, &sem->readers);
19593 +       might_sleep();
19594 +
19595 +       set_current_state(state);
19596 +       for (;;) {
19597 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
19598 +               /* Have all readers left the critical region? */
19599 +               if (!atomic_read(&sem->readers)) {
19600 +                       atomic_set(&sem->readers, WRITER_BIAS);
19601 +                       __set_current_state(TASK_RUNNING);
19602 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19603 +                       return 0;
19604 +               }
19605 +
19606 +               if (signal_pending_state(state, current)) {
19607 +                       __set_current_state(TASK_RUNNING);
19608 +                       __up_write_unlock(sem, 0, flags);
19609 +                       return -EINTR;
19610 +               }
19611 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19612 +
19613 +               if (atomic_read(&sem->readers) != 0) {
19614 +                       schedule();
19615 +                       set_current_state(state);
19616 +               }
19617 +       }
19618 +}
19619 +
19620 +void __sched __down_write(struct rw_semaphore *sem)
19621 +{
19622 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
19623 +}
19624 +
19625 +int __sched __down_write_killable(struct rw_semaphore *sem)
19626 +{
19627 +       return __down_write_common(sem, TASK_KILLABLE);
19628 +}
19629 +
19630 +int __down_write_trylock(struct rw_semaphore *sem)
19631 +{
19632 +       struct rt_mutex *m = &sem->rtmutex;
19633 +       unsigned long flags;
19634 +
19635 +       if (!__rt_mutex_trylock(m))
19636 +               return 0;
19637 +
19638 +       atomic_sub(READER_BIAS, &sem->readers);
19639 +
19640 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19641 +       if (!atomic_read(&sem->readers)) {
19642 +               atomic_set(&sem->readers, WRITER_BIAS);
19643 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19644 +               return 1;
19645 +       }
19646 +       __up_write_unlock(sem, 0, flags);
19647 +       return 0;
19648 +}
19649 +
19650 +void __up_write(struct rw_semaphore *sem)
19651 +{
19652 +       struct rt_mutex *m = &sem->rtmutex;
19653 +       unsigned long flags;
19654 +
19655 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19656 +       __up_write_unlock(sem, WRITER_BIAS, flags);
19657 +}
19658 +
19659 +void __downgrade_write(struct rw_semaphore *sem)
19660 +{
19661 +       struct rt_mutex *m = &sem->rtmutex;
19662 +       unsigned long flags;
19663 +
19664 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
19665 +       /* Release it and account current as reader */
19666 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
19667 +}
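
All writer-side exit paths above funnel through __up_write_unlock(sem, bias, flags), which only does atomic_add(READER_BIAS - bias, ...): the bias argument names what the writer currently accounts for in the counter. A small standalone C walk-through of that arithmetic, using illustrative bias constants rather than the ones from the rt headers:

#include <stdio.h>

#define READER_BIAS     (-(1 << 28))    /* illustrative values only */
#define WRITER_BIAS     (1 << 27)

static int readers;

static void up_write_unlock(int bias)
{
        readers += READER_BIAS - bias;  /* mirrors atomic_add(READER_BIAS - bias, &sem->readers) */
}

int main(void)
{
        /* up_write(): counter holds WRITER_BIAS and goes back to READER_BIAS. */
        readers = WRITER_BIAS;
        up_write_unlock(WRITER_BIAS);
        printf("after up_write:        readers == READER_BIAS?     %d\n", readers == READER_BIAS);

        /* downgrade_write(): same start, but one read hold is kept. */
        readers = WRITER_BIAS;
        up_write_unlock(WRITER_BIAS - 1);
        printf("after downgrade_write: readers == READER_BIAS + 1? %d\n", readers == READER_BIAS + 1);

        /* failed trylock with 3 readers still active: restore the bias, keep the count. */
        readers = 3;
        up_write_unlock(0);
        printf("after failed trylock:  readers == READER_BIAS + 3? %d\n", readers == READER_BIAS + 3);
        return 0;
}
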
19668 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock.c linux-4.14/kernel/locking/spinlock.c
19669 --- linux-4.14.orig/kernel/locking/spinlock.c   2017-11-12 19:46:13.000000000 +0100
19670 +++ linux-4.14/kernel/locking/spinlock.c        2018-09-05 11:05:07.000000000 +0200
19671 @@ -125,8 +125,11 @@
19672   *         __[spin|read|write]_lock_bh()
19673   */
19674  BUILD_LOCK_OPS(spin, raw_spinlock);
19675 +
19676 +#ifndef CONFIG_PREEMPT_RT_FULL
19677  BUILD_LOCK_OPS(read, rwlock);
19678  BUILD_LOCK_OPS(write, rwlock);
19679 +#endif
19680  
19681  #endif
19682  
19683 @@ -210,6 +213,8 @@
19684  EXPORT_SYMBOL(_raw_spin_unlock_bh);
19685  #endif
19686  
19687 +#ifndef CONFIG_PREEMPT_RT_FULL
19688 +
19689  #ifndef CONFIG_INLINE_READ_TRYLOCK
19690  int __lockfunc _raw_read_trylock(rwlock_t *lock)
19691  {
19692 @@ -354,6 +359,8 @@
19693  EXPORT_SYMBOL(_raw_write_unlock_bh);
19694  #endif
19695  
19696 +#endif /* !PREEMPT_RT_FULL */
19697 +
19698  #ifdef CONFIG_DEBUG_LOCK_ALLOC
19699  
19700  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
19701 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/locking/spinlock_debug.c linux-4.14/kernel/locking/spinlock_debug.c
19702 --- linux-4.14.orig/kernel/locking/spinlock_debug.c     2017-11-12 19:46:13.000000000 +0100
19703 +++ linux-4.14/kernel/locking/spinlock_debug.c  2018-09-05 11:05:07.000000000 +0200
19704 @@ -31,6 +31,7 @@
19705  
19706  EXPORT_SYMBOL(__raw_spin_lock_init);
19707  
19708 +#ifndef CONFIG_PREEMPT_RT_FULL
19709  void __rwlock_init(rwlock_t *lock, const char *name,
19710                    struct lock_class_key *key)
19711  {
19712 @@ -48,6 +49,7 @@
19713  }
19714  
19715  EXPORT_SYMBOL(__rwlock_init);
19716 +#endif
19717  
19718  static void spin_dump(raw_spinlock_t *lock, const char *msg)
19719  {
19720 @@ -135,6 +137,7 @@
19721         arch_spin_unlock(&lock->raw_lock);
19722  }
19723  
19724 +#ifndef CONFIG_PREEMPT_RT_FULL
19725  static void rwlock_bug(rwlock_t *lock, const char *msg)
19726  {
19727         if (!debug_locks_off())
19728 @@ -224,3 +227,5 @@
19729         debug_write_unlock(lock);
19730         arch_write_unlock(&lock->raw_lock);
19731  }
19732 +
19733 +#endif
19734 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/panic.c linux-4.14/kernel/panic.c
19735 --- linux-4.14.orig/kernel/panic.c      2017-11-12 19:46:13.000000000 +0100
19736 +++ linux-4.14/kernel/panic.c   2018-09-05 11:05:07.000000000 +0200
19737 @@ -482,9 +482,11 @@
19738  
19739  static int init_oops_id(void)
19740  {
19741 +#ifndef CONFIG_PREEMPT_RT_FULL
19742         if (!oops_id)
19743                 get_random_bytes(&oops_id, sizeof(oops_id));
19744         else
19745 +#endif
19746                 oops_id++;
19747  
19748         return 0;
19749 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/hibernate.c linux-4.14/kernel/power/hibernate.c
19750 --- linux-4.14.orig/kernel/power/hibernate.c    2017-11-12 19:46:13.000000000 +0100
19751 +++ linux-4.14/kernel/power/hibernate.c 2018-09-05 11:05:07.000000000 +0200
19752 @@ -287,6 +287,8 @@
19753  
19754         local_irq_disable();
19755  
19756 +       system_state = SYSTEM_SUSPEND;
19757 +
19758         error = syscore_suspend();
19759         if (error) {
19760                 pr_err("Some system devices failed to power down, aborting hibernation\n");
19761 @@ -317,6 +319,7 @@
19762         syscore_resume();
19763  
19764   Enable_irqs:
19765 +       system_state = SYSTEM_RUNNING;
19766         local_irq_enable();
19767  
19768   Enable_cpus:
19769 @@ -445,6 +448,7 @@
19770                 goto Enable_cpus;
19771  
19772         local_irq_disable();
19773 +       system_state = SYSTEM_SUSPEND;
19774  
19775         error = syscore_suspend();
19776         if (error)
19777 @@ -478,6 +482,7 @@
19778         syscore_resume();
19779  
19780   Enable_irqs:
19781 +       system_state = SYSTEM_RUNNING;
19782         local_irq_enable();
19783  
19784   Enable_cpus:
19785 @@ -563,6 +568,7 @@
19786                 goto Enable_cpus;
19787  
19788         local_irq_disable();
19789 +       system_state = SYSTEM_SUSPEND;
19790         syscore_suspend();
19791         if (pm_wakeup_pending()) {
19792                 error = -EAGAIN;
19793 @@ -575,6 +581,7 @@
19794  
19795   Power_up:
19796         syscore_resume();
19797 +       system_state = SYSTEM_RUNNING;
19798         local_irq_enable();
19799  
19800   Enable_cpus:
19801 @@ -672,6 +679,10 @@
19802         return error;
19803  }
19804  
19805 +#ifndef CONFIG_SUSPEND
19806 +bool pm_in_action;
19807 +#endif
19808 +
19809  /**
19810   * hibernate - Carry out system hibernation, including saving the image.
19811   */
19812 @@ -685,6 +696,8 @@
19813                 return -EPERM;
19814         }
19815  
19816 +       pm_in_action = true;
19817 +
19818         lock_system_sleep();
19819         /* The snapshot device should not be opened while we're running */
19820         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
19821 @@ -763,6 +776,7 @@
19822         atomic_inc(&snapshot_device_available);
19823   Unlock:
19824         unlock_system_sleep();
19825 +       pm_in_action = false;
19826         pr_info("hibernation exit\n");
19827  
19828         return error;
19829 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/power/suspend.c linux-4.14/kernel/power/suspend.c
19830 --- linux-4.14.orig/kernel/power/suspend.c      2018-09-05 11:03:22.000000000 +0200
19831 +++ linux-4.14/kernel/power/suspend.c   2018-09-05 11:05:07.000000000 +0200
19832 @@ -428,6 +428,8 @@
19833         arch_suspend_disable_irqs();
19834         BUG_ON(!irqs_disabled());
19835  
19836 +       system_state = SYSTEM_SUSPEND;
19837 +
19838         error = syscore_suspend();
19839         if (!error) {
19840                 *wakeup = pm_wakeup_pending();
19841 @@ -443,6 +445,8 @@
19842                 syscore_resume();
19843         }
19844  
19845 +       system_state = SYSTEM_RUNNING;
19846 +
19847         arch_suspend_enable_irqs();
19848         BUG_ON(irqs_disabled());
19849  
19850 @@ -589,6 +593,8 @@
19851         return error;
19852  }
19853  
19854 +bool pm_in_action;
19855 +
19856  /**
19857   * pm_suspend - Externally visible function for suspending the system.
19858   * @state: System sleep state to enter.
19859 @@ -603,6 +609,7 @@
19860         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
19861                 return -EINVAL;
19862  
19863 +       pm_in_action = true;
19864         pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
19865         error = enter_state(state);
19866         if (error) {
19867 @@ -612,6 +619,7 @@
19868                 suspend_stats.success++;
19869         }
19870         pr_info("suspend exit\n");
19871 +       pm_in_action = false;
19872         return error;
19873  }
19874  EXPORT_SYMBOL(pm_suspend);
19875 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/printk/printk.c linux-4.14/kernel/printk/printk.c
19876 --- linux-4.14.orig/kernel/printk/printk.c      2017-11-12 19:46:13.000000000 +0100
19877 +++ linux-4.14/kernel/printk/printk.c   2018-09-05 11:05:07.000000000 +0200
19878 @@ -400,6 +400,65 @@
19879                 printk_safe_exit_irqrestore(flags);     \
19880         } while (0)
19881  
19882 +#ifdef CONFIG_EARLY_PRINTK
19883 +struct console *early_console;
19884 +
19885 +static void early_vprintk(const char *fmt, va_list ap)
19886 +{
19887 +       if (early_console) {
19888 +               char buf[512];
19889 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
19890 +
19891 +               early_console->write(early_console, buf, n);
19892 +       }
19893 +}
19894 +
19895 +asmlinkage void early_printk(const char *fmt, ...)
19896 +{
19897 +       va_list ap;
19898 +
19899 +       va_start(ap, fmt);
19900 +       early_vprintk(fmt, ap);
19901 +       va_end(ap);
19902 +}
19903 +
19904 +/*
19905 + * This is independent of any log levels - a global
19906 + * kill switch that turns off all of printk.
19907 + *
19908 + * Used by the NMI watchdog if early-printk is enabled.
19909 + */
19910 +static bool __read_mostly printk_killswitch;
19911 +
19912 +static int __init force_early_printk_setup(char *str)
19913 +{
19914 +       printk_killswitch = true;
19915 +       return 0;
19916 +}
19917 +early_param("force_early_printk", force_early_printk_setup);
19918 +
19919 +void printk_kill(void)
19920 +{
19921 +       printk_killswitch = true;
19922 +}
19923 +
19924 +#ifdef CONFIG_PRINTK
19925 +static int forced_early_printk(const char *fmt, va_list ap)
19926 +{
19927 +       if (!printk_killswitch)
19928 +               return 0;
19929 +       early_vprintk(fmt, ap);
19930 +       return 1;
19931 +}
19932 +#endif
19933 +
19934 +#else
19935 +static inline int forced_early_printk(const char *fmt, va_list ap)
19936 +{
19937 +       return 0;
19938 +}
19939 +#endif
19940 +
19941  #ifdef CONFIG_PRINTK
19942  DECLARE_WAIT_QUEUE_HEAD(log_wait);
19943  /* the next printk record to read by syslog(READ) or /proc/kmsg */
19944 @@ -1348,6 +1407,8 @@
19945  {
19946         char *text;
19947         int len = 0;
19948 +       int attempts = 0;
19949 +       int num_msg;
19950  
19951         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
19952         if (!text)
19953 @@ -1359,6 +1420,14 @@
19954                 u64 seq;
19955                 u32 idx;
19956  
19957 +try_again:
19958 +               attempts++;
19959 +               if (attempts > 10) {
19960 +                       len = -EBUSY;
19961 +                       goto out;
19962 +               }
19963 +               num_msg = 0;
19964 +
19965                 /*
19966                  * Find first record that fits, including all following records,
19967                  * into the user-provided buffer for this dump.
19968 @@ -1371,6 +1440,14 @@
19969                         len += msg_print_text(msg, true, NULL, 0);
19970                         idx = log_next(idx);
19971                         seq++;
19972 +                       num_msg++;
19973 +                       if (num_msg > 5) {
19974 +                               num_msg = 0;
19975 +                               logbuf_unlock_irq();
19976 +                               logbuf_lock_irq();
19977 +                               if (clear_seq < log_first_seq)
19978 +                                       goto try_again;
19979 +                       }
19980                 }
19981  
19982                 /* move first record forward until length fits into the buffer */
19983 @@ -1382,6 +1459,14 @@
19984                         len -= msg_print_text(msg, true, NULL, 0);
19985                         idx = log_next(idx);
19986                         seq++;
19987 +                       num_msg++;
19988 +                       if (num_msg > 5) {
19989 +                               num_msg = 0;
19990 +                               logbuf_unlock_irq();
19991 +                               logbuf_lock_irq();
19992 +                               if (clear_seq < log_first_seq)
19993 +                                       goto try_again;
19994 +                       }
19995                 }
19996  
19997                 /* last message fitting into this dump */
19998 @@ -1420,6 +1505,7 @@
19999                 clear_seq = log_next_seq;
20000                 clear_idx = log_next_idx;
20001         }
20002 +out:
20003         logbuf_unlock_irq();
20004  
20005         kfree(text);
20006 @@ -1558,6 +1644,12 @@
20007         if (!console_drivers)
20008                 return;
20009  
20010 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20011 +               if (in_irq() || in_nmi())
20012 +                       return;
20013 +       }
20014 +
20015 +       migrate_disable();
20016         for_each_console(con) {
20017                 if (exclusive_console && con != exclusive_console)
20018                         continue;
20019 @@ -1573,6 +1665,7 @@
20020                 else
20021                         con->write(con, text, len);
20022         }
20023 +       migrate_enable();
20024  }
20025  
20026  int printk_delay_msec __read_mostly;
20027 @@ -1692,6 +1785,13 @@
20028         int printed_len;
20029         bool in_sched = false;
20030  
20031 +       /*
20032 +        * Fall back to early_printk if a debugging subsystem has
20033 +        * killed printk output
20034 +        */
20035 +       if (unlikely(forced_early_printk(fmt, args)))
20036 +               return 1;
20037 +
20038         if (level == LOGLEVEL_SCHED) {
20039                 level = LOGLEVEL_DEFAULT;
20040                 in_sched = true;
20041 @@ -1748,12 +1848,22 @@
20042  
20043         /* If called from the scheduler, we can not call up(). */
20044         if (!in_sched) {
20045 +               int may_trylock = 1;
20046 +
20047 +#ifdef CONFIG_PREEMPT_RT_FULL
20048 +               /*
20049 +                * we can't take a sleeping lock with IRQs or preemption disabled,
20050 +                * so we can't print in these contexts
20051 +                */
20052 +               if (!(preempt_count() == 0 && !irqs_disabled()))
20053 +                       may_trylock = 0;
20054 +#endif
20055                 /*
20056                  * Try to acquire and then immediately release the console
20057                  * semaphore.  The release will print out buffers and wake up
20058                  * /dev/kmsg and syslog() users.
20059                  */
20060 -               if (console_trylock())
20061 +               if (may_trylock && console_trylock())
20062                         console_unlock();
20063         }
20064  
20065 @@ -1863,26 +1973,6 @@
20066  
20067  #endif /* CONFIG_PRINTK */
20068  
20069 -#ifdef CONFIG_EARLY_PRINTK
20070 -struct console *early_console;
20071 -
20072 -asmlinkage __visible void early_printk(const char *fmt, ...)
20073 -{
20074 -       va_list ap;
20075 -       char buf[512];
20076 -       int n;
20077 -
20078 -       if (!early_console)
20079 -               return;
20080 -
20081 -       va_start(ap, fmt);
20082 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
20083 -       va_end(ap);
20084 -
20085 -       early_console->write(early_console, buf, n);
20086 -}
20087 -#endif
20088 -
20089  static int __add_preferred_console(char *name, int idx, char *options,
20090                                    char *brl_options)
20091  {
20092 @@ -2229,10 +2319,15 @@
20093                 console_seq++;
20094                 raw_spin_unlock(&logbuf_lock);
20095  
20096 +#ifdef CONFIG_PREEMPT_RT_FULL
20097 +               printk_safe_exit_irqrestore(flags);
20098 +               call_console_drivers(ext_text, ext_len, text, len);
20099 +#else
20100                 stop_critical_timings();        /* don't trace print latency */
20101                 call_console_drivers(ext_text, ext_len, text, len);
20102                 start_critical_timings();
20103                 printk_safe_exit_irqrestore(flags);
20104 +#endif
20105  
20106                 if (do_cond_resched)
20107                         cond_resched();
20108 @@ -2286,6 +2381,11 @@
20109  {
20110         struct console *c;
20111  
20112 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20113 +               if (in_irq() || in_nmi())
20114 +                       return;
20115 +       }
20116 +
20117         /*
20118          * console_unblank can no longer be called in interrupt context unless
20119          * oops_in_progress is set to 1..
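
The printk changes above route every message through a killswitch check first: once printk_kill() (or the force_early_printk= parameter) sets the flag, vprintk_emit() bypasses the log buffer and console locking and writes through the early console instead. A minimal userspace sketch of that fallback shape, with made-up names and stderr standing in for early_console->write():

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static bool killswitch;                 /* set by a debugging facility */

static void raw_sink(const char *buf)   /* stand-in for early_console->write() */
{
        fputs(buf, stderr);
}

/* Returns 1 if the message was diverted, mirroring the forced-early-print idea. */
static int forced_early_print(const char *fmt, va_list ap)
{
        char buf[512];

        if (!killswitch)
                return 0;
        vsnprintf(buf, sizeof(buf), fmt, ap);
        raw_sink(buf);
        return 1;
}

static void log_emit(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        if (forced_early_print(fmt, ap)) {      /* killswitch set: skip the normal path */
                va_end(ap);
                return;
        }
        vprintf(fmt, ap);                       /* "normal" buffered path */
        va_end(ap);
}

int main(void)
{
        log_emit("normal path\n");
        killswitch = true;                      /* like printk_kill() */
        log_emit("diverted to the raw console\n");
        return 0;
}
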
20120 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/ptrace.c linux-4.14/kernel/ptrace.c
20121 --- linux-4.14.orig/kernel/ptrace.c     2017-11-12 19:46:13.000000000 +0100
20122 +++ linux-4.14/kernel/ptrace.c  2018-09-05 11:05:07.000000000 +0200
20123 @@ -175,7 +175,14 @@
20124  
20125         spin_lock_irq(&task->sighand->siglock);
20126         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20127 -               task->state = __TASK_TRACED;
20128 +               unsigned long flags;
20129 +
20130 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
20131 +               if (task->state & __TASK_TRACED)
20132 +                       task->state = __TASK_TRACED;
20133 +               else
20134 +                       task->saved_state = __TASK_TRACED;
20135 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20136                 ret = true;
20137         }
20138         spin_unlock_irq(&task->sighand->siglock);
20139 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/Kconfig linux-4.14/kernel/rcu/Kconfig
20140 --- linux-4.14.orig/kernel/rcu/Kconfig  2017-11-12 19:46:13.000000000 +0100
20141 +++ linux-4.14/kernel/rcu/Kconfig       2018-09-05 11:05:07.000000000 +0200
20142 @@ -36,7 +36,7 @@
20143  
20144  config RCU_EXPERT
20145         bool "Make expert-level adjustments to RCU configuration"
20146 -       default n
20147 +       default y if PREEMPT_RT_FULL
20148         help
20149           This option needs to be enabled if you wish to make
20150           expert-level adjustments to RCU configuration.  By default,
20151 @@ -172,7 +172,7 @@
20152  
20153  config RCU_FAST_NO_HZ
20154         bool "Accelerate last non-dyntick-idle CPU's grace periods"
20155 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20156 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20157         default n
20158         help
20159           This option permits CPUs to enter dynticks-idle state even if
20160 @@ -191,7 +191,7 @@
20161  config RCU_BOOST
20162         bool "Enable RCU priority boosting"
20163         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20164 -       default n
20165 +       default y if PREEMPT_RT_FULL
20166         help
20167           This option boosts the priority of preempted RCU readers that
20168           block the current preemptible RCU grace period for too long.
20169 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu.h linux-4.14/kernel/rcu/rcu.h
20170 --- linux-4.14.orig/kernel/rcu/rcu.h    2017-11-12 19:46:13.000000000 +0100
20171 +++ linux-4.14/kernel/rcu/rcu.h 2018-09-05 11:05:07.000000000 +0200
20172 @@ -462,18 +462,26 @@
20173  extern unsigned long rcutorture_testseq;
20174  extern unsigned long rcutorture_vernum;
20175  unsigned long rcu_batches_started(void);
20176 -unsigned long rcu_batches_started_bh(void);
20177  unsigned long rcu_batches_started_sched(void);
20178  unsigned long rcu_batches_completed(void);
20179 -unsigned long rcu_batches_completed_bh(void);
20180  unsigned long rcu_batches_completed_sched(void);
20181  unsigned long rcu_exp_batches_completed(void);
20182  unsigned long rcu_exp_batches_completed_sched(void);
20183  unsigned long srcu_batches_completed(struct srcu_struct *sp);
20184  void show_rcu_gp_kthreads(void);
20185  void rcu_force_quiescent_state(void);
20186 -void rcu_bh_force_quiescent_state(void);
20187  void rcu_sched_force_quiescent_state(void);
20188 +
20189 +#ifndef CONFIG_PREEMPT_RT_FULL
20190 +void rcu_bh_force_quiescent_state(void);
20191 +unsigned long rcu_batches_started_bh(void);
20192 +unsigned long rcu_batches_completed_bh(void);
20193 +#else
20194 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
20195 +# define rcu_batches_completed_bh      rcu_batches_completed
20196 +# define rcu_batches_started_bh                rcu_batches_completed
20197 +#endif
20198 +
20199  #endif /* #else #ifdef CONFIG_TINY_RCU */
20200  
20201  #ifdef CONFIG_RCU_NOCB_CPU
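
Under PREEMPT_RT_FULL the rcu.h hunk above folds the softirq (_bh) flavor into the normal RCU flavor purely with preprocessor aliases, so existing callers keep compiling while both names resolve to the same implementation. The shape of that trick, reduced to a hedged standalone example with invented names:

#include <stdio.h>

#define RT_FULL 1       /* flip to 0 to mimic a non-RT build */

static void flush_normal(void) { puts("normal flavor"); }

#if !RT_FULL
static void flush_bh(void)     { puts("separate bh flavor"); }
#else
/* RT: the bh flavor is only an alias for the normal one. */
# define flush_bh flush_normal
#endif

int main(void)
{
        flush_normal();
        flush_bh();     /* with RT_FULL this calls the very same function */
        return 0;
}
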
20202 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcu_segcblist.c linux-4.14/kernel/rcu/rcu_segcblist.c
20203 --- linux-4.14.orig/kernel/rcu/rcu_segcblist.c  2017-11-12 19:46:13.000000000 +0100
20204 +++ linux-4.14/kernel/rcu/rcu_segcblist.c       2018-09-05 11:05:07.000000000 +0200
20205 @@ -23,6 +23,7 @@
20206  #include <linux/types.h>
20207  #include <linux/kernel.h>
20208  #include <linux/interrupt.h>
20209 +#include <linux/rcupdate.h>
20210  
20211  #include "rcu_segcblist.h"
20212  
20213 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/rcutorture.c linux-4.14/kernel/rcu/rcutorture.c
20214 --- linux-4.14.orig/kernel/rcu/rcutorture.c     2017-11-12 19:46:13.000000000 +0100
20215 +++ linux-4.14/kernel/rcu/rcutorture.c  2018-09-05 11:05:07.000000000 +0200
20216 @@ -417,6 +417,7 @@
20217         .name           = "rcu"
20218  };
20219  
20220 +#ifndef CONFIG_PREEMPT_RT_FULL
20221  /*
20222   * Definitions for rcu_bh torture testing.
20223   */
20224 @@ -456,6 +457,12 @@
20225         .name           = "rcu_bh"
20226  };
20227  
20228 +#else
20229 +static struct rcu_torture_ops rcu_bh_ops = {
20230 +       .ttype          = INVALID_RCU_FLAVOR,
20231 +};
20232 +#endif
20233 +
20234  /*
20235   * Don't even think about trying any of these in real life!!!
20236   * The names includes "busted", and they really means it!
20237 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/srcutree.c linux-4.14/kernel/rcu/srcutree.c
20238 --- linux-4.14.orig/kernel/rcu/srcutree.c       2017-11-12 19:46:13.000000000 +0100
20239 +++ linux-4.14/kernel/rcu/srcutree.c    2018-09-05 11:05:07.000000000 +0200
20240 @@ -36,6 +36,8 @@
20241  #include <linux/delay.h>
20242  #include <linux/module.h>
20243  #include <linux/srcu.h>
20244 +#include <linux/cpu.h>
20245 +#include <linux/locallock.h>
20246  
20247  #include "rcu.h"
20248  #include "rcu_segcblist.h"
20249 @@ -53,6 +55,33 @@
20250  static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20251  static void process_srcu(struct work_struct *work);
20252  
20253 +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20254 +#define spin_lock_rcu_node(p)                                  \
20255 +do {                                                                   \
20256 +       spin_lock(&ACCESS_PRIVATE(p, lock));                    \
20257 +       smp_mb__after_unlock_lock();                                    \
20258 +} while (0)
20259 +
20260 +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20261 +
20262 +#define spin_lock_irq_rcu_node(p)                                      \
20263 +do {                                                                   \
20264 +       spin_lock_irq(&ACCESS_PRIVATE(p, lock));                        \
20265 +       smp_mb__after_unlock_lock();                                    \
20266 +} while (0)
20267 +
20268 +#define spin_unlock_irq_rcu_node(p)                                    \
20269 +       spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20270 +
20271 +#define spin_lock_irqsave_rcu_node(p, flags)                   \
20272 +do {                                                                   \
20273 +       spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);     \
20274 +       smp_mb__after_unlock_lock();                                    \
20275 +} while (0)
20276 +
20277 +#define spin_unlock_irqrestore_rcu_node(p, flags)                      \
20278 +       spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20279 +
20280  /*
20281   * Initialize SRCU combining tree.  Note that statically allocated
20282   * srcu_struct structures might already have srcu_read_lock() and
20283 @@ -77,7 +106,7 @@
20284  
20285         /* Each pass through this loop initializes one srcu_node structure. */
20286         rcu_for_each_node_breadth_first(sp, snp) {
20287 -               raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20288 +               spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20289                 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20290                              ARRAY_SIZE(snp->srcu_data_have_cbs));
20291                 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
20292 @@ -111,7 +140,7 @@
20293         snp_first = sp->level[level];
20294         for_each_possible_cpu(cpu) {
20295                 sdp = per_cpu_ptr(sp->sda, cpu);
20296 -               raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20297 +               spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20298                 rcu_segcblist_init(&sdp->srcu_cblist);
20299                 sdp->srcu_cblist_invoking = false;
20300                 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
20301 @@ -170,7 +199,7 @@
20302         /* Don't re-initialize a lock while it is held. */
20303         debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20304         lockdep_init_map(&sp->dep_map, name, key, 0);
20305 -       raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20306 +       spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20307         return init_srcu_struct_fields(sp, false);
20308  }
20309  EXPORT_SYMBOL_GPL(__init_srcu_struct);
20310 @@ -187,7 +216,7 @@
20311   */
20312  int init_srcu_struct(struct srcu_struct *sp)
20313  {
20314 -       raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20315 +       spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20316         return init_srcu_struct_fields(sp, false);
20317  }
20318  EXPORT_SYMBOL_GPL(init_srcu_struct);
20319 @@ -210,13 +239,13 @@
20320         /* The smp_load_acquire() pairs with the smp_store_release(). */
20321         if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20322                 return; /* Already initialized. */
20323 -       raw_spin_lock_irqsave_rcu_node(sp, flags);
20324 +       spin_lock_irqsave_rcu_node(sp, flags);
20325         if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20326 -               raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20327 +               spin_unlock_irqrestore_rcu_node(sp, flags);
20328                 return;
20329         }
20330         init_srcu_struct_fields(sp, true);
20331 -       raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20332 +       spin_unlock_irqrestore_rcu_node(sp, flags);
20333  }
20334  
20335  /*
20336 @@ -425,21 +454,6 @@
20337  }
20338  
20339  /*
20340 - * Track online CPUs to guide callback workqueue placement.
20341 - */
20342 -DEFINE_PER_CPU(bool, srcu_online);
20343 -
20344 -void srcu_online_cpu(unsigned int cpu)
20345 -{
20346 -       WRITE_ONCE(per_cpu(srcu_online, cpu), true);
20347 -}
20348 -
20349 -void srcu_offline_cpu(unsigned int cpu)
20350 -{
20351 -       WRITE_ONCE(per_cpu(srcu_online, cpu), false);
20352 -}
20353 -
20354 -/*
20355   * Place the workqueue handler on the specified CPU if online, otherwise
20356   * just run it whereever.  This is useful for placing workqueue handlers
20357   * that are to invoke the specified CPU's callbacks.
20358 @@ -450,12 +464,12 @@
20359  {
20360         bool ret;
20361  
20362 -       preempt_disable();
20363 -       if (READ_ONCE(per_cpu(srcu_online, cpu)))
20364 +       cpus_read_lock();
20365 +       if (cpu_online(cpu))
20366                 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
20367         else
20368                 ret = queue_delayed_work(wq, dwork, delay);
20369 -       preempt_enable();
20370 +       cpus_read_unlock();
20371         return ret;
20372  }
20373  
20374 @@ -513,7 +527,7 @@
20375         mutex_lock(&sp->srcu_cb_mutex);
20376  
20377         /* End the current grace period. */
20378 -       raw_spin_lock_irq_rcu_node(sp);
20379 +       spin_lock_irq_rcu_node(sp);
20380         idx = rcu_seq_state(sp->srcu_gp_seq);
20381         WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
20382         cbdelay = srcu_get_delay(sp);
20383 @@ -522,7 +536,7 @@
20384         gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20385         if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
20386                 sp->srcu_gp_seq_needed_exp = gpseq;
20387 -       raw_spin_unlock_irq_rcu_node(sp);
20388 +       spin_unlock_irq_rcu_node(sp);
20389         mutex_unlock(&sp->srcu_gp_mutex);
20390         /* A new grace period can start at this point.  But only one. */
20391  
20392 @@ -530,7 +544,7 @@
20393         idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
20394         idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
20395         rcu_for_each_node_breadth_first(sp, snp) {
20396 -               raw_spin_lock_irq_rcu_node(snp);
20397 +               spin_lock_irq_rcu_node(snp);
20398                 cbs = false;
20399                 if (snp >= sp->level[rcu_num_lvls - 1])
20400                         cbs = snp->srcu_have_cbs[idx] == gpseq;
20401 @@ -540,7 +554,7 @@
20402                         snp->srcu_gp_seq_needed_exp = gpseq;
20403                 mask = snp->srcu_data_have_cbs[idx];
20404                 snp->srcu_data_have_cbs[idx] = 0;
20405 -               raw_spin_unlock_irq_rcu_node(snp);
20406 +               spin_unlock_irq_rcu_node(snp);
20407                 if (cbs)
20408                         srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
20409  
20410 @@ -548,11 +562,11 @@
20411                 if (!(gpseq & counter_wrap_check))
20412                         for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
20413                                 sdp = per_cpu_ptr(sp->sda, cpu);
20414 -                               raw_spin_lock_irqsave_rcu_node(sdp, flags);
20415 +                               spin_lock_irqsave_rcu_node(sdp, flags);
20416                                 if (ULONG_CMP_GE(gpseq,
20417                                                  sdp->srcu_gp_seq_needed + 100))
20418                                         sdp->srcu_gp_seq_needed = gpseq;
20419 -                               raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20420 +                               spin_unlock_irqrestore_rcu_node(sdp, flags);
20421                         }
20422         }
20423  
20424 @@ -560,17 +574,17 @@
20425         mutex_unlock(&sp->srcu_cb_mutex);
20426  
20427         /* Start a new grace period if needed. */
20428 -       raw_spin_lock_irq_rcu_node(sp);
20429 +       spin_lock_irq_rcu_node(sp);
20430         gpseq = rcu_seq_current(&sp->srcu_gp_seq);
20431         if (!rcu_seq_state(gpseq) &&
20432             ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
20433                 srcu_gp_start(sp);
20434 -               raw_spin_unlock_irq_rcu_node(sp);
20435 +               spin_unlock_irq_rcu_node(sp);
20436                 /* Throttle expedited grace periods: Should be rare! */
20437                 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
20438                                     ? 0 : SRCU_INTERVAL);
20439         } else {
20440 -               raw_spin_unlock_irq_rcu_node(sp);
20441 +               spin_unlock_irq_rcu_node(sp);
20442         }
20443  }
20444  
20445 @@ -590,18 +604,18 @@
20446                 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
20447                     ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
20448                         return;
20449 -               raw_spin_lock_irqsave_rcu_node(snp, flags);
20450 +               spin_lock_irqsave_rcu_node(snp, flags);
20451                 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
20452 -                       raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20453 +                       spin_unlock_irqrestore_rcu_node(snp, flags);
20454                         return;
20455                 }
20456                 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
20457 -               raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20458 +               spin_unlock_irqrestore_rcu_node(snp, flags);
20459         }
20460 -       raw_spin_lock_irqsave_rcu_node(sp, flags);
20461 +       spin_lock_irqsave_rcu_node(sp, flags);
20462         if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
20463                 sp->srcu_gp_seq_needed_exp = s;
20464 -       raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20465 +       spin_unlock_irqrestore_rcu_node(sp, flags);
20466  }
20467  
20468  /*
20469 @@ -623,12 +637,12 @@
20470         for (; snp != NULL; snp = snp->srcu_parent) {
20471                 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
20472                         return; /* GP already done and CBs recorded. */
20473 -               raw_spin_lock_irqsave_rcu_node(snp, flags);
20474 +               spin_lock_irqsave_rcu_node(snp, flags);
20475                 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
20476                         snp_seq = snp->srcu_have_cbs[idx];
20477                         if (snp == sdp->mynode && snp_seq == s)
20478                                 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20479 -                       raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20480 +                       spin_unlock_irqrestore_rcu_node(snp, flags);
20481                         if (snp == sdp->mynode && snp_seq != s) {
20482                                 srcu_schedule_cbs_sdp(sdp, do_norm
20483                                                            ? SRCU_INTERVAL
20484 @@ -644,11 +658,11 @@
20485                         snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
20486                 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
20487                         snp->srcu_gp_seq_needed_exp = s;
20488 -               raw_spin_unlock_irqrestore_rcu_node(snp, flags);
20489 +               spin_unlock_irqrestore_rcu_node(snp, flags);
20490         }
20491  
20492         /* Top of tree, must ensure the grace period will be started. */
20493 -       raw_spin_lock_irqsave_rcu_node(sp, flags);
20494 +       spin_lock_irqsave_rcu_node(sp, flags);
20495         if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
20496                 /*
20497                  * Record need for grace period s.  Pair with load
20498 @@ -667,7 +681,7 @@
20499                 queue_delayed_work(system_power_efficient_wq, &sp->work,
20500                                    srcu_get_delay(sp));
20501         }
20502 -       raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20503 +       spin_unlock_irqrestore_rcu_node(sp, flags);
20504  }
20505  
20506  /*
20507 @@ -736,6 +750,8 @@
20508   * negligible when amoritized over that time period, and the extra latency
20509   * of a needlessly non-expedited grace period is similarly negligible.
20510   */
20511 +static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
20512 +
20513  static bool srcu_might_be_idle(struct srcu_struct *sp)
20514  {
20515         unsigned long curseq;
20516 @@ -744,13 +760,13 @@
20517         unsigned long t;
20518  
20519         /* If the local srcu_data structure has callbacks, not idle.  */
20520 -       local_irq_save(flags);
20521 +       local_lock_irqsave(sp_llock, flags);
20522         sdp = this_cpu_ptr(sp->sda);
20523         if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
20524 -               local_irq_restore(flags);
20525 +               local_unlock_irqrestore(sp_llock, flags);
20526                 return false; /* Callbacks already present, so not idle. */
20527         }
20528 -       local_irq_restore(flags);
20529 +       local_unlock_irqrestore(sp_llock, flags);
20530  
20531         /*
20532          * No local callbacks, so probabalistically probe global state.
20533 @@ -828,9 +844,9 @@
20534                 return;
20535         }
20536         rhp->func = func;
20537 -       local_irq_save(flags);
20538 +       local_lock_irqsave(sp_llock, flags);
20539         sdp = this_cpu_ptr(sp->sda);
20540 -       raw_spin_lock_rcu_node(sdp);
20541 +       spin_lock_rcu_node(sdp);
20542         rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
20543         rcu_segcblist_advance(&sdp->srcu_cblist,
20544                               rcu_seq_current(&sp->srcu_gp_seq));
20545 @@ -844,7 +860,8 @@
20546                 sdp->srcu_gp_seq_needed_exp = s;
20547                 needexp = true;
20548         }
20549 -       raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
20550 +       spin_unlock_rcu_node(sdp);
20551 +       local_unlock_irqrestore(sp_llock, flags);
20552         if (needgp)
20553                 srcu_funnel_gp_start(sp, sdp, s, do_norm);
20554         else if (needexp)
20555 @@ -900,7 +917,7 @@
20556  
20557         /*
20558          * Make sure that later code is ordered after the SRCU grace
20559 -        * period.  This pairs with the raw_spin_lock_irq_rcu_node()
20560 +        * period.  This pairs with the spin_lock_irq_rcu_node()
20561          * in srcu_invoke_callbacks().  Unlike Tree RCU, this is needed
20562          * because the current CPU might have been totally uninvolved with
20563          * (and thus unordered against) that grace period.
20564 @@ -1024,7 +1041,7 @@
20565          */
20566         for_each_possible_cpu(cpu) {
20567                 sdp = per_cpu_ptr(sp->sda, cpu);
20568 -               raw_spin_lock_irq_rcu_node(sdp);
20569 +               spin_lock_irq_rcu_node(sdp);
20570                 atomic_inc(&sp->srcu_barrier_cpu_cnt);
20571                 sdp->srcu_barrier_head.func = srcu_barrier_cb;
20572                 debug_rcu_head_queue(&sdp->srcu_barrier_head);
20573 @@ -1033,7 +1050,7 @@
20574                         debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
20575                         atomic_dec(&sp->srcu_barrier_cpu_cnt);
20576                 }
20577 -               raw_spin_unlock_irq_rcu_node(sdp);
20578 +               spin_unlock_irq_rcu_node(sdp);
20579         }
20580  
20581         /* Remove the initial count, at which point reaching zero can happen. */
20582 @@ -1082,17 +1099,17 @@
20583          */
20584         idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
20585         if (idx == SRCU_STATE_IDLE) {
20586 -               raw_spin_lock_irq_rcu_node(sp);
20587 +               spin_lock_irq_rcu_node(sp);
20588                 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20589                         WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
20590 -                       raw_spin_unlock_irq_rcu_node(sp);
20591 +                       spin_unlock_irq_rcu_node(sp);
20592                         mutex_unlock(&sp->srcu_gp_mutex);
20593                         return;
20594                 }
20595                 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
20596                 if (idx == SRCU_STATE_IDLE)
20597                         srcu_gp_start(sp);
20598 -               raw_spin_unlock_irq_rcu_node(sp);
20599 +               spin_unlock_irq_rcu_node(sp);
20600                 if (idx != SRCU_STATE_IDLE) {
20601                         mutex_unlock(&sp->srcu_gp_mutex);
20602                         return; /* Someone else started the grace period. */
20603 @@ -1141,19 +1158,19 @@
20604         sdp = container_of(work, struct srcu_data, work.work);
20605         sp = sdp->sp;
20606         rcu_cblist_init(&ready_cbs);
20607 -       raw_spin_lock_irq_rcu_node(sdp);
20608 +       spin_lock_irq_rcu_node(sdp);
20609         rcu_segcblist_advance(&sdp->srcu_cblist,
20610                               rcu_seq_current(&sp->srcu_gp_seq));
20611         if (sdp->srcu_cblist_invoking ||
20612             !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
20613 -               raw_spin_unlock_irq_rcu_node(sdp);
20614 +               spin_unlock_irq_rcu_node(sdp);
20615                 return;  /* Someone else on the job or nothing to do. */
20616         }
20617  
20618         /* We are on the job!  Extract and invoke ready callbacks. */
20619         sdp->srcu_cblist_invoking = true;
20620         rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
20621 -       raw_spin_unlock_irq_rcu_node(sdp);
20622 +       spin_unlock_irq_rcu_node(sdp);
20623         rhp = rcu_cblist_dequeue(&ready_cbs);
20624         for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
20625                 debug_rcu_head_unqueue(rhp);
20626 @@ -1166,13 +1183,13 @@
20627          * Update counts, accelerate new callbacks, and if needed,
20628          * schedule another round of callback invocation.
20629          */
20630 -       raw_spin_lock_irq_rcu_node(sdp);
20631 +       spin_lock_irq_rcu_node(sdp);
20632         rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
20633         (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
20634                                        rcu_seq_snap(&sp->srcu_gp_seq));
20635         sdp->srcu_cblist_invoking = false;
20636         more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
20637 -       raw_spin_unlock_irq_rcu_node(sdp);
20638 +       spin_unlock_irq_rcu_node(sdp);
20639         if (more)
20640                 srcu_schedule_cbs_sdp(sdp, 0);
20641  }
20642 @@ -1185,7 +1202,7 @@
20643  {
20644         bool pushgp = true;
20645  
20646 -       raw_spin_lock_irq_rcu_node(sp);
20647 +       spin_lock_irq_rcu_node(sp);
20648         if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
20649                 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
20650                         /* All requests fulfilled, time to go idle. */
20651 @@ -1195,7 +1212,7 @@
20652                 /* Outstanding request and no GP.  Start one. */
20653                 srcu_gp_start(sp);
20654         }
20655 -       raw_spin_unlock_irq_rcu_node(sp);
20656 +       spin_unlock_irq_rcu_node(sp);
20657  
20658         if (pushgp)
20659                 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
20660 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.c linux-4.14/kernel/rcu/tree.c
20661 --- linux-4.14.orig/kernel/rcu/tree.c   2017-11-12 19:46:13.000000000 +0100
20662 +++ linux-4.14/kernel/rcu/tree.c        2018-09-05 11:05:07.000000000 +0200
20663 @@ -58,6 +58,11 @@
20664  #include <linux/trace_events.h>
20665  #include <linux/suspend.h>
20666  #include <linux/ftrace.h>
20667 +#include <linux/delay.h>
20668 +#include <linux/gfp.h>
20669 +#include <linux/oom.h>
20670 +#include <linux/smpboot.h>
20671 +#include "../time/tick-internal.h"
20672  
20673  #include "tree.h"
20674  #include "rcu.h"
20675 @@ -243,6 +248,19 @@
20676                            this_cpu_ptr(&rcu_sched_data), true);
20677  }
20678  
20679 +#ifdef CONFIG_PREEMPT_RT_FULL
20680 +static void rcu_preempt_qs(void);
20681 +
20682 +void rcu_bh_qs(void)
20683 +{
20684 +       unsigned long flags;
20685 +
20686 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
20687 +       local_irq_save(flags);
20688 +       rcu_preempt_qs();
20689 +       local_irq_restore(flags);
20690 +}
20691 +#else
20692  void rcu_bh_qs(void)
20693  {
20694         RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
20695 @@ -253,6 +271,7 @@
20696                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20697         }
20698  }
20699 +#endif
20700  
20701  /*
20702   * Steal a bit from the bottom of ->dynticks for idle entry/exit
20703 @@ -564,11 +583,13 @@
20704  /*
20705   * Return the number of RCU BH batches started thus far for debug & stats.
20706   */
20707 +#ifndef CONFIG_PREEMPT_RT_FULL
20708  unsigned long rcu_batches_started_bh(void)
20709  {
20710         return rcu_bh_state.gpnum;
20711  }
20712  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20713 +#endif
20714  
20715  /*
20716   * Return the number of RCU batches completed thus far for debug & stats.
20717 @@ -588,6 +609,7 @@
20718  }
20719  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20720  
20721 +#ifndef CONFIG_PREEMPT_RT_FULL
20722  /*
20723   * Return the number of RCU BH batches completed thus far for debug & stats.
20724   */
20725 @@ -596,6 +618,7 @@
20726         return rcu_bh_state.completed;
20727  }
20728  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
20729 +#endif
20730  
20731  /*
20732   * Return the number of RCU expedited batches completed thus far for
20733 @@ -619,6 +642,7 @@
20734  }
20735  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
20736  
20737 +#ifndef CONFIG_PREEMPT_RT_FULL
20738  /*
20739   * Force a quiescent state.
20740   */
20741 @@ -637,6 +661,13 @@
20742  }
20743  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
20744  
20745 +#else
20746 +void rcu_force_quiescent_state(void)
20747 +{
20748 +}
20749 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
20750 +#endif
20751 +
20752  /*
20753   * Force a quiescent state for RCU-sched.
20754   */
20755 @@ -687,9 +718,11 @@
20756         case RCU_FLAVOR:
20757                 rsp = rcu_state_p;
20758                 break;
20759 +#ifndef CONFIG_PREEMPT_RT_FULL
20760         case RCU_BH_FLAVOR:
20761                 rsp = &rcu_bh_state;
20762                 break;
20763 +#endif
20764         case RCU_SCHED_FLAVOR:
20765                 rsp = &rcu_sched_state;
20766                 break;
20767 @@ -2918,18 +2951,17 @@
20768  /*
20769   * Do RCU core processing for the current CPU.
20770   */
20771 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
20772 +static __latent_entropy void rcu_process_callbacks(void)
20773  {
20774         struct rcu_state *rsp;
20775  
20776         if (cpu_is_offline(smp_processor_id()))
20777                 return;
20778 -       trace_rcu_utilization(TPS("Start RCU core"));
20779         for_each_rcu_flavor(rsp)
20780                 __rcu_process_callbacks(rsp);
20781 -       trace_rcu_utilization(TPS("End RCU core"));
20782  }
20783  
20784 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
20785  /*
20786   * Schedule RCU callback invocation.  If the specified type of RCU
20787   * does not support RCU priority boosting, just do a direct call,
20788 @@ -2941,18 +2973,105 @@
20789  {
20790         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
20791                 return;
20792 -       if (likely(!rsp->boost)) {
20793 -               rcu_do_batch(rsp, rdp);
20794 +       rcu_do_batch(rsp, rdp);
20795 +}
20796 +
20797 +static void rcu_wake_cond(struct task_struct *t, int status)
20798 +{
20799 +       /*
20800 +        * If the thread is yielding, only wake it when this
20801 +        * is invoked from idle
20802 +        */
20803 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
20804 +               wake_up_process(t);
20805 +}
20806 +
20807 +/*
20808 + * Wake up this CPU's rcuc kthread to do RCU core processing.
20809 + */
20810 +static void invoke_rcu_core(void)
20811 +{
20812 +       unsigned long flags;
20813 +       struct task_struct *t;
20814 +
20815 +       if (!cpu_online(smp_processor_id()))
20816                 return;
20817 +       local_irq_save(flags);
20818 +       __this_cpu_write(rcu_cpu_has_work, 1);
20819 +       t = __this_cpu_read(rcu_cpu_kthread_task);
20820 +       if (t != NULL && current != t)
20821 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
20822 +       local_irq_restore(flags);
20823 +}
20824 +
20825 +static void rcu_cpu_kthread_park(unsigned int cpu)
20826 +{
20827 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
20828 +}
20829 +
20830 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
20831 +{
20832 +       return __this_cpu_read(rcu_cpu_has_work);
20833 +}
20834 +
20835 +/*
20836 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
20837 + * RCU softirq used in flavors and configurations of RCU that do not
20838 + * support RCU priority boosting.
20839 + */
20840 +static void rcu_cpu_kthread(unsigned int cpu)
20841 +{
20842 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
20843 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
20844 +       int spincnt;
20845 +
20846 +       for (spincnt = 0; spincnt < 10; spincnt++) {
20847 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
20848 +               local_bh_disable();
20849 +               *statusp = RCU_KTHREAD_RUNNING;
20850 +               this_cpu_inc(rcu_cpu_kthread_loops);
20851 +               local_irq_disable();
20852 +               work = *workp;
20853 +               *workp = 0;
20854 +               local_irq_enable();
20855 +               if (work)
20856 +                       rcu_process_callbacks();
20857 +               local_bh_enable();
20858 +               if (*workp == 0) {
20859 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
20860 +                       *statusp = RCU_KTHREAD_WAITING;
20861 +                       return;
20862 +               }
20863         }
20864 -       invoke_rcu_callbacks_kthread();
20865 +       *statusp = RCU_KTHREAD_YIELDING;
20866 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
20867 +       schedule_timeout_interruptible(2);
20868 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
20869 +       *statusp = RCU_KTHREAD_WAITING;
20870  }
20871  
20872 -static void invoke_rcu_core(void)
20873 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
20874 +       .store                  = &rcu_cpu_kthread_task,
20875 +       .thread_should_run      = rcu_cpu_kthread_should_run,
20876 +       .thread_fn              = rcu_cpu_kthread,
20877 +       .thread_comm            = "rcuc/%u",
20878 +       .setup                  = rcu_cpu_kthread_setup,
20879 +       .park                   = rcu_cpu_kthread_park,
20880 +};
20881 +
20882 +/*
20883 + * Spawn per-CPU RCU core processing kthreads.
20884 + */
20885 +static int __init rcu_spawn_core_kthreads(void)
20886  {
20887 -       if (cpu_online(smp_processor_id()))
20888 -               raise_softirq(RCU_SOFTIRQ);
20889 +       int cpu;
20890 +
20891 +       for_each_possible_cpu(cpu)
20892 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
20893 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
20894 +       return 0;
20895  }
20896 +early_initcall(rcu_spawn_core_kthreads);
20897  
20898  /*
20899   * Handle any core-RCU processing required by a call_rcu() invocation.
20900 @@ -3113,6 +3232,7 @@
20901  }
20902  EXPORT_SYMBOL_GPL(call_rcu_sched);
20903  
20904 +#ifndef CONFIG_PREEMPT_RT_FULL
20905  /**
20906   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
20907   * @head: structure to be used for queueing the RCU updates.
20908 @@ -3140,6 +3260,7 @@
20909         __call_rcu(head, func, &rcu_bh_state, -1, 0);
20910  }
20911  EXPORT_SYMBOL_GPL(call_rcu_bh);
20912 +#endif
20913  
20914  /*
20915   * Queue an RCU callback for lazy invocation after a grace period.
20916 @@ -3225,6 +3346,7 @@
20917  }
20918  EXPORT_SYMBOL_GPL(synchronize_sched);
20919  
20920 +#ifndef CONFIG_PREEMPT_RT_FULL
20921  /**
20922   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
20923   *
20924 @@ -3251,6 +3373,7 @@
20925                 wait_rcu_gp(call_rcu_bh);
20926  }
20927  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
20928 +#endif
20929  
20930  /**
20931   * get_state_synchronize_rcu - Snapshot current RCU state
20932 @@ -3601,6 +3724,7 @@
20933         mutex_unlock(&rsp->barrier_mutex);
20934  }
20935  
20936 +#ifndef CONFIG_PREEMPT_RT_FULL
20937  /**
20938   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
20939   */
20940 @@ -3609,6 +3733,7 @@
20941         _rcu_barrier(&rcu_bh_state);
20942  }
20943  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
20944 +#endif
20945  
20946  /**
20947   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
20948 @@ -3741,8 +3866,6 @@
20949  {
20950         sync_sched_exp_online_cleanup(cpu);
20951         rcutree_affinity_setting(cpu, -1);
20952 -       if (IS_ENABLED(CONFIG_TREE_SRCU))
20953 -               srcu_online_cpu(cpu);
20954         return 0;
20955  }
20956  
20957 @@ -3753,8 +3876,6 @@
20958  int rcutree_offline_cpu(unsigned int cpu)
20959  {
20960         rcutree_affinity_setting(cpu, cpu);
20961 -       if (IS_ENABLED(CONFIG_TREE_SRCU))
20962 -               srcu_offline_cpu(cpu);
20963         return 0;
20964  }
20965  
20966 @@ -4184,12 +4305,13 @@
20967  
20968         rcu_bootup_announce();
20969         rcu_init_geometry();
20970 +#ifndef CONFIG_PREEMPT_RT_FULL
20971         rcu_init_one(&rcu_bh_state);
20972 +#endif
20973         rcu_init_one(&rcu_sched_state);
20974         if (dump_tree)
20975                 rcu_dump_rcu_node_tree(&rcu_sched_state);
20976         __rcu_init_preempt();
20977 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
20978  
20979         /*
20980          * We don't need protection against CPU-hotplug here because
20981 @@ -4200,8 +4322,6 @@
20982         for_each_online_cpu(cpu) {
20983                 rcutree_prepare_cpu(cpu);
20984                 rcu_cpu_starting(cpu);
20985 -               if (IS_ENABLED(CONFIG_TREE_SRCU))
20986 -                       srcu_online_cpu(cpu);
20987         }
20988  }
20989  
20990 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree.h linux-4.14/kernel/rcu/tree.h
20991 --- linux-4.14.orig/kernel/rcu/tree.h   2017-11-12 19:46:13.000000000 +0100
20992 +++ linux-4.14/kernel/rcu/tree.h        2018-09-05 11:05:07.000000000 +0200
20993 @@ -427,7 +427,9 @@
20994   */
20995  extern struct rcu_state rcu_sched_state;
20996  
20997 +#ifndef CONFIG_PREEMPT_RT_FULL
20998  extern struct rcu_state rcu_bh_state;
20999 +#endif
21000  
21001  #ifdef CONFIG_PREEMPT_RCU
21002  extern struct rcu_state rcu_preempt_state;
21003 @@ -436,12 +438,10 @@
21004  int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21005  bool rcu_eqs_special_set(int cpu);
21006  
21007 -#ifdef CONFIG_RCU_BOOST
21008  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21009  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21010  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21011  DECLARE_PER_CPU(char, rcu_cpu_has_work);
21012 -#endif /* #ifdef CONFIG_RCU_BOOST */
21013  
21014  #ifndef RCU_TREE_NONCORE
21015  
21016 @@ -461,10 +461,9 @@
21017  static void __init __rcu_init_preempt(void);
21018  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21019  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21020 -static void invoke_rcu_callbacks_kthread(void);
21021  static bool rcu_is_callbacks_kthread(void);
21022 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21023  #ifdef CONFIG_RCU_BOOST
21024 -static void rcu_preempt_do_callbacks(void);
21025  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21026                                                  struct rcu_node *rnp);
21027  #endif /* #ifdef CONFIG_RCU_BOOST */
21028 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/tree_plugin.h linux-4.14/kernel/rcu/tree_plugin.h
21029 --- linux-4.14.orig/kernel/rcu/tree_plugin.h    2018-09-05 11:03:22.000000000 +0200
21030 +++ linux-4.14/kernel/rcu/tree_plugin.h 2018-09-05 11:05:07.000000000 +0200
21031 @@ -24,39 +24,16 @@
21032   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21033   */
21034  
21035 -#include <linux/delay.h>
21036 -#include <linux/gfp.h>
21037 -#include <linux/oom.h>
21038 -#include <linux/sched/debug.h>
21039 -#include <linux/smpboot.h>
21040 -#include <uapi/linux/sched/types.h>
21041 -#include "../time/tick-internal.h"
21042 -
21043 -#ifdef CONFIG_RCU_BOOST
21044 -
21045  #include "../locking/rtmutex_common.h"
21046  
21047  /*
21048   * Control variables for per-CPU and per-rcu_node kthreads.  These
21049   * handle all flavors of RCU.
21050   */
21051 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21052  DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21053  DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21054  DEFINE_PER_CPU(char, rcu_cpu_has_work);
21055  
21056 -#else /* #ifdef CONFIG_RCU_BOOST */
21057 -
21058 -/*
21059 - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21060 - * all uses are in dead code.  Provide a definition to keep the compiler
21061 - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21062 - * This probably needs to be excluded from -rt builds.
21063 - */
21064 -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21065 -
21066 -#endif /* #else #ifdef CONFIG_RCU_BOOST */
21067 -
21068  #ifdef CONFIG_RCU_NOCB_CPU
21069  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21070  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
21071 @@ -324,9 +301,13 @@
21072         struct task_struct *t = current;
21073         struct rcu_data *rdp;
21074         struct rcu_node *rnp;
21075 +       int sleeping_l = 0;
21076  
21077         RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21078 -       WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21079 +#if defined(CONFIG_PREEMPT_RT_FULL)
21080 +       sleeping_l = t->sleeping_lock;
21081 +#endif
21082 +       WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21083         if (t->rcu_read_lock_nesting > 0 &&
21084             !t->rcu_read_unlock_special.b.blocked) {
21085  
21086 @@ -463,7 +444,7 @@
21087         }
21088  
21089         /* Hardware IRQ handlers cannot block, complain if they get here. */
21090 -       if (in_irq() || in_serving_softirq()) {
21091 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21092                 lockdep_rcu_suspicious(__FILE__, __LINE__,
21093                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21094                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21095 @@ -530,7 +511,7 @@
21096  
21097                 /* Unboost if we were boosted. */
21098                 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21099 -                       rt_mutex_unlock(&rnp->boost_mtx);
21100 +                       rt_mutex_futex_unlock(&rnp->boost_mtx);
21101  
21102                 /*
21103                  * If this was the last task on the expedited lists,
21104 @@ -684,15 +665,6 @@
21105                 t->rcu_read_unlock_special.b.need_qs = true;
21106  }
21107  
21108 -#ifdef CONFIG_RCU_BOOST
21109 -
21110 -static void rcu_preempt_do_callbacks(void)
21111 -{
21112 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21113 -}
21114 -
21115 -#endif /* #ifdef CONFIG_RCU_BOOST */
21116 -
21117  /**
21118   * call_rcu() - Queue an RCU callback for invocation after a grace period.
21119   * @head: structure to be used for queueing the RCU updates.
21120 @@ -915,20 +887,23 @@
21121  
21122  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21123  
21124 +/*
21125 + * If boosting, set rcuc kthreads to realtime priority.
21126 + */
21127 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21128 +{
21129  #ifdef CONFIG_RCU_BOOST
21130 +       struct sched_param sp;
21131  
21132 -#include "../locking/rtmutex_common.h"
21133 -
21134 -static void rcu_wake_cond(struct task_struct *t, int status)
21135 -{
21136 -       /*
21137 -        * If the thread is yielding, only wake it when this
21138 -        * is invoked from idle
21139 -        */
21140 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21141 -               wake_up_process(t);
21142 +       sp.sched_priority = kthread_prio;
21143 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21144 +#endif /* #ifdef CONFIG_RCU_BOOST */
21145  }
21146  
21147 +#ifdef CONFIG_RCU_BOOST
21148 +
21149 +#include "../locking/rtmutex_common.h"
21150 +
21151  /*
21152   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21153   * or ->boost_tasks, advancing the pointer to the next task in the
21154 @@ -1071,23 +1046,6 @@
21155  }
21156  
21157  /*
21158 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21159 - */
21160 -static void invoke_rcu_callbacks_kthread(void)
21161 -{
21162 -       unsigned long flags;
21163 -
21164 -       local_irq_save(flags);
21165 -       __this_cpu_write(rcu_cpu_has_work, 1);
21166 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21167 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
21168 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21169 -                             __this_cpu_read(rcu_cpu_kthread_status));
21170 -       }
21171 -       local_irq_restore(flags);
21172 -}
21173 -
21174 -/*
21175   * Is the current CPU running the RCU-callbacks kthread?
21176   * Caller must have preemption disabled.
21177   */
21178 @@ -1141,67 +1099,6 @@
21179         return 0;
21180  }
21181  
21182 -static void rcu_kthread_do_work(void)
21183 -{
21184 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21185 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21186 -       rcu_preempt_do_callbacks();
21187 -}
21188 -
21189 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21190 -{
21191 -       struct sched_param sp;
21192 -
21193 -       sp.sched_priority = kthread_prio;
21194 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21195 -}
21196 -
21197 -static void rcu_cpu_kthread_park(unsigned int cpu)
21198 -{
21199 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21200 -}
21201 -
21202 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21203 -{
21204 -       return __this_cpu_read(rcu_cpu_has_work);
21205 -}
21206 -
21207 -/*
21208 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21209 - * RCU softirq used in flavors and configurations of RCU that do not
21210 - * support RCU priority boosting.
21211 - */
21212 -static void rcu_cpu_kthread(unsigned int cpu)
21213 -{
21214 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21215 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21216 -       int spincnt;
21217 -
21218 -       for (spincnt = 0; spincnt < 10; spincnt++) {
21219 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21220 -               local_bh_disable();
21221 -               *statusp = RCU_KTHREAD_RUNNING;
21222 -               this_cpu_inc(rcu_cpu_kthread_loops);
21223 -               local_irq_disable();
21224 -               work = *workp;
21225 -               *workp = 0;
21226 -               local_irq_enable();
21227 -               if (work)
21228 -                       rcu_kthread_do_work();
21229 -               local_bh_enable();
21230 -               if (*workp == 0) {
21231 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21232 -                       *statusp = RCU_KTHREAD_WAITING;
21233 -                       return;
21234 -               }
21235 -       }
21236 -       *statusp = RCU_KTHREAD_YIELDING;
21237 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21238 -       schedule_timeout_interruptible(2);
21239 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21240 -       *statusp = RCU_KTHREAD_WAITING;
21241 -}
21242 -
21243  /*
21244   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21245   * served by the rcu_node in question.  The CPU hotplug lock is still
21246 @@ -1232,26 +1129,12 @@
21247         free_cpumask_var(cm);
21248  }
21249  
21250 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21251 -       .store                  = &rcu_cpu_kthread_task,
21252 -       .thread_should_run      = rcu_cpu_kthread_should_run,
21253 -       .thread_fn              = rcu_cpu_kthread,
21254 -       .thread_comm            = "rcuc/%u",
21255 -       .setup                  = rcu_cpu_kthread_setup,
21256 -       .park                   = rcu_cpu_kthread_park,
21257 -};
21258 -
21259  /*
21260   * Spawn boost kthreads -- called as soon as the scheduler is running.
21261   */
21262  static void __init rcu_spawn_boost_kthreads(void)
21263  {
21264         struct rcu_node *rnp;
21265 -       int cpu;
21266 -
21267 -       for_each_possible_cpu(cpu)
21268 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
21269 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21270         rcu_for_each_leaf_node(rcu_state_p, rnp)
21271                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21272  }
21273 @@ -1274,11 +1157,6 @@
21274         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21275  }
21276  
21277 -static void invoke_rcu_callbacks_kthread(void)
21278 -{
21279 -       WARN_ON_ONCE(1);
21280 -}
21281 -
21282  static bool rcu_is_callbacks_kthread(void)
21283  {
21284         return false;
21285 @@ -1302,7 +1180,7 @@
21286  
21287  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21288  
21289 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21290 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21291  
21292  /*
21293   * Check to see if any future RCU-related work will need to be done
21294 @@ -1318,7 +1196,9 @@
21295         *nextevt = KTIME_MAX;
21296         return rcu_cpu_has_callbacks(NULL);
21297  }
21298 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21299  
21300 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21301  /*
21302   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21303   * after it.
21304 @@ -1414,6 +1294,8 @@
21305         return cbs_ready;
21306  }
21307  
21308 +#ifndef CONFIG_PREEMPT_RT_FULL
21309 +
21310  /*
21311   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21312   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
21313 @@ -1456,6 +1338,7 @@
21314         *nextevt = basemono + dj * TICK_NSEC;
21315         return 0;
21316  }
21317 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21318  
21319  /*
21320   * Prepare a CPU for idle from an RCU perspective.  The first major task
21321 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/rcu/update.c linux-4.14/kernel/rcu/update.c
21322 --- linux-4.14.orig/kernel/rcu/update.c 2018-09-05 11:03:22.000000000 +0200
21323 +++ linux-4.14/kernel/rcu/update.c      2018-09-05 11:05:07.000000000 +0200
21324 @@ -66,7 +66,7 @@
21325  module_param(rcu_expedited, int, 0);
21326  extern int rcu_normal; /* from sysctl */
21327  module_param(rcu_normal, int, 0);
21328 -static int rcu_normal_after_boot;
21329 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21330  module_param(rcu_normal_after_boot, int, 0);
21331  #endif /* #ifndef CONFIG_TINY_RCU */
21332  
21333 @@ -333,6 +333,7 @@
21334  }
21335  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21336  
21337 +#ifndef CONFIG_PREEMPT_RT_FULL
21338  /**
21339   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21340   *
21341 @@ -359,6 +360,7 @@
21342         return in_softirq() || irqs_disabled();
21343  }
21344  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21345 +#endif
21346  
21347  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21348  
21349 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/completion.c linux-4.14/kernel/sched/completion.c
21350 --- linux-4.14.orig/kernel/sched/completion.c   2017-11-12 19:46:13.000000000 +0100
21351 +++ linux-4.14/kernel/sched/completion.c        2018-09-05 11:05:07.000000000 +0200
21352 @@ -32,7 +32,7 @@
21353  {
21354         unsigned long flags;
21355  
21356 -       spin_lock_irqsave(&x->wait.lock, flags);
21357 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21358  
21359         /*
21360          * Perform commit of crossrelease here.
21361 @@ -41,8 +41,8 @@
21362  
21363         if (x->done != UINT_MAX)
21364                 x->done++;
21365 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21366 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21367 +       swake_up_locked(&x->wait);
21368 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21369  }
21370  EXPORT_SYMBOL(complete);
21371  
21372 @@ -66,10 +66,10 @@
21373  {
21374         unsigned long flags;
21375  
21376 -       spin_lock_irqsave(&x->wait.lock, flags);
21377 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21378         x->done = UINT_MAX;
21379 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21380 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21381 +       swake_up_all_locked(&x->wait);
21382 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21383  }
21384  EXPORT_SYMBOL(complete_all);
21385  
21386 @@ -78,20 +78,20 @@
21387                    long (*action)(long), long timeout, int state)
21388  {
21389         if (!x->done) {
21390 -               DECLARE_WAITQUEUE(wait, current);
21391 +               DECLARE_SWAITQUEUE(wait);
21392  
21393 -               __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
21394 +               __prepare_to_swait(&x->wait, &wait);
21395                 do {
21396                         if (signal_pending_state(state, current)) {
21397                                 timeout = -ERESTARTSYS;
21398                                 break;
21399                         }
21400                         __set_current_state(state);
21401 -                       spin_unlock_irq(&x->wait.lock);
21402 +                       raw_spin_unlock_irq(&x->wait.lock);
21403                         timeout = action(timeout);
21404 -                       spin_lock_irq(&x->wait.lock);
21405 +                       raw_spin_lock_irq(&x->wait.lock);
21406                 } while (!x->done && timeout);
21407 -               __remove_wait_queue(&x->wait, &wait);
21408 +               __finish_swait(&x->wait, &wait);
21409                 if (!x->done)
21410                         return timeout;
21411         }
21412 @@ -108,9 +108,9 @@
21413  
21414         complete_acquire(x);
21415  
21416 -       spin_lock_irq(&x->wait.lock);
21417 +       raw_spin_lock_irq(&x->wait.lock);
21418         timeout = do_wait_for_common(x, action, timeout, state);
21419 -       spin_unlock_irq(&x->wait.lock);
21420 +       raw_spin_unlock_irq(&x->wait.lock);
21421  
21422         complete_release(x);
21423  
21424 @@ -299,12 +299,12 @@
21425         if (!READ_ONCE(x->done))
21426                 return 0;
21427  
21428 -       spin_lock_irqsave(&x->wait.lock, flags);
21429 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21430         if (!x->done)
21431                 ret = 0;
21432         else if (x->done != UINT_MAX)
21433                 x->done--;
21434 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21435 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21436         return ret;
21437  }
21438  EXPORT_SYMBOL(try_wait_for_completion);
21439 @@ -330,8 +330,8 @@
21440          * otherwise we can end up freeing the completion before complete()
21441          * is done referencing it.
21442          */
21443 -       spin_lock_irqsave(&x->wait.lock, flags);
21444 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21445 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21446 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21447         return true;
21448  }
21449  EXPORT_SYMBOL(completion_done);
21450 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/core.c linux-4.14/kernel/sched/core.c
21451 --- linux-4.14.orig/kernel/sched/core.c 2018-09-05 11:03:22.000000000 +0200
21452 +++ linux-4.14/kernel/sched/core.c      2018-09-05 11:05:07.000000000 +0200
21453 @@ -59,7 +59,11 @@
21454   * Number of tasks to iterate in a single balance run.
21455   * Limited because this is done with IRQs disabled.
21456   */
21457 +#ifndef CONFIG_PREEMPT_RT_FULL
21458  const_debug unsigned int sysctl_sched_nr_migrate = 32;
21459 +#else
21460 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
21461 +#endif
21462  
21463  /*
21464   * period over which we average the RT time consumption, measured
21465 @@ -341,7 +345,7 @@
21466         rq->hrtick_csd.info = rq;
21467  #endif
21468  
21469 -       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21470 +       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
21471         rq->hrtick_timer.function = hrtick;
21472  }
21473  #else  /* CONFIG_SCHED_HRTICK */
21474 @@ -423,9 +427,15 @@
21475  #endif
21476  #endif
21477  
21478 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21479 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
21480 +                 bool sleeper)
21481  {
21482 -       struct wake_q_node *node = &task->wake_q;
21483 +       struct wake_q_node *node;
21484 +
21485 +       if (sleeper)
21486 +               node = &task->wake_q_sleeper;
21487 +       else
21488 +               node = &task->wake_q;
21489  
21490         /*
21491          * Atomically grab the task, if ->wake_q is !nil already it means
21492 @@ -447,24 +457,32 @@
21493         head->lastp = &node->next;
21494  }
21495  
21496 -void wake_up_q(struct wake_q_head *head)
21497 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
21498  {
21499         struct wake_q_node *node = head->first;
21500  
21501         while (node != WAKE_Q_TAIL) {
21502                 struct task_struct *task;
21503  
21504 -               task = container_of(node, struct task_struct, wake_q);
21505 +               if (sleeper)
21506 +                       task = container_of(node, struct task_struct, wake_q_sleeper);
21507 +               else
21508 +                       task = container_of(node, struct task_struct, wake_q);
21509                 BUG_ON(!task);
21510                 /* Task can safely be re-inserted now: */
21511                 node = node->next;
21512 -               task->wake_q.next = NULL;
21513 -
21514 +               if (sleeper)
21515 +                       task->wake_q_sleeper.next = NULL;
21516 +               else
21517 +                       task->wake_q.next = NULL;
21518                 /*
21519                  * wake_up_process() implies a wmb() to pair with the queueing
21520                  * in wake_q_add() so as not to miss wakeups.
21521                  */
21522 -               wake_up_process(task);
21523 +               if (sleeper)
21524 +                       wake_up_lock_sleeper(task);
21525 +               else
21526 +                       wake_up_process(task);
21527                 put_task_struct(task);
21528         }
21529  }
21530 @@ -500,6 +518,48 @@
21531                 trace_sched_wake_idle_without_ipi(cpu);
21532  }
21533  
21534 +#ifdef CONFIG_PREEMPT_LAZY
21535 +
21536 +static int tsk_is_polling(struct task_struct *p)
21537 +{
21538 +#ifdef TIF_POLLING_NRFLAG
21539 +       return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
21540 +#else
21541 +       return 0;
21542 +#endif
21543 +}
21544 +
21545 +void resched_curr_lazy(struct rq *rq)
21546 +{
21547 +       struct task_struct *curr = rq->curr;
21548 +       int cpu;
21549 +
21550 +       if (!sched_feat(PREEMPT_LAZY)) {
21551 +               resched_curr(rq);
21552 +               return;
21553 +       }
21554 +
21555 +       lockdep_assert_held(&rq->lock);
21556 +
21557 +       if (test_tsk_need_resched(curr))
21558 +               return;
21559 +
21560 +       if (test_tsk_need_resched_lazy(curr))
21561 +               return;
21562 +
21563 +       set_tsk_need_resched_lazy(curr);
21564 +
21565 +       cpu = cpu_of(rq);
21566 +       if (cpu == smp_processor_id())
21567 +               return;
21568 +
21569 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
21570 +       smp_mb();
21571 +       if (!tsk_is_polling(curr))
21572 +               smp_send_reschedule(cpu);
21573 +}
21574 +#endif
21575 +
21576  void resched_cpu(int cpu)
21577  {
21578         struct rq *rq = cpu_rq(cpu);
21579 @@ -523,11 +583,14 @@
21580   */
21581  int get_nohz_timer_target(void)
21582  {
21583 -       int i, cpu = smp_processor_id();
21584 +       int i, cpu;
21585         struct sched_domain *sd;
21586  
21587 +       preempt_disable_rt();
21588 +       cpu = smp_processor_id();
21589 +
21590         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
21591 -               return cpu;
21592 +               goto preempt_en_rt;
21593  
21594         rcu_read_lock();
21595         for_each_domain(cpu, sd) {
21596 @@ -546,6 +609,8 @@
21597                 cpu = housekeeping_any_cpu();
21598  unlock:
21599         rcu_read_unlock();
21600 +preempt_en_rt:
21601 +       preempt_enable_rt();
21602         return cpu;
21603  }
21604  
21605 @@ -912,7 +977,7 @@
21606   */
21607  static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
21608  {
21609 -       if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
21610 +       if (!cpumask_test_cpu(cpu, p->cpus_ptr))
21611                 return false;
21612  
21613         if (is_per_cpu_kthread(p))
21614 @@ -1007,7 +1072,7 @@
21615         local_irq_disable();
21616         /*
21617          * We need to explicitly wake pending tasks before running
21618 -        * __migrate_task() such that we will not miss enforcing cpus_allowed
21619 +        * __migrate_task() such that we will not miss enforcing cpus_ptr
21620          * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
21621          */
21622         sched_ttwu_pending();
21623 @@ -1038,11 +1103,19 @@
21624   */
21625  void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
21626  {
21627 -       cpumask_copy(&p->cpus_allowed, new_mask);
21628 +       cpumask_copy(&p->cpus_mask, new_mask);
21629         p->nr_cpus_allowed = cpumask_weight(new_mask);
21630  }
21631  
21632 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21633 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21634 +int __migrate_disabled(struct task_struct *p)
21635 +{
21636 +       return p->migrate_disable;
21637 +}
21638 +#endif
21639 +
21640 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
21641 +                                      const struct cpumask *new_mask)
21642  {
21643         struct rq *rq = task_rq(p);
21644         bool queued, running;
21645 @@ -1071,6 +1144,20 @@
21646                 set_curr_task(rq, p);
21647  }
21648  
21649 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21650 +{
21651 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21652 +       if (__migrate_disabled(p)) {
21653 +               lockdep_assert_held(&p->pi_lock);
21654 +
21655 +               cpumask_copy(&p->cpus_mask, new_mask);
21656 +               p->migrate_disable_update = 1;
21657 +               return;
21658 +       }
21659 +#endif
21660 +       __do_set_cpus_allowed_tail(p, new_mask);
21661 +}
21662 +
21663  /*
21664   * Change a given task's CPU affinity. Migrate the thread to a
21665   * proper CPU and schedule it away if the CPU it's executing on
21666 @@ -1108,7 +1195,7 @@
21667                 goto out;
21668         }
21669  
21670 -       if (cpumask_equal(&p->cpus_allowed, new_mask))
21671 +       if (cpumask_equal(p->cpus_ptr, new_mask))
21672                 goto out;
21673  
21674         if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
21675 @@ -1129,9 +1216,16 @@
21676         }
21677  
21678         /* Can the task run on the task's current CPU? If so, we're done */
21679 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
21680 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
21681                 goto out;
21682  
21683 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
21684 +       if (__migrate_disabled(p)) {
21685 +               p->migrate_disable_update = 1;
21686 +               goto out;
21687 +       }
21688 +#endif
21689 +
21690         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
21691         if (task_running(rq, p) || p->state == TASK_WAKING) {
21692                 struct migration_arg arg = { p, dest_cpu };
21693 @@ -1269,10 +1363,10 @@
21694         if (task_cpu(arg->src_task) != arg->src_cpu)
21695                 goto unlock;
21696  
21697 -       if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
21698 +       if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
21699                 goto unlock;
21700  
21701 -       if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
21702 +       if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
21703                 goto unlock;
21704  
21705         __migrate_swap_task(arg->src_task, arg->dst_cpu);
21706 @@ -1313,10 +1407,10 @@
21707         if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
21708                 goto out;
21709  
21710 -       if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
21711 +       if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
21712                 goto out;
21713  
21714 -       if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
21715 +       if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
21716                 goto out;
21717  
21718         trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
21719 @@ -1326,6 +1420,18 @@
21720         return ret;
21721  }
21722  
21723 +static bool check_task_state(struct task_struct *p, long match_state)
21724 +{
21725 +       bool match = false;
21726 +
21727 +       raw_spin_lock_irq(&p->pi_lock);
21728 +       if (p->state == match_state || p->saved_state == match_state)
21729 +               match = true;
21730 +       raw_spin_unlock_irq(&p->pi_lock);
21731 +
21732 +       return match;
21733 +}
21734 +
21735  /*
21736   * wait_task_inactive - wait for a thread to unschedule.
21737   *
21738 @@ -1370,7 +1476,7 @@
21739                  * is actually now running somewhere else!
21740                  */
21741                 while (task_running(rq, p)) {
21742 -                       if (match_state && unlikely(p->state != match_state))
21743 +                       if (match_state && !check_task_state(p, match_state))
21744                                 return 0;
21745                         cpu_relax();
21746                 }
21747 @@ -1385,7 +1491,8 @@
21748                 running = task_running(rq, p);
21749                 queued = task_on_rq_queued(p);
21750                 ncsw = 0;
21751 -               if (!match_state || p->state == match_state)
21752 +               if (!match_state || p->state == match_state ||
21753 +                   p->saved_state == match_state)
21754                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
21755                 task_rq_unlock(rq, p, &rf);
21756  
21757 @@ -1460,7 +1567,7 @@
21758  EXPORT_SYMBOL_GPL(kick_process);
21759  
21760  /*
21761 - * ->cpus_allowed is protected by both rq->lock and p->pi_lock
21762 + * ->cpus_ptr is protected by both rq->lock and p->pi_lock
21763   *
21764   * A few notes on cpu_active vs cpu_online:
21765   *
21766 @@ -1500,14 +1607,14 @@
21767                 for_each_cpu(dest_cpu, nodemask) {
21768                         if (!cpu_active(dest_cpu))
21769                                 continue;
21770 -                       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
21771 +                       if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
21772                                 return dest_cpu;
21773                 }
21774         }
21775  
21776         for (;;) {
21777                 /* Any allowed, online CPU? */
21778 -               for_each_cpu(dest_cpu, &p->cpus_allowed) {
21779 +               for_each_cpu(dest_cpu, p->cpus_ptr) {
21780                         if (!is_cpu_allowed(p, dest_cpu))
21781                                 continue;
21782  
21783 @@ -1551,7 +1658,7 @@
21784  }
21785  
21786  /*
21787 - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
21788 + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21789   */
21790  static inline
21791  int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21792 @@ -1561,11 +1668,11 @@
21793         if (p->nr_cpus_allowed > 1)
21794                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21795         else
21796 -               cpu = cpumask_any(&p->cpus_allowed);
21797 +               cpu = cpumask_any(p->cpus_ptr);
21798  
21799         /*
21800          * In order not to call set_task_cpu() on a blocking task we need
21801 -        * to rely on ttwu() to place the task on a valid ->cpus_allowed
21802 +        * to rely on ttwu() to place the task on a valid ->cpus_ptr
21803          * CPU.
21804          *
21805          * Since this is common to all placement strategies, this lives here.
21806 @@ -1668,10 +1775,6 @@
21807  {
21808         activate_task(rq, p, en_flags);
21809         p->on_rq = TASK_ON_RQ_QUEUED;
21810 -
21811 -       /* If a worker is waking up, notify the workqueue: */
21812 -       if (p->flags & PF_WQ_WORKER)
21813 -               wq_worker_waking_up(p, cpu_of(rq));
21814  }
21815  
21816  /*
21817 @@ -1995,8 +2098,27 @@
21818          */
21819         raw_spin_lock_irqsave(&p->pi_lock, flags);
21820         smp_mb__after_spinlock();
21821 -       if (!(p->state & state))
21822 +       if (!(p->state & state)) {
21823 +               /*
21824 +                * The task might be running due to a spinlock sleeper
21825 +                * wakeup. Check the saved state and set it to running
21826 +                * if the wakeup condition is true.
21827 +                */
21828 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
21829 +                       if (p->saved_state & state) {
21830 +                               p->saved_state = TASK_RUNNING;
21831 +                               success = 1;
21832 +                       }
21833 +               }
21834                 goto out;
21835 +       }
21836 +
21837 +       /*
21838 +        * If this is a regular wakeup, then we can unconditionally
21839 +        * clear the saved state of a "lock sleeper".
21840 +        */
21841 +       if (!(wake_flags & WF_LOCK_SLEEPER))
21842 +               p->saved_state = TASK_RUNNING;
21843  
21844         trace_sched_waking(p);
21845  
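
The hunk above is the heart of the RT "sleeping spinlock" wakeup handling: a task that blocks on a converted spinlock parks its original sleep state in p->saved_state, a regular wakeup must be able to satisfy that parked state, and a WF_LOCK_SLEEPER wakeup must only act on the primary state. The following is a minimal user-space model of that two-field idea (illustrative only; struct task_model, model_wake() and the pthread mutex standing in for p->pi_lock are not kernel code):

#include <pthread.h>
#include <stdio.h>

#define TASK_RUNNING            0x0
#define TASK_INTERRUPTIBLE      0x1
#define TASK_UNINTERRUPTIBLE    0x2
#define WF_LOCK_SLEEPER         0x8     /* same bit the patch adds to sched.h */

struct task_model {
        pthread_mutex_t pi_lock;        /* stand-in for p->pi_lock */
        unsigned int state;             /* what the task currently sleeps in */
        unsigned int saved_state;       /* state parked by a sleeping lock */
};

/* Mirrors the state checks added to try_to_wake_up() above. */
static int model_wake(struct task_model *p, unsigned int state, int wake_flags)
{
        int success = 0;

        pthread_mutex_lock(&p->pi_lock);
        if (!(p->state & state)) {
                /* A regular wakeup may still satisfy the parked state. */
                if (!(wake_flags & WF_LOCK_SLEEPER) && (p->saved_state & state)) {
                        p->saved_state = TASK_RUNNING;
                        success = 1;
                }
        } else {
                /* A regular wakeup also clears any parked "lock sleeper" state. */
                if (!(wake_flags & WF_LOCK_SLEEPER))
                        p->saved_state = TASK_RUNNING;
                p->state = TASK_RUNNING;  /* simplified: the real path enqueues the task */
                success = 1;
        }
        pthread_mutex_unlock(&p->pi_lock);
        return success;
}

int main(void)
{
        struct task_model t = {
                .pi_lock = PTHREAD_MUTEX_INITIALIZER,
                .state = TASK_UNINTERRUPTIBLE,          /* blocked on a sleeping lock */
                .saved_state = TASK_INTERRUPTIBLE,      /* what the task originally slept in */
        };

        /* A signal-style wakeup reaches the parked state without disturbing the lock wait. */
        printf("regular wakeup: %d\n", model_wake(&t, TASK_INTERRUPTIBLE, 0));
        /* The lock's own wakeup only looks at the primary state. */
        printf("lock wakeup:    %d\n", model_wake(&t, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER));
        return 0;
}
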
21846 @@ -2093,56 +2215,6 @@
21847  }
21848  
21849  /**
21850 - * try_to_wake_up_local - try to wake up a local task with rq lock held
21851 - * @p: the thread to be awakened
21852 - * @rf: request-queue flags for pinning
21853 - *
21854 - * Put @p on the run-queue if it's not already there. The caller must
21855 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
21856 - * the current task.
21857 - */
21858 -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
21859 -{
21860 -       struct rq *rq = task_rq(p);
21861 -
21862 -       if (WARN_ON_ONCE(rq != this_rq()) ||
21863 -           WARN_ON_ONCE(p == current))
21864 -               return;
21865 -
21866 -       lockdep_assert_held(&rq->lock);
21867 -
21868 -       if (!raw_spin_trylock(&p->pi_lock)) {
21869 -               /*
21870 -                * This is OK, because current is on_cpu, which avoids it being
21871 -                * picked for load-balance and preemption/IRQs are still
21872 -                * disabled avoiding further scheduler activity on it and we've
21873 -                * not yet picked a replacement task.
21874 -                */
21875 -               rq_unlock(rq, rf);
21876 -               raw_spin_lock(&p->pi_lock);
21877 -               rq_relock(rq, rf);
21878 -       }
21879 -
21880 -       if (!(p->state & TASK_NORMAL))
21881 -               goto out;
21882 -
21883 -       trace_sched_waking(p);
21884 -
21885 -       if (!task_on_rq_queued(p)) {
21886 -               if (p->in_iowait) {
21887 -                       delayacct_blkio_end(p);
21888 -                       atomic_dec(&rq->nr_iowait);
21889 -               }
21890 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
21891 -       }
21892 -
21893 -       ttwu_do_wakeup(rq, p, 0, rf);
21894 -       ttwu_stat(p, smp_processor_id(), 0);
21895 -out:
21896 -       raw_spin_unlock(&p->pi_lock);
21897 -}
21898 -
21899 -/**
21900   * wake_up_process - Wake up a specific process
21901   * @p: The process to be woken up.
21902   *
21903 @@ -2160,6 +2232,18 @@
21904  }
21905  EXPORT_SYMBOL(wake_up_process);
21906  
21907 +/**
21908 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
21909 + * @p: The process to be woken up.
21910 + *
21911 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
21912 + * the nature of the wakeup.
21913 + */
21914 +int wake_up_lock_sleeper(struct task_struct *p)
21915 +{
21916 +       return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
21917 +}
21918 +
21919  int wake_up_state(struct task_struct *p, unsigned int state)
21920  {
21921         return try_to_wake_up(p, state, 0);
21922 @@ -2420,6 +2504,9 @@
21923         p->on_cpu = 0;
21924  #endif
21925         init_task_preempt_count(p);
21926 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
21927 +       task_thread_info(p)->preempt_lazy_count = 0;
21928 +#endif
21929  #ifdef CONFIG_SMP
21930         plist_node_init(&p->pushable_tasks, MAX_PRIO);
21931         RB_CLEAR_NODE(&p->pushable_dl_tasks);
21932 @@ -2462,7 +2549,7 @@
21933  #ifdef CONFIG_SMP
21934         /*
21935          * Fork balancing, do it here and not earlier because:
21936 -        *  - cpus_allowed can change in the fork path
21937 +        *  - cpus_ptr can change in the fork path
21938          *  - any previously selected CPU might disappear through hotplug
21939          *
21940          * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
21941 @@ -2675,21 +2762,16 @@
21942         finish_arch_post_lock_switch();
21943  
21944         fire_sched_in_preempt_notifiers(current);
21945 +       /*
21946 +        * We use mmdrop_delayed() here so we don't have to do the
21947 +        * full __mmdrop() when we are the last user.
21948 +        */
21949         if (mm)
21950 -               mmdrop(mm);
21951 +               mmdrop_delayed(mm);
21952         if (unlikely(prev_state == TASK_DEAD)) {
21953                 if (prev->sched_class->task_dead)
21954                         prev->sched_class->task_dead(prev);
21955  
21956 -               /*
21957 -                * Remove function-return probe instances associated with this
21958 -                * task and put them back on the free list.
21959 -                */
21960 -               kprobe_flush_task(prev);
21961 -
21962 -               /* Task is done with its stack. */
21963 -               put_task_stack(prev);
21964 -
21965                 put_task_struct(prev);
21966         }
21967  
21968 @@ -3336,25 +3418,13 @@
21969                                 atomic_inc(&rq->nr_iowait);
21970                                 delayacct_blkio_start();
21971                         }
21972 -
21973 -                       /*
21974 -                        * If a worker went to sleep, notify and ask workqueue
21975 -                        * whether it wants to wake up a task to maintain
21976 -                        * concurrency.
21977 -                        */
21978 -                       if (prev->flags & PF_WQ_WORKER) {
21979 -                               struct task_struct *to_wakeup;
21980 -
21981 -                               to_wakeup = wq_worker_sleeping(prev);
21982 -                               if (to_wakeup)
21983 -                                       try_to_wake_up_local(to_wakeup, &rf);
21984 -                       }
21985                 }
21986                 switch_count = &prev->nvcsw;
21987         }
21988  
21989         next = pick_next_task(rq, prev, &rf);
21990         clear_tsk_need_resched(prev);
21991 +       clear_tsk_need_resched_lazy(prev);
21992         clear_preempt_need_resched();
21993  
21994         if (likely(prev != next)) {
21995 @@ -3407,8 +3477,19 @@
21996  
21997  static inline void sched_submit_work(struct task_struct *tsk)
21998  {
21999 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
22000 +       if (!tsk->state)
22001 +               return;
22002 +       /*
22003 +        * If a worker went to sleep, notify and ask workqueue whether
22004 +        * it wants to wake up a task to maintain concurrency.
22005 +        */
22006 +       if (tsk->flags & PF_WQ_WORKER)
22007 +               wq_worker_sleeping(tsk);
22008 +
22009 +
22010 +       if (tsk_is_pi_blocked(tsk))
22011                 return;
22012 +
22013         /*
22014          * If we are going to sleep and we have plugged IO queued,
22015          * make sure to submit it to avoid deadlocks.
22016 @@ -3417,6 +3498,12 @@
22017                 blk_schedule_flush_plug(tsk);
22018  }
22019  
22020 +static void sched_update_worker(struct task_struct *tsk)
22021 +{
22022 +       if (tsk->flags & PF_WQ_WORKER)
22023 +               wq_worker_running(tsk);
22024 +}
22025 +
22026  asmlinkage __visible void __sched schedule(void)
22027  {
22028         struct task_struct *tsk = current;
22029 @@ -3427,6 +3514,7 @@
22030                 __schedule(false);
22031                 sched_preempt_enable_no_resched();
22032         } while (need_resched());
22033 +       sched_update_worker(tsk);
22034  }
22035  EXPORT_SYMBOL(schedule);
22036  
22037 @@ -3515,6 +3603,30 @@
22038         } while (need_resched());
22039  }
22040  
22041 +#ifdef CONFIG_PREEMPT_LAZY
22042 +/*
22043 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
22044 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
22045 + * the preempt_lazy_count counter is > 0.
22046 + */
22047 +static __always_inline int preemptible_lazy(void)
22048 +{
22049 +       if (test_thread_flag(TIF_NEED_RESCHED))
22050 +               return 1;
22051 +       if (current_thread_info()->preempt_lazy_count)
22052 +               return 0;
22053 +       return 1;
22054 +}
22055 +
22056 +#else
22057 +
22058 +static inline int preemptible_lazy(void)
22059 +{
22060 +       return 1;
22061 +}
22062 +
22063 +#endif
22064 +
22065  #ifdef CONFIG_PREEMPT
22066  /*
22067   * this is the entry point to schedule() from in-kernel preemption
22068 @@ -3529,7 +3641,8 @@
22069          */
22070         if (likely(!preemptible()))
22071                 return;
22072 -
22073 +       if (!preemptible_lazy())
22074 +               return;
22075         preempt_schedule_common();
22076  }
22077  NOKPROBE_SYMBOL(preempt_schedule);
22078 @@ -3556,6 +3669,9 @@
22079         if (likely(!preemptible()))
22080                 return;
22081  
22082 +       if (!preemptible_lazy())
22083 +               return;
22084 +
22085         do {
22086                 /*
22087                  * Because the function tracer can trace preempt_count_sub()
22088 @@ -3578,7 +3694,16 @@
22089                  * an infinite recursion.
22090                  */
22091                 prev_ctx = exception_enter();
22092 +               /*
22093 +                * The add/subtract must not be traced by the function
22094 +                * tracer. But we still want to account for the
22095 +                * preempt-off latency tracer. Since the _notrace versions
22096 +                * of add/subtract skip that accounting, we must force it
22097 +                * manually.
22098 +                */
22099 +               start_critical_timings();
22100                 __schedule(true);
22101 +               stop_critical_timings();
22102                 exception_exit(prev_ctx);
22103  
22104                 preempt_latency_stop(1);
22105 @@ -4164,7 +4289,7 @@
22106                          * the entire root_domain to become SCHED_DEADLINE. We
22107                          * will also fail if there's no bandwidth available.
22108                          */
22109 -                       if (!cpumask_subset(span, &p->cpus_allowed) ||
22110 +                       if (!cpumask_subset(span, p->cpus_ptr) ||
22111                             rq->rd->dl_bw.bw == 0) {
22112                                 task_rq_unlock(rq, p, &rf);
22113                                 return -EPERM;
22114 @@ -4758,7 +4883,7 @@
22115                 goto out_unlock;
22116  
22117         raw_spin_lock_irqsave(&p->pi_lock, flags);
22118 -       cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22119 +       cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22120         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22121  
22122  out_unlock:
22123 @@ -4877,6 +5002,7 @@
22124  }
22125  EXPORT_SYMBOL(__cond_resched_lock);
22126  
22127 +#ifndef CONFIG_PREEMPT_RT_FULL
22128  int __sched __cond_resched_softirq(void)
22129  {
22130         BUG_ON(!in_softirq());
22131 @@ -4890,6 +5016,7 @@
22132         return 0;
22133  }
22134  EXPORT_SYMBOL(__cond_resched_softirq);
22135 +#endif
22136  
22137  /**
22138   * yield - yield the current processor to other threads.
22139 @@ -5284,7 +5411,9 @@
22140  
22141         /* Set the preempt count _outside_ the spinlocks! */
22142         init_idle_preempt_count(idle, cpu);
22143 -
22144 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22145 +       task_thread_info(idle)->preempt_lazy_count = 0;
22146 +#endif
22147         /*
22148          * The idle tasks have their own, simple scheduling class:
22149          */
22150 @@ -5323,7 +5452,7 @@
22151          * allowed nodes is unnecessary.  Thus, cpusets are not
22152          * applicable for such threads.  This prevents checking for
22153          * success of set_cpus_allowed_ptr() on all attached tasks
22154 -        * before cpus_allowed may be changed.
22155 +        * before cpus_mask may be changed.
22156          */
22157         if (p->flags & PF_NO_SETAFFINITY) {
22158                 ret = -EINVAL;
22159 @@ -5350,7 +5479,7 @@
22160         if (curr_cpu == target_cpu)
22161                 return 0;
22162  
22163 -       if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22164 +       if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22165                 return -EINVAL;
22166  
22167         /* TODO: This is not properly updating schedstats */
22168 @@ -5389,6 +5518,8 @@
22169  #endif /* CONFIG_NUMA_BALANCING */
22170  
22171  #ifdef CONFIG_HOTPLUG_CPU
22172 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22173 +
22174  /*
22175   * Ensure that the idle task is using init_mm right before its CPU goes
22176   * offline.
22177 @@ -5403,7 +5534,12 @@
22178                 switch_mm(mm, &init_mm, current);
22179                 finish_arch_post_lock_switch();
22180         }
22181 -       mmdrop(mm);
22182 +       /*
22183 +        * Defer the cleanup to a live CPU. On RT we can neither
22184 +        * call mmdrop() nor mmdrop_delayed() from here.
22185 +        */
22186 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
22187 +
22188  }
22189  
22190  /*
22191 @@ -5487,7 +5623,7 @@
22192                 put_prev_task(rq, next);
22193  
22194                 /*
22195 -                * Rules for changing task_struct::cpus_allowed are holding
22196 +                * Rules for changing task_struct::cpus_mask are holding
22197                  * both pi_lock and rq->lock, such that holding either
22198                  * stabilizes the mask.
22199                  *
22200 @@ -5718,6 +5854,10 @@
22201         update_max_interval();
22202         nohz_balance_exit_idle(cpu);
22203         hrtick_clear(rq);
22204 +       if (per_cpu(idle_last_mm, cpu)) {
22205 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22206 +               per_cpu(idle_last_mm, cpu) = NULL;
22207 +       }
22208         return 0;
22209  }
22210  #endif
22211 @@ -5964,7 +6104,7 @@
22212  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22213  static inline int preempt_count_equals(int preempt_offset)
22214  {
22215 -       int nested = preempt_count() + rcu_preempt_depth();
22216 +       int nested = preempt_count() + sched_rcu_preempt_depth();
22217  
22218         return (nested == preempt_offset);
22219  }
22220 @@ -6756,3 +6896,197 @@
22221   /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
22222   /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22223  };
22224 +
22225 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22226 +
22227 +static inline void
22228 +update_nr_migratory(struct task_struct *p, long delta)
22229 +{
22230 +       if (unlikely((p->sched_class == &rt_sched_class ||
22231 +                     p->sched_class == &dl_sched_class) &&
22232 +                     p->nr_cpus_allowed > 1)) {
22233 +               if (p->sched_class == &rt_sched_class)
22234 +                       task_rq(p)->rt.rt_nr_migratory += delta;
22235 +               else
22236 +                       task_rq(p)->dl.dl_nr_migratory += delta;
22237 +       }
22238 +}
22239 +
22240 +static inline void
22241 +migrate_disable_update_cpus_allowed(struct task_struct *p)
22242 +{
22243 +       struct rq *rq;
22244 +       struct rq_flags rf;
22245 +
22246 +       p->cpus_ptr = cpumask_of(smp_processor_id());
22247 +
22248 +       rq = task_rq_lock(p, &rf);
22249 +       update_nr_migratory(p, -1);
22250 +       p->nr_cpus_allowed = 1;
22251 +       task_rq_unlock(rq, p, &rf);
22252 +}
22253 +
22254 +static inline void
22255 +migrate_enable_update_cpus_allowed(struct task_struct *p)
22256 +{
22257 +       struct rq *rq;
22258 +       struct rq_flags rf;
22259 +
22260 +       p->cpus_ptr = &p->cpus_mask;
22261 +
22262 +       rq = task_rq_lock(p, &rf);
22263 +       p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22264 +       update_nr_migratory(p, 1);
22265 +       task_rq_unlock(rq, p, &rf);
22266 +}
22267 +
22268 +void migrate_disable(void)
22269 +{
22270 +       struct task_struct *p = current;
22271 +
22272 +       if (in_atomic() || irqs_disabled()) {
22273 +#ifdef CONFIG_SCHED_DEBUG
22274 +               p->migrate_disable_atomic++;
22275 +#endif
22276 +               return;
22277 +       }
22278 +#ifdef CONFIG_SCHED_DEBUG
22279 +       if (unlikely(p->migrate_disable_atomic)) {
22280 +               tracing_off();
22281 +               WARN_ON_ONCE(1);
22282 +       }
22283 +#endif
22284 +
22285 +       if (p->migrate_disable) {
22286 +               p->migrate_disable++;
22287 +               return;
22288 +       }
22289 +
22290 +       preempt_disable();
22291 +       preempt_lazy_disable();
22292 +       pin_current_cpu();
22293 +
22294 +       migrate_disable_update_cpus_allowed(p);
22295 +       p->migrate_disable = 1;
22296 +
22297 +       preempt_enable();
22298 +}
22299 +EXPORT_SYMBOL(migrate_disable);
22300 +
22301 +void migrate_enable(void)
22302 +{
22303 +       struct task_struct *p = current;
22304 +
22305 +       if (in_atomic() || irqs_disabled()) {
22306 +#ifdef CONFIG_SCHED_DEBUG
22307 +               p->migrate_disable_atomic--;
22308 +#endif
22309 +               return;
22310 +       }
22311 +
22312 +#ifdef CONFIG_SCHED_DEBUG
22313 +       if (unlikely(p->migrate_disable_atomic)) {
22314 +               tracing_off();
22315 +               WARN_ON_ONCE(1);
22316 +       }
22317 +#endif
22318 +
22319 +       WARN_ON_ONCE(p->migrate_disable <= 0);
22320 +       if (p->migrate_disable > 1) {
22321 +               p->migrate_disable--;
22322 +               return;
22323 +       }
22324 +
22325 +       preempt_disable();
22326 +
22327 +       p->migrate_disable = 0;
22328 +       migrate_enable_update_cpus_allowed(p);
22329 +
22330 +       if (p->migrate_disable_update) {
22331 +               struct rq *rq;
22332 +               struct rq_flags rf;
22333 +
22334 +               rq = task_rq_lock(p, &rf);
22335 +               update_rq_clock(rq);
22336 +
22337 +               __do_set_cpus_allowed_tail(p, &p->cpus_mask);
22338 +               task_rq_unlock(rq, p, &rf);
22339 +
22340 +               p->migrate_disable_update = 0;
22341 +
22342 +               WARN_ON(smp_processor_id() != task_cpu(p));
22343 +               if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
22344 +                       const struct cpumask *cpu_valid_mask = cpu_active_mask;
22345 +                       struct migration_arg arg;
22346 +                       unsigned int dest_cpu;
22347 +
22348 +                       if (p->flags & PF_KTHREAD) {
22349 +                               /*
22350 +                                * Kernel threads are allowed on online && !active CPUs
22351 +                                */
22352 +                               cpu_valid_mask = cpu_online_mask;
22353 +                       }
22354 +                       dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
22355 +                       arg.task = p;
22356 +                       arg.dest_cpu = dest_cpu;
22357 +
22358 +                       unpin_current_cpu();
22359 +                       preempt_lazy_enable();
22360 +                       preempt_enable();
22361 +                       stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
22362 +                       tlb_migrate_finish(p->mm);
22363 +
22364 +                       return;
22365 +               }
22366 +       }
22367 +       unpin_current_cpu();
22368 +       preempt_lazy_enable();
22369 +       preempt_enable();
22370 +}
22371 +EXPORT_SYMBOL(migrate_enable);
22372 +
22373 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22374 +void migrate_disable(void)
22375 +{
22376 +       struct task_struct *p = current;
22377 +
22378 +       if (in_atomic() || irqs_disabled()) {
22379 +#ifdef CONFIG_SCHED_DEBUG
22380 +               p->migrate_disable_atomic++;
22381 +#endif
22382 +               return;
22383 +       }
22384 +#ifdef CONFIG_SCHED_DEBUG
22385 +       if (unlikely(p->migrate_disable_atomic)) {
22386 +               tracing_off();
22387 +               WARN_ON_ONCE(1);
22388 +       }
22389 +#endif
22390 +
22391 +       p->migrate_disable++;
22392 +}
22393 +EXPORT_SYMBOL(migrate_disable);
22394 +
22395 +void migrate_enable(void)
22396 +{
22397 +       struct task_struct *p = current;
22398 +
22399 +       if (in_atomic() || irqs_disabled()) {
22400 +#ifdef CONFIG_SCHED_DEBUG
22401 +               p->migrate_disable_atomic--;
22402 +#endif
22403 +               return;
22404 +       }
22405 +
22406 +#ifdef CONFIG_SCHED_DEBUG
22407 +       if (unlikely(p->migrate_disable_atomic)) {
22408 +               tracing_off();
22409 +               WARN_ON_ONCE(1);
22410 +       }
22411 +#endif
22412 +
22413 +       WARN_ON_ONCE(p->migrate_disable <= 0);
22414 +       p->migrate_disable--;
22415 +}
22416 +EXPORT_SYMBOL(migrate_enable);
22417 +#endif
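
The migrate_disable()/migrate_enable() pair added above is at its core a per-task nesting counter: only the 0->1 transition pins the task (cpus_ptr is pointed at a single-CPU mask and nr_cpus_allowed dropped to 1), and only the final 1->0 transition restores cpus_mask and carries out any set_cpus_allowed_ptr() request that was deferred via migrate_disable_update while the task was pinned. A minimal user-space model of that counting pattern follows; pin_cpu(), unpin_cpu() and the two thread-local variables are illustrative stand-ins, not kernel interfaces:

#include <stdio.h>

static __thread int migrate_disable_cnt;   /* models p->migrate_disable */
static __thread int deferred_update;       /* models p->migrate_disable_update */

static void pin_cpu(void)   { printf("pinned to current CPU\n"); }
static void unpin_cpu(void) { printf("unpinned, full mask restored\n"); }

static void model_migrate_disable(void)
{
        if (migrate_disable_cnt++)
                return;                 /* already pinned: just nest deeper */
        pin_cpu();                      /* 0 -> 1: actually pin the task */
}

static void model_migrate_enable(void)
{
        if (--migrate_disable_cnt)
                return;                 /* still nested: nothing to do yet */
        if (deferred_update) {          /* affinity changed while pinned */
                deferred_update = 0;
                printf("applying deferred affinity update\n");
        }
        unpin_cpu();                    /* outermost enable: allow migration again */
}

int main(void)
{
        model_migrate_disable();
        model_migrate_disable();        /* nested call: no second pin */
        deferred_update = 1;            /* e.g. set_cpus_allowed_ptr() arrived meanwhile */
        model_migrate_enable();         /* inner enable: still pinned */
        model_migrate_enable();         /* outer enable: deferred update, then unpin */
        return 0;
}
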
22418 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpudeadline.c linux-4.14/kernel/sched/cpudeadline.c
22419 --- linux-4.14.orig/kernel/sched/cpudeadline.c  2017-11-12 19:46:13.000000000 +0100
22420 +++ linux-4.14/kernel/sched/cpudeadline.c       2018-09-05 11:05:07.000000000 +0200
22421 @@ -127,13 +127,13 @@
22422         const struct sched_dl_entity *dl_se = &p->dl;
22423  
22424         if (later_mask &&
22425 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22426 +           cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
22427                 return 1;
22428         } else {
22429                 int best_cpu = cpudl_maximum(cp);
22430                 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
22431  
22432 -               if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
22433 +               if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
22434                     dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22435                         if (later_mask)
22436                                 cpumask_set_cpu(best_cpu, later_mask);
22437 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/cpupri.c linux-4.14/kernel/sched/cpupri.c
22438 --- linux-4.14.orig/kernel/sched/cpupri.c       2017-11-12 19:46:13.000000000 +0100
22439 +++ linux-4.14/kernel/sched/cpupri.c    2018-09-05 11:05:07.000000000 +0200
22440 @@ -103,11 +103,11 @@
22441                 if (skip)
22442                         continue;
22443  
22444 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22445 +               if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
22446                         continue;
22447  
22448                 if (lowest_mask) {
22449 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22450 +                       cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
22451  
22452                         /*
22453                          * We have to ensure that we have at least one bit
22454 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/deadline.c linux-4.14/kernel/sched/deadline.c
22455 --- linux-4.14.orig/kernel/sched/deadline.c     2018-09-05 11:03:22.000000000 +0200
22456 +++ linux-4.14/kernel/sched/deadline.c  2018-09-05 11:05:07.000000000 +0200
22457 @@ -504,7 +504,7 @@
22458                  * If we cannot preempt any rq, fall back to pick any
22459                  * online cpu.
22460                  */
22461 -               cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
22462 +               cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
22463                 if (cpu >= nr_cpu_ids) {
22464                         /*
22465                          * Fail to find any suitable cpu.
22466 @@ -1020,7 +1020,7 @@
22467  {
22468         struct hrtimer *timer = &dl_se->dl_timer;
22469  
22470 -       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22471 +       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22472         timer->function = dl_task_timer;
22473  }
22474  
22475 @@ -1749,7 +1749,7 @@
22476  static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
22477  {
22478         if (!task_running(rq, p) &&
22479 -           cpumask_test_cpu(cpu, &p->cpus_allowed))
22480 +           cpumask_test_cpu(cpu, p->cpus_ptr))
22481                 return 1;
22482         return 0;
22483  }
22484 @@ -1899,7 +1899,7 @@
22485                 /* Retry if something changed. */
22486                 if (double_lock_balance(rq, later_rq)) {
22487                         if (unlikely(task_rq(task) != rq ||
22488 -                                    !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
22489 +                                    !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
22490                                      task_running(rq, task) ||
22491                                      !dl_task(task) ||
22492                                      !task_on_rq_queued(task))) {
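
The deadline.c hunks above (and the rt.c hunk further below) switch the scheduler's deadline and bandwidth timers to HRTIMER_MODE_REL_HARD. On PREEMPT_RT ordinary hrtimers are expired from softirq/thread context so that their handlers may sleep; timers that drive the scheduler itself must keep expiring from the hard interrupt, which is what the _HARD mode requests. A hedged sketch of the same pattern for a hypothetical timer (my_timer and my_timer_fn are illustrative, and passing the _HARD mode to hrtimer_start() as well is an assumption made here for symmetry with the init):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;         /* illustrative, not part of the patch */

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
        /*
         * With the _HARD mode this runs in hard interrupt context even on
         * PREEMPT_RT, so it must not sleep and may only take raw spinlocks.
         */
        return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        my_timer.function = my_timer_fn;
        hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
}
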
22493 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/debug.c linux-4.14/kernel/sched/debug.c
22494 --- linux-4.14.orig/kernel/sched/debug.c        2017-11-12 19:46:13.000000000 +0100
22495 +++ linux-4.14/kernel/sched/debug.c     2018-09-05 11:05:07.000000000 +0200
22496 @@ -1017,6 +1017,10 @@
22497                 P(dl.runtime);
22498                 P(dl.deadline);
22499         }
22500 +#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
22501 +       P(migrate_disable);
22502 +#endif
22503 +       P(nr_cpus_allowed);
22504  #undef PN_SCHEDSTAT
22505  #undef PN
22506  #undef __PN
22507 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/fair.c linux-4.14/kernel/sched/fair.c
22508 --- linux-4.14.orig/kernel/sched/fair.c 2018-09-05 11:03:22.000000000 +0200
22509 +++ linux-4.14/kernel/sched/fair.c      2018-09-05 11:05:07.000000000 +0200
22510 @@ -1596,7 +1596,7 @@
22511          */
22512         if (cur) {
22513                 /* Skip this swap candidate if cannot move to the source cpu */
22514 -               if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
22515 +               if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
22516                         goto unlock;
22517  
22518                 /*
22519 @@ -1706,7 +1706,7 @@
22520  
22521         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
22522                 /* Skip this CPU if the source task cannot migrate */
22523 -               if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
22524 +               if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
22525                         continue;
22526  
22527                 env->dst_cpu = cpu;
22528 @@ -3840,7 +3840,7 @@
22529         ideal_runtime = sched_slice(cfs_rq, curr);
22530         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22531         if (delta_exec > ideal_runtime) {
22532 -               resched_curr(rq_of(cfs_rq));
22533 +               resched_curr_lazy(rq_of(cfs_rq));
22534                 /*
22535                  * The current task ran long enough, ensure it doesn't get
22536                  * re-elected due to buddy favours.
22537 @@ -3864,7 +3864,7 @@
22538                 return;
22539  
22540         if (delta > ideal_runtime)
22541 -               resched_curr(rq_of(cfs_rq));
22542 +               resched_curr_lazy(rq_of(cfs_rq));
22543  }
22544  
22545  static void
22546 @@ -4006,7 +4006,7 @@
22547          * validating it and just reschedule.
22548          */
22549         if (queued) {
22550 -               resched_curr(rq_of(cfs_rq));
22551 +               resched_curr_lazy(rq_of(cfs_rq));
22552                 return;
22553         }
22554         /*
22555 @@ -4188,7 +4188,7 @@
22556          * hierarchy can be throttled
22557          */
22558         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
22559 -               resched_curr(rq_of(cfs_rq));
22560 +               resched_curr_lazy(rq_of(cfs_rq));
22561  }
22562  
22563  static __always_inline
22564 @@ -4837,7 +4837,7 @@
22565  
22566                 if (delta < 0) {
22567                         if (rq->curr == p)
22568 -                               resched_curr(rq);
22569 +                               resched_curr_lazy(rq);
22570                         return;
22571                 }
22572                 hrtick_start(rq, delta);
22573 @@ -5475,7 +5475,7 @@
22574  
22575                 /* Skip over this group if it has no CPUs allowed */
22576                 if (!cpumask_intersects(sched_group_span(group),
22577 -                                       &p->cpus_allowed))
22578 +                                       p->cpus_ptr))
22579                         continue;
22580  
22581                 local_group = cpumask_test_cpu(this_cpu,
22582 @@ -5595,7 +5595,7 @@
22583                 return cpumask_first(sched_group_span(group));
22584  
22585         /* Traverse only the allowed CPUs */
22586 -       for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
22587 +       for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
22588                 if (idle_cpu(i)) {
22589                         struct rq *rq = cpu_rq(i);
22590                         struct cpuidle_state *idle = idle_get_state(rq);
22591 @@ -5698,7 +5698,7 @@
22592         if (!test_idle_cores(target, false))
22593                 return -1;
22594  
22595 -       cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
22596 +       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
22597  
22598         for_each_cpu_wrap(core, cpus, target) {
22599                 bool idle = true;
22600 @@ -5732,7 +5732,7 @@
22601                 return -1;
22602  
22603         for_each_cpu(cpu, cpu_smt_mask(target)) {
22604 -               if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22605 +               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22606                         continue;
22607                 if (idle_cpu(cpu))
22608                         return cpu;
22609 @@ -5795,7 +5795,7 @@
22610         for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
22611                 if (!--nr)
22612                         return -1;
22613 -               if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22614 +               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22615                         continue;
22616                 if (idle_cpu(cpu))
22617                         break;
22618 @@ -5950,7 +5950,7 @@
22619         if (sd_flag & SD_BALANCE_WAKE) {
22620                 record_wakee(p);
22621                 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
22622 -                             && cpumask_test_cpu(cpu, &p->cpus_allowed);
22623 +                             && cpumask_test_cpu(cpu, p->cpus_ptr);
22624         }
22625  
22626         rcu_read_lock();
22627 @@ -6231,7 +6231,7 @@
22628         return;
22629  
22630  preempt:
22631 -       resched_curr(rq);
22632 +       resched_curr_lazy(rq);
22633         /*
22634          * Only set the backward buddy when the current task is still
22635          * on the rq. This can happen when a wakeup gets interleaved
22636 @@ -6699,14 +6699,14 @@
22637         /*
22638          * We do not migrate tasks that are:
22639          * 1) throttled_lb_pair, or
22640 -        * 2) cannot be migrated to this CPU due to cpus_allowed, or
22641 +        * 2) cannot be migrated to this CPU due to cpus_ptr, or
22642          * 3) running (obviously), or
22643          * 4) are cache-hot on their current CPU.
22644          */
22645         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
22646                 return 0;
22647  
22648 -       if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
22649 +       if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
22650                 int cpu;
22651  
22652                 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
22653 @@ -6726,7 +6726,7 @@
22654  
22655                 /* Prevent to re-select dst_cpu via env's cpus */
22656                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
22657 -                       if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
22658 +                       if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
22659                                 env->flags |= LBF_DST_PINNED;
22660                                 env->new_dst_cpu = cpu;
22661                                 break;
22662 @@ -7295,7 +7295,7 @@
22663  
22664  /*
22665   * Group imbalance indicates (and tries to solve) the problem where balancing
22666 - * groups is inadequate due to ->cpus_allowed constraints.
22667 + * groups is inadequate due to ->cpus_ptr constraints.
22668   *
22669   * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
22670   * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
22671 @@ -7871,7 +7871,7 @@
22672         /*
22673          * If the busiest group is imbalanced the below checks don't
22674          * work because they assume all things are equal, which typically
22675 -        * isn't true due to cpus_allowed constraints and the like.
22676 +        * isn't true due to cpus_ptr constraints and the like.
22677          */
22678         if (busiest->group_type == group_imbalanced)
22679                 goto force_balance;
22680 @@ -8263,7 +8263,7 @@
22681                          * if the curr task on busiest cpu can't be
22682                          * moved to this_cpu
22683                          */
22684 -                       if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
22685 +                       if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
22686                                 raw_spin_unlock_irqrestore(&busiest->lock,
22687                                                             flags);
22688                                 env.flags |= LBF_ALL_PINNED;
22689 @@ -9085,7 +9085,7 @@
22690                  * 'current' within the tree based on its new key value.
22691                  */
22692                 swap(curr->vruntime, se->vruntime);
22693 -               resched_curr(rq);
22694 +               resched_curr_lazy(rq);
22695         }
22696  
22697         se->vruntime -= cfs_rq->min_vruntime;
22698 @@ -9109,7 +9109,7 @@
22699          */
22700         if (rq->curr == p) {
22701                 if (p->prio > oldprio)
22702 -                       resched_curr(rq);
22703 +                       resched_curr_lazy(rq);
22704         } else
22705                 check_preempt_curr(rq, p, 0);
22706  }
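
Throughout fair.c the patch replaces resched_curr() with resched_curr_lazy(): under CONFIG_PREEMPT_LAZY a preemption request on behalf of a SCHED_OTHER task only sets the lazy resched flag, and the preemptible_lazy() helper added to core.c above defers it while preempt_lazy_count is non-zero, whereas a real TIF_NEED_RESCHED (set for RT tasks) is always honoured. A minimal user-space model of that decision; the variable names are illustrative, and the function is only meaningful when some resched request is already pending, just like its kernel counterpart:

#include <stdbool.h>
#include <stdio.h>

static bool need_resched;       /* TIF_NEED_RESCHED: set by resched_curr(), i.e. RT demand */
static int  preempt_lazy_count; /* non-zero while fair-class preemption is deferred */

/* Mirrors preemptible_lazy() from the core.c hunk above. */
static bool should_preempt_now(void)
{
        if (need_resched)        /* a real TIF_NEED_RESCHED always wins */
                return true;
        if (preempt_lazy_count)  /* inside a lazy section: defer the fair request */
                return false;
        return true;             /* otherwise honour the pending lazy request */
}

int main(void)
{
        preempt_lazy_count = 1;                         /* inside a lazy-disabled section */
        printf("fair request, lazy section: %d\n", should_preempt_now());  /* 0 */

        need_resched = true;                            /* an RT task asks for the CPU */
        printf("RT request, lazy section:   %d\n", should_preempt_now());  /* 1 */

        need_resched = false;
        preempt_lazy_count = 0;                         /* left the lazy section */
        printf("fair request, normal:       %d\n", should_preempt_now());  /* 1 */
        return 0;
}
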
22707 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/features.h linux-4.14/kernel/sched/features.h
22708 --- linux-4.14.orig/kernel/sched/features.h     2017-11-12 19:46:13.000000000 +0100
22709 +++ linux-4.14/kernel/sched/features.h  2018-09-05 11:05:07.000000000 +0200
22710 @@ -46,11 +46,19 @@
22711   */
22712  SCHED_FEAT(NONTASK_CAPACITY, true)
22713  
22714 +#ifdef CONFIG_PREEMPT_RT_FULL
22715 +SCHED_FEAT(TTWU_QUEUE, false)
22716 +# ifdef CONFIG_PREEMPT_LAZY
22717 +SCHED_FEAT(PREEMPT_LAZY, true)
22718 +# endif
22719 +#else
22720 +
22721  /*
22722   * Queue remote wakeups on the target CPU and process them
22723   * using the scheduler IPI. Reduces rq->lock contention/bounces.
22724   */
22725  SCHED_FEAT(TTWU_QUEUE, true)
22726 +#endif
22727  
22728  /*
22729   * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
22730 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/Makefile linux-4.14/kernel/sched/Makefile
22731 --- linux-4.14.orig/kernel/sched/Makefile       2017-11-12 19:46:13.000000000 +0100
22732 +++ linux-4.14/kernel/sched/Makefile    2018-09-05 11:05:07.000000000 +0200
22733 @@ -18,7 +18,7 @@
22734  
22735  obj-y += core.o loadavg.o clock.o cputime.o
22736  obj-y += idle_task.o fair.o rt.o deadline.o
22737 -obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22738 +obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22739  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22740  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22741  obj-$(CONFIG_SCHEDSTATS) += stats.o
22742 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/rt.c linux-4.14/kernel/sched/rt.c
22743 --- linux-4.14.orig/kernel/sched/rt.c   2018-09-05 11:03:22.000000000 +0200
22744 +++ linux-4.14/kernel/sched/rt.c        2018-09-05 11:05:07.000000000 +0200
22745 @@ -47,8 +47,8 @@
22746  
22747         raw_spin_lock_init(&rt_b->rt_runtime_lock);
22748  
22749 -       hrtimer_init(&rt_b->rt_period_timer,
22750 -                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22751 +       hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
22752 +                    HRTIMER_MODE_REL_HARD);
22753         rt_b->rt_period_timer.function = sched_rt_period_timer;
22754  }
22755  
22756 @@ -1594,7 +1594,7 @@
22757  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
22758  {
22759         if (!task_running(rq, p) &&
22760 -           cpumask_test_cpu(cpu, &p->cpus_allowed))
22761 +           cpumask_test_cpu(cpu, p->cpus_ptr))
22762                 return 1;
22763         return 0;
22764  }
22765 @@ -1729,7 +1729,7 @@
22766                          * Also make sure that it wasn't scheduled on its rq.
22767                          */
22768                         if (unlikely(task_rq(task) != rq ||
22769 -                                    !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
22770 +                                    !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
22771                                      task_running(rq, task) ||
22772                                      !rt_task(task) ||
22773                                      !task_on_rq_queued(task))) {
22774 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/sched.h linux-4.14/kernel/sched/sched.h
22775 --- linux-4.14.orig/kernel/sched/sched.h        2018-09-05 11:03:22.000000000 +0200
22776 +++ linux-4.14/kernel/sched/sched.h     2018-09-05 11:05:07.000000000 +0200
22777 @@ -1354,6 +1354,7 @@
22778  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
22779  #define WF_FORK                0x02            /* child wakeup after fork */
22780  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
22781 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
22782  
22783  /*
22784   * To aid in avoiding the subversion of "niceness" due to uneven distribution
22785 @@ -1545,6 +1546,15 @@
22786  extern void resched_curr(struct rq *rq);
22787  extern void resched_cpu(int cpu);
22788  
22789 +#ifdef CONFIG_PREEMPT_LAZY
22790 +extern void resched_curr_lazy(struct rq *rq);
22791 +#else
22792 +static inline void resched_curr_lazy(struct rq *rq)
22793 +{
22794 +       resched_curr(rq);
22795 +}
22796 +#endif
22797 +
22798  extern struct rt_bandwidth def_rt_bandwidth;
22799  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
22800  
22801 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swait.c linux-4.14/kernel/sched/swait.c
22802 --- linux-4.14.orig/kernel/sched/swait.c        2017-11-12 19:46:13.000000000 +0100
22803 +++ linux-4.14/kernel/sched/swait.c     2018-09-05 11:05:07.000000000 +0200
22804 @@ -1,6 +1,7 @@
22805  // SPDX-License-Identifier: GPL-2.0
22806  #include <linux/sched/signal.h>
22807  #include <linux/swait.h>
22808 +#include <linux/suspend.h>
22809  
22810  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
22811                              struct lock_class_key *key)
22812 @@ -30,6 +31,25 @@
22813  }
22814  EXPORT_SYMBOL(swake_up_locked);
22815  
22816 +void swake_up_all_locked(struct swait_queue_head *q)
22817 +{
22818 +       struct swait_queue *curr;
22819 +       int wakes = 0;
22820 +
22821 +       while (!list_empty(&q->task_list)) {
22822 +
22823 +               curr = list_first_entry(&q->task_list, typeof(*curr),
22824 +                                       task_list);
22825 +               wake_up_process(curr->task);
22826 +               list_del_init(&curr->task_list);
22827 +               wakes++;
22828 +       }
22829 +       if (pm_in_action)
22830 +               return;
22831 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
22832 +}
22833 +EXPORT_SYMBOL(swake_up_all_locked);
22834 +
22835  void swake_up(struct swait_queue_head *q)
22836  {
22837         unsigned long flags;
22838 @@ -49,6 +69,7 @@
22839         struct swait_queue *curr;
22840         LIST_HEAD(tmp);
22841  
22842 +       WARN_ON(irqs_disabled());
22843         raw_spin_lock_irq(&q->lock);
22844         list_splice_init(&q->task_list, &tmp);
22845         while (!list_empty(&tmp)) {
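
swake_up_all_locked(), added above, wakes every waiter while the caller already holds the queue's raw lock (the RT completion code needs this for complete_all()), and warns when more than two waiters have piled up, since that is unbounded work inside a non-preemptible section. A small user-space model of the "wake everything while the lock is held" pattern; struct swaiter, struct swait_head and the per-waiter semaphore are illustrative stand-ins for the kernel's swait machinery:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct swaiter {
        sem_t sem;                      /* each waiter blocks on its own semaphore */
        struct swaiter *next;
};

struct swait_head {
        pthread_mutex_t lock;           /* stand-in for the raw spinlock */
        struct swaiter *list;
};

/* Caller must hold head->lock, mirroring swake_up_all_locked() above. */
static void model_wake_all_locked(struct swait_head *head)
{
        int wakes = 0;

        while (head->list) {
                struct swaiter *w = head->list;

                head->list = w->next;
                sem_post(&w->sem);      /* analogue of wake_up_process(curr->task) */
                wakes++;
        }
        if (wakes > 2)
                fprintf(stderr, "complete_all() with %d waiters\n", wakes);
}

int main(void)
{
        struct swait_head head = { .lock = PTHREAD_MUTEX_INITIALIZER, .list = NULL };
        struct swaiter w[3];
        int i;

        for (i = 0; i < 3; i++) {
                sem_init(&w[i].sem, 0, 0);
                w[i].next = head.list;          /* push onto the wait list */
                head.list = &w[i];
        }

        pthread_mutex_lock(&head.lock);
        model_wake_all_locked(&head);           /* wakes all three and prints the warning */
        pthread_mutex_unlock(&head.lock);

        for (i = 0; i < 3; i++)
                sem_wait(&w[i].sem);            /* returns immediately: already posted */
        printf("all waiters woken\n");
        return 0;
}
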
22846 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/swork.c linux-4.14/kernel/sched/swork.c
22847 --- linux-4.14.orig/kernel/sched/swork.c        1970-01-01 01:00:00.000000000 +0100
22848 +++ linux-4.14/kernel/sched/swork.c     2018-09-05 11:05:07.000000000 +0200
22849 @@ -0,0 +1,173 @@
22850 +/*
22851 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
22852 + *
22853 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
22854 + * irq context. The callbacks are executed in kthread context.
22855 + */
22856 +
22857 +#include <linux/swait.h>
22858 +#include <linux/swork.h>
22859 +#include <linux/kthread.h>
22860 +#include <linux/slab.h>
22861 +#include <linux/spinlock.h>
22862 +#include <linux/export.h>
22863 +
22864 +#define SWORK_EVENT_PENDING     (1 << 0)
22865 +
22866 +static DEFINE_MUTEX(worker_mutex);
22867 +static struct sworker *glob_worker;
22868 +
22869 +struct sworker {
22870 +       struct list_head events;
22871 +       struct swait_queue_head wq;
22872 +
22873 +       raw_spinlock_t lock;
22874 +
22875 +       struct task_struct *task;
22876 +       int refs;
22877 +};
22878 +
22879 +static bool swork_readable(struct sworker *worker)
22880 +{
22881 +       bool r;
22882 +
22883 +       if (kthread_should_stop())
22884 +               return true;
22885 +
22886 +       raw_spin_lock_irq(&worker->lock);
22887 +       r = !list_empty(&worker->events);
22888 +       raw_spin_unlock_irq(&worker->lock);
22889 +
22890 +       return r;
22891 +}
22892 +
22893 +static int swork_kthread(void *arg)
22894 +{
22895 +       struct sworker *worker = arg;
22896 +
22897 +       for (;;) {
22898 +               swait_event_interruptible(worker->wq,
22899 +                                       swork_readable(worker));
22900 +               if (kthread_should_stop())
22901 +                       break;
22902 +
22903 +               raw_spin_lock_irq(&worker->lock);
22904 +               while (!list_empty(&worker->events)) {
22905 +                       struct swork_event *sev;
22906 +
22907 +                       sev = list_first_entry(&worker->events,
22908 +                                       struct swork_event, item);
22909 +                       list_del(&sev->item);
22910 +                       raw_spin_unlock_irq(&worker->lock);
22911 +
22912 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
22913 +                                                        &sev->flags));
22914 +                       sev->func(sev);
22915 +                       raw_spin_lock_irq(&worker->lock);
22916 +               }
22917 +               raw_spin_unlock_irq(&worker->lock);
22918 +       }
22919 +       return 0;
22920 +}
22921 +
22922 +static struct sworker *swork_create(void)
22923 +{
22924 +       struct sworker *worker;
22925 +
22926 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
22927 +       if (!worker)
22928 +               return ERR_PTR(-ENOMEM);
22929 +
22930 +       INIT_LIST_HEAD(&worker->events);
22931 +       raw_spin_lock_init(&worker->lock);
22932 +       init_swait_queue_head(&worker->wq);
22933 +
22934 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
22935 +       if (IS_ERR(worker->task)) {
22936 +               kfree(worker);
22937 +               return ERR_PTR(-ENOMEM);
22938 +       }
22939 +
22940 +       return worker;
22941 +}
22942 +
22943 +static void swork_destroy(struct sworker *worker)
22944 +{
22945 +       kthread_stop(worker->task);
22946 +
22947 +       WARN_ON(!list_empty(&worker->events));
22948 +       kfree(worker);
22949 +}
22950 +
22951 +/**
22952 + * swork_queue - queue swork
22953 + *
22954 + * Returns %false if @sev was already on a queue, %true otherwise.
22955 + *
22956 + * The work is queued and processed on a random CPU
22957 + */
22958 +bool swork_queue(struct swork_event *sev)
22959 +{
22960 +       unsigned long flags;
22961 +
22962 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
22963 +               return false;
22964 +
22965 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
22966 +       list_add_tail(&sev->item, &glob_worker->events);
22967 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
22968 +
22969 +       swake_up(&glob_worker->wq);
22970 +       return true;
22971 +}
22972 +EXPORT_SYMBOL_GPL(swork_queue);
22973 +
22974 +/**
22975 + * swork_get - get an instance of the sworker
22976 + *
22977 + * Returns a negative error code if the initialization of the worker
22978 + * failed, %0 otherwise.
22979 + *
22980 + */
22981 +int swork_get(void)
22982 +{
22983 +       struct sworker *worker;
22984 +
22985 +       mutex_lock(&worker_mutex);
22986 +       if (!glob_worker) {
22987 +               worker = swork_create();
22988 +               if (IS_ERR(worker)) {
22989 +                       mutex_unlock(&worker_mutex);
22990 +                       return -ENOMEM;
22991 +               }
22992 +
22993 +               glob_worker = worker;
22994 +       }
22995 +
22996 +       glob_worker->refs++;
22997 +       mutex_unlock(&worker_mutex);
22998 +
22999 +       return 0;
23000 +}
23001 +EXPORT_SYMBOL_GPL(swork_get);
23002 +
23003 +/**
23004 + * swork_put - puts an instance of the sworker
23005 + *
23006 + * Will destroy the sworker thread. This function must not be called until all
23007 + * queued events have been completed.
23008 + */
23009 +void swork_put(void)
23010 +{
23011 +       mutex_lock(&worker_mutex);
23012 +
23013 +       glob_worker->refs--;
23014 +       if (glob_worker->refs > 0)
23015 +               goto out;
23016 +
23017 +       swork_destroy(glob_worker);
23018 +       glob_worker = NULL;
23019 +out:
23020 +       mutex_unlock(&worker_mutex);
23021 +}
23022 +EXPORT_SYMBOL_GPL(swork_put);
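
The new kernel/sched/swork.c only exposes three entry points: swork_get()/swork_put() manage the lifetime of the single "kswork" kthread, and swork_queue() hands an event to it from any context, including hard irq. The sketch below shows how a client might use it, based only on the struct swork_event fields visible in this file (item, flags, func); the RT tree's include/linux/swork.h most likely provides an init helper instead of the open-coded field setup used here, and my_event, my_callback and the my_* functions are illustrative names, not part of the patch:

#include <linux/swork.h>
#include <linux/list.h>
#include <linux/printk.h>

static void my_callback(struct swork_event *sev)
{
        /* Runs in the kswork kthread, so sleeping is allowed here. */
        pr_info("swork event handled\n");
}

static struct swork_event my_event;

static int my_setup(void)
{
        int ret;

        ret = swork_get();              /* create or reference the global worker */
        if (ret)
                return ret;

        INIT_LIST_HEAD(&my_event.item); /* open-coded init, see the note above */
        my_event.flags = 0;
        my_event.func = my_callback;
        return 0;
}

static void my_raise_from_irq(void)
{
        /* Safe from hard irq context: only a raw spinlock and a swait wakeup. */
        swork_queue(&my_event);
}

static void my_teardown(void)
{
        swork_put();                    /* drop our reference on the worker */
}
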
23023 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/sched/topology.c linux-4.14/kernel/sched/topology.c
23024 --- linux-4.14.orig/kernel/sched/topology.c     2018-09-05 11:03:22.000000000 +0200
23025 +++ linux-4.14/kernel/sched/topology.c  2018-09-05 11:05:07.000000000 +0200
23026 @@ -286,6 +286,7 @@
23027         rd->rto_cpu = -1;
23028         raw_spin_lock_init(&rd->rto_lock);
23029         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23030 +       rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23031  #endif
23032  
23033         init_dl_bw(&rd->dl_bw);
23034 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/signal.c linux-4.14/kernel/signal.c
23035 --- linux-4.14.orig/kernel/signal.c     2018-09-05 11:03:22.000000000 +0200
23036 +++ linux-4.14/kernel/signal.c  2018-09-05 11:05:07.000000000 +0200
23037 @@ -19,6 +19,7 @@
23038  #include <linux/sched/task.h>
23039  #include <linux/sched/task_stack.h>
23040  #include <linux/sched/cputime.h>
23041 +#include <linux/sched/rt.h>
23042  #include <linux/fs.h>
23043  #include <linux/tty.h>
23044  #include <linux/binfmts.h>
23045 @@ -360,13 +361,30 @@
23046         return false;
23047  }
23048  
23049 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23050 +{
23051 +       struct sigqueue *q = t->sigqueue_cache;
23052 +
23053 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23054 +               return NULL;
23055 +       return q;
23056 +}
23057 +
23058 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23059 +{
23060 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23061 +               return 0;
23062 +       return 1;
23063 +}
23064 +
23065  /*
23066   * allocate a new signal queue record
23067   * - this may be called without locks if and only if t == current, otherwise an
23068   *   appropriate lock must be held to stop the target task from exiting
23069   */
23070  static struct sigqueue *
23071 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23072 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23073 +                   int override_rlimit, int fromslab)
23074  {
23075         struct sigqueue *q = NULL;
23076         struct user_struct *user;
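
get_task_cache()/put_task_cache() above form a lock-free, one-slot per-task cache for sigqueue objects: cmpxchg() only hands out or fills the slot if it still holds the value the caller saw, so a racing user simply loses and falls back to the slab, and RT tasks can recycle their last signal queue entry without taking a lock. A minimal user-space rendition of the one-slot pattern, using the GCC/Clang __sync_val_compare_and_swap() builtin in place of the kernel's cmpxchg(); struct sigqueue_model and the cache_* helpers are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct sigqueue_model { int sig; };

/* One cached object per "task"; NULL means the slot is empty. */
static struct sigqueue_model *cache_slot;

static struct sigqueue_model *cache_get(void)
{
        struct sigqueue_model *q = cache_slot;

        /* Claim the slot only if it still holds the value we just read. */
        if (__sync_val_compare_and_swap(&cache_slot, q, NULL) != q)
                return NULL;
        return q;                       /* may be NULL if the slot was empty */
}

static int cache_put(struct sigqueue_model *q)
{
        /* Park the object only if the slot is currently empty. */
        return __sync_val_compare_and_swap(&cache_slot, NULL, q) == NULL;
}

int main(void)
{
        struct sigqueue_model *q, *again;

        /* First allocation: the cache is empty, fall back to the "slab". */
        q = cache_get();
        if (!q)
                q = malloc(sizeof(*q));
        q->sig = 7;

        /* Free path: try to park the object instead of freeing it. */
        if (!cache_put(q))
                free(q);                /* slot already occupied: really free it */

        /* The next allocation is served straight from the cache. */
        again = cache_get();
        printf("served from cache: %s\n", again ? "yes" : "no");
        free(again);                    /* free(NULL) is a harmless no-op */
        return 0;
}

Note that the kernel helpers invert the return convention (put_task_cache() returns 0 when the object was parked); the sketch keeps a plain boolean for readability.
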
23077 @@ -383,7 +401,10 @@
23078         if (override_rlimit ||
23079             atomic_read(&user->sigpending) <=
23080                         task_rlimit(t, RLIMIT_SIGPENDING)) {
23081 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
23082 +               if (!fromslab)
23083 +                       q = get_task_cache(t);
23084 +               if (!q)
23085 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
23086         } else {
23087                 print_dropped_signal(sig);
23088         }
23089 @@ -400,6 +421,13 @@
23090         return q;
23091  }
23092  
23093 +static struct sigqueue *
23094 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23095 +                int override_rlimit)
23096 +{
23097 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23098 +}
23099 +
23100  static void __sigqueue_free(struct sigqueue *q)
23101  {
23102         if (q->flags & SIGQUEUE_PREALLOC)
23103 @@ -409,6 +437,21 @@
23104         kmem_cache_free(sigqueue_cachep, q);
23105  }
23106  
23107 +static void sigqueue_free_current(struct sigqueue *q)
23108 +{
23109 +       struct user_struct *up;
23110 +
23111 +       if (q->flags & SIGQUEUE_PREALLOC)
23112 +               return;
23113 +
23114 +       up = q->user;
23115 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23116 +               atomic_dec(&up->sigpending);
23117 +               free_uid(up);
23118 +       } else
23119 +                 __sigqueue_free(q);
23120 +}
23121 +
23122  void flush_sigqueue(struct sigpending *queue)
23123  {
23124         struct sigqueue *q;
23125 @@ -422,6 +465,21 @@
23126  }
23127  
23128  /*
23129 + * Called from __exit_signal. Flush tsk->pending and
23130 + * tsk->sigqueue_cache
23131 + */
23132 +void flush_task_sigqueue(struct task_struct *tsk)
23133 +{
23134 +       struct sigqueue *q;
23135 +
23136 +       flush_sigqueue(&tsk->pending);
23137 +
23138 +       q = get_task_cache(tsk);
23139 +       if (q)
23140 +               kmem_cache_free(sigqueue_cachep, q);
23141 +}
23142 +
23143 +/*
23144   * Flush all pending signals for this kthread.
23145   */
23146  void flush_signals(struct task_struct *t)
23147 @@ -542,7 +600,7 @@
23148                         (info->si_code == SI_TIMER) &&
23149                         (info->si_sys_private);
23150  
23151 -               __sigqueue_free(first);
23152 +               sigqueue_free_current(first);
23153         } else {
23154                 /*
23155                  * Ok, it wasn't in the queue.  This must be
23156 @@ -578,6 +636,8 @@
23157         bool resched_timer = false;
23158         int signr;
23159  
23160 +       WARN_ON_ONCE(tsk != current);
23161 +
23162         /* We only dequeue private signals from ourselves, we don't let
23163          * signalfd steal them
23164          */
23165 @@ -1177,8 +1237,8 @@
23166   * We don't want to have recursive SIGSEGV's etc, for example,
23167   * that is why we also clear SIGNAL_UNKILLABLE.
23168   */
23169 -int
23170 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23171 +static int
23172 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23173  {
23174         unsigned long int flags;
23175         int ret, blocked, ignored;
23176 @@ -1207,6 +1267,39 @@
23177         return ret;
23178  }
23179  
23180 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23181 +{
23182 +/*
23183 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23184 + * since it cannot enable preemption, and the signal code's spin_locks
23185 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23186 + * send the signal on exit of the trap.
23187 + */
23188 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23189 +       if (in_atomic()) {
23190 +               if (WARN_ON_ONCE(t != current))
23191 +                       return 0;
23192 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
23193 +                       return 0;
23194 +
23195 +               if (is_si_special(info)) {
23196 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
23197 +                       t->forced_info.si_signo = sig;
23198 +                       t->forced_info.si_errno = 0;
23199 +                       t->forced_info.si_code = SI_KERNEL;
23200 +                       t->forced_info.si_pid = 0;
23201 +                       t->forced_info.si_uid = 0;
23202 +               } else {
23203 +                       t->forced_info = *info;
23204 +               }
23205 +
23206 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23207 +               return 0;
23208 +       }
23209 +#endif
23210 +       return do_force_sig_info(sig, info, t);
23211 +}
23212 +
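Editor's note: the hunk above defers delivery when the fault happens in atomic context by parking the siginfo in t->forced_info and setting TIF_NOTIFY_RESUME, so the signal is sent on the way out of the trap. A minimal userspace analogue of that defer-then-flush pattern is sketched below; forced_info, notify_resume, defer_delivery() and resume_notify() are invented names standing in for the kernel fields and the return-to-user hook.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct pending_info { int signo; int code; };

/* Per-thread deferred-delivery state, analogous to t->forced_info and the
 * TIF_NOTIFY_RESUME flag in the hunk above. */
static __thread struct pending_info forced_info;
static __thread bool notify_resume;

/* Called from a context that must not sleep: just record the request. */
static void defer_delivery(int signo)
{
        if (notify_resume)              /* one request already pending, keep it */
                return;
        memset(&forced_info, 0, sizeof(forced_info));
        forced_info.signo = signo;
        notify_resume = true;
}

/* Called later, at a point where sleeping and locking are allowed. */
static void resume_notify(void)
{
        if (!notify_resume)
                return;
        notify_resume = false;
        printf("delivering deferred signal %d (code %d)\n",
               forced_info.signo, forced_info.code);
}

int main(void)
{
        defer_delivery(11);     /* e.g. a fault noticed in atomic context */
        resume_notify();        /* delivery happens on "exit of the trap" */
        return 0;
}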
23213  /*
23214   * Nuke all other threads in the group.
23215   */
23216 @@ -1241,12 +1334,12 @@
23217                  * Disable interrupts early to avoid deadlocks.
23218                  * See rcu_read_unlock() comment header for details.
23219                  */
23220 -               local_irq_save(*flags);
23221 +               local_irq_save_nort(*flags);
23222                 rcu_read_lock();
23223                 sighand = rcu_dereference(tsk->sighand);
23224                 if (unlikely(sighand == NULL)) {
23225                         rcu_read_unlock();
23226 -                       local_irq_restore(*flags);
23227 +                       local_irq_restore_nort(*flags);
23228                         break;
23229                 }
23230                 /*
23231 @@ -1267,7 +1360,7 @@
23232                 }
23233                 spin_unlock(&sighand->siglock);
23234                 rcu_read_unlock();
23235 -               local_irq_restore(*flags);
23236 +               local_irq_restore_nort(*flags);
23237         }
23238  
23239         return sighand;
23240 @@ -1514,7 +1607,8 @@
23241   */
23242  struct sigqueue *sigqueue_alloc(void)
23243  {
23244 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23245 +       /* Preallocated sigqueue objects always come from the slab cache! */
23246 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23247  
23248         if (q)
23249                 q->flags |= SIGQUEUE_PREALLOC;
23250 @@ -1888,15 +1982,7 @@
23251                 if (gstop_done && ptrace_reparented(current))
23252                         do_notify_parent_cldstop(current, false, why);
23253  
23254 -               /*
23255 -                * Don't want to allow preemption here, because
23256 -                * sys_ptrace() needs this task to be inactive.
23257 -                *
23258 -                * XXX: implement read_unlock_no_resched().
23259 -                */
23260 -               preempt_disable();
23261                 read_unlock(&tasklist_lock);
23262 -               preempt_enable_no_resched();
23263                 freezable_schedule();
23264         } else {
23265                 /*
23266 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/softirq.c linux-4.14/kernel/softirq.c
23267 --- linux-4.14.orig/kernel/softirq.c    2018-09-05 11:03:22.000000000 +0200
23268 +++ linux-4.14/kernel/softirq.c 2018-09-05 11:05:07.000000000 +0200
23269 @@ -21,11 +21,14 @@
23270  #include <linux/freezer.h>
23271  #include <linux/kthread.h>
23272  #include <linux/rcupdate.h>
23273 +#include <linux/delay.h>
23274  #include <linux/ftrace.h>
23275  #include <linux/smp.h>
23276  #include <linux/smpboot.h>
23277  #include <linux/tick.h>
23278 +#include <linux/locallock.h>
23279  #include <linux/irq.h>
23280 +#include <linux/sched/types.h>
23281  
23282  #define CREATE_TRACE_POINTS
23283  #include <trace/events/irq.h>
23284 @@ -56,12 +59,108 @@
23285  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23286  
23287  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23288 +#ifdef CONFIG_PREEMPT_RT_FULL
23289 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23290 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23291 +#endif
23292  
23293  const char * const softirq_to_name[NR_SOFTIRQS] = {
23294         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23295         "TASKLET", "SCHED", "HRTIMER", "RCU"
23296  };
23297  
23298 +#ifdef CONFIG_NO_HZ_COMMON
23299 +# ifdef CONFIG_PREEMPT_RT_FULL
23300 +
23301 +struct softirq_runner {
23302 +       struct task_struct *runner[NR_SOFTIRQS];
23303 +};
23304 +
23305 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23306 +
23307 +static inline void softirq_set_runner(unsigned int sirq)
23308 +{
23309 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23310 +
23311 +       sr->runner[sirq] = current;
23312 +}
23313 +
23314 +static inline void softirq_clr_runner(unsigned int sirq)
23315 +{
23316 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23317 +
23318 +       sr->runner[sirq] = NULL;
23319 +}
23320 +
23321 +/*
23322 + * On preempt-rt a softirq running context might be blocked on a
23323 + * lock. There might be no other runnable task on this CPU because the
23324 + * lock owner runs on some other CPU. So we have to go into idle with
23325 + * the pending bit set. Therefore we need to check this, otherwise we
23326 + * warn about false positives, which confuses users and defeats the
23327 + * whole purpose of this test.
23328 + *
23329 + * This code is called with interrupts disabled.
23330 + */
23331 +void softirq_check_pending_idle(void)
23332 +{
23333 +       static int rate_limit;
23334 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23335 +       u32 warnpending;
23336 +       int i;
23337 +
23338 +       if (rate_limit >= 10)
23339 +               return;
23340 +
23341 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23342 +       for (i = 0; i < NR_SOFTIRQS; i++) {
23343 +               struct task_struct *tsk = sr->runner[i];
23344 +
23345 +               /*
23346 +                * The wakeup code in rtmutex.c wakes up the task
23347 +                * _before_ it sets pi_blocked_on to NULL under
23348 +                * tsk->pi_lock. So we need to check for both: state
23349 +                * and pi_blocked_on.
23350 +                */
23351 +               if (tsk) {
23352 +                       raw_spin_lock(&tsk->pi_lock);
23353 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23354 +                               /* Clear all bits pending in that task */
23355 +                               warnpending &= ~(tsk->softirqs_raised);
23356 +                               warnpending &= ~(1 << i);
23357 +                       }
23358 +                       raw_spin_unlock(&tsk->pi_lock);
23359 +               }
23360 +       }
23361 +
23362 +       if (warnpending) {
23363 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23364 +                      warnpending);
23365 +               rate_limit++;
23366 +       }
23367 +}
23368 +# else
23369 +/*
23370 + * On !PREEMPT_RT we just printk rate limited:
23371 + */
23372 +void softirq_check_pending_idle(void)
23373 +{
23374 +       static int rate_limit;
23375 +
23376 +       if (rate_limit < 10 &&
23377 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23378 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23379 +                      local_softirq_pending());
23380 +               rate_limit++;
23381 +       }
23382 +}
23383 +# endif
23384 +
23385 +#else /* !CONFIG_NO_HZ_COMMON */
23386 +static inline void softirq_set_runner(unsigned int sirq) { }
23387 +static inline void softirq_clr_runner(unsigned int sirq) { }
23388 +#endif
23389 +
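Editor's note: both variants of softirq_check_pending_idle() above bound their output with a static rate_limit counter. The trivial sketch below shows that print-at-most-N pattern in isolation; the message text and the limit are placeholders.

#include <stdio.h>

/* Print at most LIMIT warnings, then go quiet, mirroring the static
 * rate_limit counter used above. */
static void warn_rate_limited(unsigned int pending)
{
        enum { LIMIT = 10 };
        static int count;

        if (count >= LIMIT || !pending)
                return;
        fprintf(stderr, "NOHZ: pending bits %02x\n", pending);
        count++;
}

int main(void)
{
        for (int i = 0; i < 20; i++)
                warn_rate_limited(0x08);        /* only 10 lines are printed */
        return 0;
}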
23390  /*
23391   * we cannot loop indefinitely here to avoid userspace starvation,
23392   * but we also don't want to introduce a worst case 1/HZ latency
23393 @@ -77,6 +176,38 @@
23394                 wake_up_process(tsk);
23395  }
23396  
23397 +#ifdef CONFIG_PREEMPT_RT_FULL
23398 +static void wakeup_timer_softirqd(void)
23399 +{
23400 +       /* Interrupts are disabled: no need to stop preemption */
23401 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23402 +
23403 +       if (tsk && tsk->state != TASK_RUNNING)
23404 +               wake_up_process(tsk);
23405 +}
23406 +#endif
23407 +
23408 +static void handle_softirq(unsigned int vec_nr)
23409 +{
23410 +       struct softirq_action *h = softirq_vec + vec_nr;
23411 +       int prev_count;
23412 +
23413 +       prev_count = preempt_count();
23414 +
23415 +       kstat_incr_softirqs_this_cpu(vec_nr);
23416 +
23417 +       trace_softirq_entry(vec_nr);
23418 +       h->action(h);
23419 +       trace_softirq_exit(vec_nr);
23420 +       if (unlikely(prev_count != preempt_count())) {
23421 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23422 +                      vec_nr, softirq_to_name[vec_nr], h->action,
23423 +                      prev_count, preempt_count());
23424 +               preempt_count_set(prev_count);
23425 +       }
23426 +}
23427 +
23428 +#ifndef CONFIG_PREEMPT_RT_FULL
23429  /*
23430   * If ksoftirqd is scheduled, we do not want to process pending softirqs
23431   * right now. Let ksoftirqd handle this at its own rate, to get fairness,
23432 @@ -92,6 +223,47 @@
23433         return tsk && (tsk->state == TASK_RUNNING);
23434  }
23435  
23436 +static inline int ksoftirqd_softirq_pending(void)
23437 +{
23438 +       return local_softirq_pending();
23439 +}
23440 +
23441 +static void handle_pending_softirqs(u32 pending)
23442 +{
23443 +       struct softirq_action *h = softirq_vec;
23444 +       int softirq_bit;
23445 +
23446 +       local_irq_enable();
23447 +
23448 +       h = softirq_vec;
23449 +
23450 +       while ((softirq_bit = ffs(pending))) {
23451 +               unsigned int vec_nr;
23452 +
23453 +               h += softirq_bit - 1;
23454 +               vec_nr = h - softirq_vec;
23455 +               handle_softirq(vec_nr);
23456 +
23457 +               h++;
23458 +               pending >>= softirq_bit;
23459 +       }
23460 +
23461 +       rcu_bh_qs();
23462 +       local_irq_disable();
23463 +}
23464 +
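Editor's note: handle_pending_softirqs() above walks the pending bitmask lowest-bit-first with ffs(), advancing a cursor into the vector and shifting the mask as it goes. The self-contained sketch below reproduces that walk with invented handler names; it assumes nothing beyond standard C and POSIX ffs().

#include <stdio.h>
#include <strings.h>        /* ffs() */

#define NR_VEC 10

static void (*vec[NR_VEC])(void);

static void dispatch_pending(unsigned int pending)
{
        void (**h)(void) = vec;
        int bit;

        /* Advance the cursor to the next set bit, run the handler, then
         * shift the remaining bits down so ffs() keeps the offsets small,
         * exactly like the loop above. */
        while ((bit = ffs(pending))) {
                h += bit - 1;
                if (*h)
                        (*h)();
                h++;
                pending >>= bit;
        }
}

static void say_timer(void) { puts("TIMER"); }
static void say_net(void)   { puts("NET_RX"); }

int main(void)
{
        vec[1] = say_timer;
        vec[3] = say_net;
        dispatch_pending((1u << 1) | (1u << 3));
        return 0;
}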
23465 +static void run_ksoftirqd(unsigned int cpu)
23466 +{
23467 +       local_irq_disable();
23468 +       if (ksoftirqd_softirq_pending()) {
23469 +               __do_softirq();
23470 +               local_irq_enable();
23471 +               cond_resched_rcu_qs();
23472 +               return;
23473 +       }
23474 +       local_irq_enable();
23475 +}
23476 +
23477  /*
23478   * preempt_count and SOFTIRQ_OFFSET usage:
23479   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
23480 @@ -247,10 +419,8 @@
23481         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
23482         unsigned long old_flags = current->flags;
23483         int max_restart = MAX_SOFTIRQ_RESTART;
23484 -       struct softirq_action *h;
23485         bool in_hardirq;
23486         __u32 pending;
23487 -       int softirq_bit;
23488  
23489         /*
23490          * Mask out PF_MEMALLOC as current task context is borrowed for the
23491 @@ -269,36 +439,7 @@
23492         /* Reset the pending bitmask before enabling irqs */
23493         set_softirq_pending(0);
23494  
23495 -       local_irq_enable();
23496 -
23497 -       h = softirq_vec;
23498 -
23499 -       while ((softirq_bit = ffs(pending))) {
23500 -               unsigned int vec_nr;
23501 -               int prev_count;
23502 -
23503 -               h += softirq_bit - 1;
23504 -
23505 -               vec_nr = h - softirq_vec;
23506 -               prev_count = preempt_count();
23507 -
23508 -               kstat_incr_softirqs_this_cpu(vec_nr);
23509 -
23510 -               trace_softirq_entry(vec_nr);
23511 -               h->action(h);
23512 -               trace_softirq_exit(vec_nr);
23513 -               if (unlikely(prev_count != preempt_count())) {
23514 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23515 -                              vec_nr, softirq_to_name[vec_nr], h->action,
23516 -                              prev_count, preempt_count());
23517 -                       preempt_count_set(prev_count);
23518 -               }
23519 -               h++;
23520 -               pending >>= softirq_bit;
23521 -       }
23522 -
23523 -       rcu_bh_qs();
23524 -       local_irq_disable();
23525 +       handle_pending_softirqs(pending);
23526  
23527         pending = local_softirq_pending();
23528         if (pending) {
23529 @@ -335,6 +476,309 @@
23530  }
23531  
23532  /*
23533 + * This function must run with irqs disabled!
23534 + */
23535 +void raise_softirq_irqoff(unsigned int nr)
23536 +{
23537 +       __raise_softirq_irqoff(nr);
23538 +
23539 +       /*
23540 +        * If we're in an interrupt or softirq, we're done
23541 +        * (this also catches softirq-disabled code). We will
23542 +        * actually run the softirq once we return from
23543 +        * the irq or softirq.
23544 +        *
23545 +        * Otherwise we wake up ksoftirqd to make sure we
23546 +        * schedule the softirq soon.
23547 +        */
23548 +       if (!in_interrupt())
23549 +               wakeup_softirqd();
23550 +}
23551 +
23552 +void __raise_softirq_irqoff(unsigned int nr)
23553 +{
23554 +       trace_softirq_raise(nr);
23555 +       or_softirq_pending(1UL << nr);
23556 +}
23557 +
23558 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
23559 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
23560 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
23561 +
23562 +#else /* !PREEMPT_RT_FULL */
23563 +
23564 +/*
23565 + * On RT we serialize softirq execution with a cpu local lock per softirq
23566 + */
23567 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
23568 +
23569 +void __init softirq_early_init(void)
23570 +{
23571 +       int i;
23572 +
23573 +       for (i = 0; i < NR_SOFTIRQS; i++)
23574 +               local_irq_lock_init(local_softirq_locks[i]);
23575 +}
23576 +
23577 +static void lock_softirq(int which)
23578 +{
23579 +       local_lock(local_softirq_locks[which]);
23580 +}
23581 +
23582 +static void unlock_softirq(int which)
23583 +{
23584 +       local_unlock(local_softirq_locks[which]);
23585 +}
23586 +
23587 +static void do_single_softirq(int which)
23588 +{
23589 +       unsigned long old_flags = current->flags;
23590 +
23591 +       current->flags &= ~PF_MEMALLOC;
23592 +       vtime_account_irq_enter(current);
23593 +       current->flags |= PF_IN_SOFTIRQ;
23594 +       lockdep_softirq_enter();
23595 +       local_irq_enable();
23596 +       handle_softirq(which);
23597 +       local_irq_disable();
23598 +       lockdep_softirq_exit();
23599 +       current->flags &= ~PF_IN_SOFTIRQ;
23600 +       vtime_account_irq_enter(current);
23601 +       current_restore_flags(old_flags, PF_MEMALLOC);
23602 +}
23603 +
23604 +/*
23605 + * Called with interrupts disabled. Process softirqs which were raised
23606 + * in current context (or on behalf of ksoftirqd).
23607 + */
23608 +static void do_current_softirqs(void)
23609 +{
23610 +       while (current->softirqs_raised) {
23611 +               int i = __ffs(current->softirqs_raised);
23612 +               unsigned int pending, mask = (1U << i);
23613 +
23614 +               current->softirqs_raised &= ~mask;
23615 +               local_irq_enable();
23616 +
23617 +               /*
23618 +                * If the lock is contended, we boost the owner to
23619 +                * process the softirq or leave the critical section
23620 +                * now.
23621 +                */
23622 +               lock_softirq(i);
23623 +               local_irq_disable();
23624 +               softirq_set_runner(i);
23625 +               /*
23626 +                * Check with the local_softirq_pending() bits,
23627 +                * whether we need to process this still or if someone
23628 +                * else took care of it.
23629 +                */
23630 +               pending = local_softirq_pending();
23631 +               if (pending & mask) {
23632 +                       set_softirq_pending(pending & ~mask);
23633 +                       do_single_softirq(i);
23634 +               }
23635 +               softirq_clr_runner(i);
23636 +               WARN_ON(current->softirq_nestcnt != 1);
23637 +               local_irq_enable();
23638 +               unlock_softirq(i);
23639 +               local_irq_disable();
23640 +       }
23641 +}
23642 +
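Editor's note: on RT, do_current_softirqs() above serializes each vector with its own local lock before handling it. The userspace sketch below keeps only that shape: one mutex per vector, taken around the handler. The per-CPU aspect, the irq toggling and the pending-bit recheck are intentionally left out, and all names are invented.

#include <pthread.h>
#include <stdio.h>
#include <strings.h>        /* ffs() */

#define NR_VEC 10

/* One lock per softirq vector, standing in for local_softirq_locks[]. */
static pthread_mutex_t softirq_locks[NR_VEC];
static unsigned int raised;

static void handle(int i) { printf("softirq %d\n", i); }

/* Handle every raised vector, taking the matching lock so a concurrent
 * handler of the same vector would be serialized against us. */
static void do_current_softirqs_sketch(void)
{
        while (raised) {
                int i = ffs(raised) - 1;

                raised &= ~(1u << i);
                pthread_mutex_lock(&softirq_locks[i]);
                handle(i);
                pthread_mutex_unlock(&softirq_locks[i]);
        }
}

int main(void)
{
        for (int i = 0; i < NR_VEC; i++)
                pthread_mutex_init(&softirq_locks[i], NULL);

        raised = (1u << 1) | (1u << 6);
        do_current_softirqs_sketch();
        return 0;
}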
23643 +void __local_bh_disable(void)
23644 +{
23645 +       if (++current->softirq_nestcnt == 1)
23646 +               migrate_disable();
23647 +}
23648 +EXPORT_SYMBOL(__local_bh_disable);
23649 +
23650 +void __local_bh_enable(void)
23651 +{
23652 +       if (WARN_ON(current->softirq_nestcnt == 0))
23653 +               return;
23654 +
23655 +       local_irq_disable();
23656 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
23657 +               do_current_softirqs();
23658 +       local_irq_enable();
23659 +
23660 +       if (--current->softirq_nestcnt == 0)
23661 +               migrate_enable();
23662 +}
23663 +EXPORT_SYMBOL(__local_bh_enable);
23664 +
23665 +void _local_bh_enable(void)
23666 +{
23667 +       if (WARN_ON(current->softirq_nestcnt == 0))
23668 +               return;
23669 +       if (--current->softirq_nestcnt == 0)
23670 +               migrate_enable();
23671 +}
23672 +EXPORT_SYMBOL(_local_bh_enable);
23673 +
23674 +int in_serving_softirq(void)
23675 +{
23676 +       return current->flags & PF_IN_SOFTIRQ;
23677 +}
23678 +EXPORT_SYMBOL(in_serving_softirq);
23679 +
23680 +/* Called with preemption disabled */
23681 +static void run_ksoftirqd(unsigned int cpu)
23682 +{
23683 +       local_irq_disable();
23684 +       current->softirq_nestcnt++;
23685 +
23686 +       do_current_softirqs();
23687 +       current->softirq_nestcnt--;
23688 +       local_irq_enable();
23689 +       cond_resched_rcu_qs();
23690 +}
23691 +
23692 +/*
23693 + * Called from netif_rx_ni(). Preemption enabled, but migration
23694 + * disabled. So the cpu can't go away under us.
23695 + */
23696 +void thread_do_softirq(void)
23697 +{
23698 +       if (!in_serving_softirq() && current->softirqs_raised) {
23699 +               current->softirq_nestcnt++;
23700 +               do_current_softirqs();
23701 +               current->softirq_nestcnt--;
23702 +       }
23703 +}
23704 +
23705 +static void do_raise_softirq_irqoff(unsigned int nr)
23706 +{
23707 +       unsigned int mask;
23708 +
23709 +       mask = 1UL << nr;
23710 +
23711 +       trace_softirq_raise(nr);
23712 +       or_softirq_pending(mask);
23713 +
23714 +       /*
23715 +        * If we are not in a hard interrupt and inside a bh disabled
23716 +        * region, we simply raise the flag on current. local_bh_enable()
23717 +        * will make sure that the softirq is executed. Otherwise we
23718 +        * delegate it to ksoftirqd.
23719 +        */
23720 +       if (!in_irq() && current->softirq_nestcnt)
23721 +               current->softirqs_raised |= mask;
23722 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
23723 +               return;
23724 +
23725 +       if (mask & TIMER_SOFTIRQS)
23726 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23727 +       else
23728 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
23729 +}
23730 +
23731 +static void wakeup_proper_softirq(unsigned int nr)
23732 +{
23733 +       if ((1UL << nr) & TIMER_SOFTIRQS)
23734 +               wakeup_timer_softirqd();
23735 +       else
23736 +               wakeup_softirqd();
23737 +}
23738 +
23739 +void __raise_softirq_irqoff(unsigned int nr)
23740 +{
23741 +       do_raise_softirq_irqoff(nr);
23742 +       if (!in_irq() && !current->softirq_nestcnt)
23743 +               wakeup_proper_softirq(nr);
23744 +}
23745 +
23746 +/*
23747 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
23748 + */
23749 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
23750 +{
23751 +       unsigned int mask;
23752 +
23753 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
23754 +                        !__this_cpu_read(ktimer_softirqd)))
23755 +               return;
23756 +       mask = 1UL << nr;
23757 +
23758 +       trace_softirq_raise(nr);
23759 +       or_softirq_pending(mask);
23760 +       if (mask & TIMER_SOFTIRQS)
23761 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
23762 +       else
23763 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
23764 +       wakeup_proper_softirq(nr);
23765 +}
23766 +
23767 +/*
23768 + * This function must run with irqs disabled!
23769 + */
23770 +void raise_softirq_irqoff(unsigned int nr)
23771 +{
23772 +       do_raise_softirq_irqoff(nr);
23773 +
23774 +       /*
23775 +        * If we're in a hard interrupt we let the irq return code deal
23776 +        * with the wakeup of ksoftirqd.
23777 +        */
23778 +       if (in_irq())
23779 +               return;
23780 +       /*
23781 +        * If we are in thread context but outside of a bh disabled
23782 +        * region, we need to wake ksoftirqd as well.
23783 +        *
23784 +        * CHECKME: Some of the places which do that could be wrapped
23785 +        * into local_bh_disable/enable pairs. Though it's unclear
23786 +        * whether this is worth the effort. To find those places just
23787 +        * raise a WARN() if the condition is met.
23788 +        */
23789 +       if (!current->softirq_nestcnt)
23790 +               wakeup_proper_softirq(nr);
23791 +}
23792 +
23793 +static inline int ksoftirqd_softirq_pending(void)
23794 +{
23795 +       return current->softirqs_raised;
23796 +}
23797 +
23798 +static inline void local_bh_disable_nort(void) { }
23799 +static inline void _local_bh_enable_nort(void) { }
23800 +
23801 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
23802 +{
23803 +       /* Take over all but timer pending softirqs when starting */
23804 +       local_irq_disable();
23805 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
23806 +       local_irq_enable();
23807 +}
23808 +
23809 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
23810 +{
23811 +       struct sched_param param = { .sched_priority = 1 };
23812 +
23813 +       sched_setscheduler(current, SCHED_FIFO, &param);
23814 +
23815 +       /* Take over timer pending softirqs when starting */
23816 +       local_irq_disable();
23817 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
23818 +       local_irq_enable();
23819 +}
23820 +
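Editor's note: the setup hooks above put the timer-softirq thread on SCHED_FIFO priority 1 via the in-kernel sched_setscheduler(). The userspace system call of the same name takes the same policy and sched_param arguments, so the snippet below shows the equivalent call from an ordinary process; it needs CAP_SYS_NICE (or root) to succeed, so failure is expected when unprivileged.

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param param = { .sched_priority = 1 };

        /* pid 0 means "the calling process/thread". */
        if (sched_setscheduler(0, SCHED_FIFO, &param))
                perror("sched_setscheduler(SCHED_FIFO, prio 1)");
        else
                printf("now running SCHED_FIFO at priority %d\n",
                       param.sched_priority);
        return 0;
}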
23821 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
23822 +                                                   bool online)
23823 +{
23824 +       struct sched_param param = { .sched_priority = 0 };
23825 +
23826 +       sched_setscheduler(current, SCHED_NORMAL, &param);
23827 +}
23828 +
23829 +static int ktimer_softirqd_should_run(unsigned int cpu)
23830 +{
23831 +       return current->softirqs_raised;
23832 +}
23833 +
23834 +#endif /* PREEMPT_RT_FULL */
23835 +/*
23836   * Enter an interrupt context.
23837   */
23838  void irq_enter(void)
23839 @@ -345,9 +789,9 @@
23840                  * Prevent raise_softirq from needlessly waking up ksoftirqd
23841                  * here, as softirq will be serviced on return from interrupt.
23842                  */
23843 -               local_bh_disable();
23844 +               local_bh_disable_nort();
23845                 tick_irq_enter();
23846 -               _local_bh_enable();
23847 +               _local_bh_enable_nort();
23848         }
23849  
23850         __irq_enter();
23851 @@ -355,6 +799,7 @@
23852  
23853  static inline void invoke_softirq(void)
23854  {
23855 +#ifndef CONFIG_PREEMPT_RT_FULL
23856         if (ksoftirqd_running(local_softirq_pending()))
23857                 return;
23858  
23859 @@ -377,6 +822,18 @@
23860         } else {
23861                 wakeup_softirqd();
23862         }
23863 +#else /* PREEMPT_RT_FULL */
23864 +       unsigned long flags;
23865 +
23866 +       local_irq_save(flags);
23867 +       if (__this_cpu_read(ksoftirqd) &&
23868 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
23869 +               wakeup_softirqd();
23870 +       if (__this_cpu_read(ktimer_softirqd) &&
23871 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
23872 +               wakeup_timer_softirqd();
23873 +       local_irq_restore(flags);
23874 +#endif
23875  }
23876  
23877  static inline void tick_irq_exit(void)
23878 @@ -385,7 +842,13 @@
23879         int cpu = smp_processor_id();
23880  
23881         /* Make sure that timer wheel updates are propagated */
23882 -       if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
23883 +#ifdef CONFIG_PREEMPT_RT_BASE
23884 +       if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
23885 +           !need_resched() && !local_softirq_pending())
23886 +#else
23887 +       if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))
23888 +#endif
23889 +       {
23890                 if (!in_irq())
23891                         tick_nohz_irq_exit();
23892         }
23893 @@ -413,26 +876,6 @@
23894         trace_hardirq_exit(); /* must be last! */
23895  }
23896  
23897 -/*
23898 - * This function must run with irqs disabled!
23899 - */
23900 -inline void raise_softirq_irqoff(unsigned int nr)
23901 -{
23902 -       __raise_softirq_irqoff(nr);
23903 -
23904 -       /*
23905 -        * If we're in an interrupt or softirq, we're done
23906 -        * (this also catches softirq-disabled code). We will
23907 -        * actually run the softirq once we return from
23908 -        * the irq or softirq.
23909 -        *
23910 -        * Otherwise we wake up ksoftirqd to make sure we
23911 -        * schedule the softirq soon.
23912 -        */
23913 -       if (!in_interrupt())
23914 -               wakeup_softirqd();
23915 -}
23916 -
23917  void raise_softirq(unsigned int nr)
23918  {
23919         unsigned long flags;
23920 @@ -442,12 +885,6 @@
23921         local_irq_restore(flags);
23922  }
23923  
23924 -void __raise_softirq_irqoff(unsigned int nr)
23925 -{
23926 -       trace_softirq_raise(nr);
23927 -       or_softirq_pending(1UL << nr);
23928 -}
23929 -
23930  void open_softirq(int nr, void (*action)(struct softirq_action *))
23931  {
23932         softirq_vec[nr].action = action;
23933 @@ -464,15 +901,45 @@
23934  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
23935  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
23936  
23937 +static void inline
23938 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
23939 +{
23940 +       if (tasklet_trylock(t)) {
23941 +again:
23942 +               /* We may have been preempted before tasklet_trylock
23943 +                * and __tasklet_action may have already run.
23944 +                * So double-check the SCHED bit while the tasklet
23945 +                * is locked before adding it to the list.
23946 +                */
23947 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
23948 +                       t->next = NULL;
23949 +                       *head->tail = t;
23950 +                       head->tail = &(t->next);
23951 +                       raise_softirq_irqoff(nr);
23952 +                       tasklet_unlock(t);
23953 +               } else {
23954 +                       /* This is subtle. If we hit the corner case above,
23955 +                        * it is possible that we get preempted right here,
23956 +                        * and another task has successfully called
23957 +                        * tasklet_schedule(), then this function, and
23958 +                        * failed on the trylock. Thus, before releasing
23959 +                        * the tasklet lock, we must be sure that the
23960 +                        * SCHED bit is clear. Otherwise the tasklet
23961 +                        * may get its SCHED bit set but never be added
23962 +                        * to the list.
23963 +                        */
23964 +                       if (!tasklet_tryunlock(t))
23965 +                               goto again;
23966 +               }
23967 +       }
23968 +}
23969 +
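Editor's note: __tasklet_common_schedule() above relies on the tasklet state bits: trylock sets the RUN bit only if it was clear, and tryunlock allows only the RUN -> 0 transition, so a concurrent schedule is never lost. The sketch below models that state machine with C11 atomics; ST_SCHED/ST_RUN, mini_tasklet and run_once() are stand-ins for TASKLET_STATE_SCHED/RUN and the kernel helpers, not the kernel implementation itself.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ST_SCHED (1u << 0)   /* queued for execution */
#define ST_RUN   (1u << 1)   /* currently executing */

struct mini_tasklet {
        _Atomic unsigned int state;
        void (*func)(void);
};

static bool trylock(struct mini_tasklet *t)
{
        /* Succeeds only if the RUN bit was clear before. */
        return !(atomic_fetch_or(&t->state, ST_RUN) & ST_RUN);
}

static bool tryunlock(struct mini_tasklet *t)
{
        /* Only the RUN -> 0 transition is allowed; fails if SCHED got set. */
        unsigned int expected = ST_RUN;

        return atomic_compare_exchange_strong(&t->state, &expected, 0);
}

static void run_once(struct mini_tasklet *t)
{
        if (!trylock(t))
                return;                         /* someone else is running it */
again:
        /* Consume the SCHED bit and run the body if it was set. */
        if (atomic_fetch_and(&t->state, ~ST_SCHED) & ST_SCHED)
                t->func();
        if (!tryunlock(t))
                goto again;                     /* re-scheduled meanwhile */
}

static void work(void) { puts("tasklet body ran"); }

int main(void)
{
        struct mini_tasklet t = { .func = work };

        atomic_store(&t.state, ST_SCHED);       /* "tasklet_schedule()" */
        run_once(&t);
        return 0;
}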
23970  void __tasklet_schedule(struct tasklet_struct *t)
23971  {
23972         unsigned long flags;
23973  
23974         local_irq_save(flags);
23975 -       t->next = NULL;
23976 -       *__this_cpu_read(tasklet_vec.tail) = t;
23977 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
23978 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
23979 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
23980         local_irq_restore(flags);
23981  }
23982  EXPORT_SYMBOL(__tasklet_schedule);
23983 @@ -482,50 +949,108 @@
23984         unsigned long flags;
23985  
23986         local_irq_save(flags);
23987 -       t->next = NULL;
23988 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
23989 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
23990 -       raise_softirq_irqoff(HI_SOFTIRQ);
23991 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
23992         local_irq_restore(flags);
23993  }
23994  EXPORT_SYMBOL(__tasklet_hi_schedule);
23995  
23996 -static __latent_entropy void tasklet_action(struct softirq_action *a)
23997 +void tasklet_enable(struct tasklet_struct *t)
23998  {
23999 -       struct tasklet_struct *list;
24000 +       if (!atomic_dec_and_test(&t->count))
24001 +               return;
24002 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24003 +               tasklet_schedule(t);
24004 +}
24005 +EXPORT_SYMBOL(tasklet_enable);
24006  
24007 -       local_irq_disable();
24008 -       list = __this_cpu_read(tasklet_vec.head);
24009 -       __this_cpu_write(tasklet_vec.head, NULL);
24010 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24011 -       local_irq_enable();
24012 +static void __tasklet_action(struct softirq_action *a,
24013 +                            struct tasklet_struct *list)
24014 +{
24015 +       int loops = 1000000;
24016  
24017         while (list) {
24018                 struct tasklet_struct *t = list;
24019  
24020                 list = list->next;
24021  
24022 -               if (tasklet_trylock(t)) {
24023 -                       if (!atomic_read(&t->count)) {
24024 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24025 -                                                       &t->state))
24026 -                                       BUG();
24027 -                               t->func(t->data);
24028 -                               tasklet_unlock(t);
24029 -                               continue;
24030 -                       }
24031 -                       tasklet_unlock(t);
24032 +               /*
24033 +                * Should always succeed - after a tasklet got on the
24034 +                * list (after getting the SCHED bit set from 0 to 1),
24035 +                * nothing but the tasklet softirq it got queued to can
24036 +                * lock it:
24037 +                */
24038 +               if (!tasklet_trylock(t)) {
24039 +                       WARN_ON(1);
24040 +                       continue;
24041                 }
24042  
24043 -               local_irq_disable();
24044                 t->next = NULL;
24045 -               *__this_cpu_read(tasklet_vec.tail) = t;
24046 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
24047 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24048 -               local_irq_enable();
24049 +
24050 +               /*
24051 +                * If we cannot handle the tasklet because it's disabled,
24052 +                * mark it as pending. tasklet_enable() will later
24053 +                * re-schedule the tasklet.
24054 +                */
24055 +               if (unlikely(atomic_read(&t->count))) {
24056 +out_disabled:
24057 +                       /* implicit unlock: */
24058 +                       wmb();
24059 +                       t->state = TASKLET_STATEF_PENDING;
24060 +                       continue;
24061 +               }
24062 +
24063 +                * From this point on, the tasklet might be rescheduled
24064 +                * After this point on the tasklet might be rescheduled
24065 +                * on another CPU, but it can only be added to another
24066 +                * CPU's tasklet list if we unlock the tasklet (which we
24067 +                * don't do yet).
24068 +                */
24069 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24070 +                       WARN_ON(1);
24071 +
24072 +again:
24073 +               t->func(t->data);
24074 +
24075 +               /*
24076 +                * Try to unlock the tasklet. We must use cmpxchg, because
24077 +                * another CPU might have scheduled or disabled the tasklet.
24078 +                * We only allow the STATE_RUN -> 0 transition here.
24079 +                */
24080 +               while (!tasklet_tryunlock(t)) {
24081 +                       /*
24082 +                        * If it got disabled meanwhile, bail out:
24083 +                        */
24084 +                       if (atomic_read(&t->count))
24085 +                               goto out_disabled;
24086 +                       /*
24087 +                        * If it got scheduled meanwhile, re-execute
24088 +                        * the tasklet function:
24089 +                        */
24090 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24091 +                               goto again;
24092 +                       if (!--loops) {
24093 +                               printk("hm, tasklet state: %08lx\n", t->state);
24094 +                               WARN_ON(1);
24095 +                               tasklet_unlock(t);
24096 +                               break;
24097 +                       }
24098 +               }
24099         }
24100  }
24101  
24102 +static __latent_entropy void tasklet_action(struct softirq_action *a)
24103 +{
24104 +       struct tasklet_struct *list;
24105 +
24106 +       local_irq_disable();
24107 +       list = __this_cpu_read(tasklet_vec.head);
24108 +       __this_cpu_write(tasklet_vec.head, NULL);
24109 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24110 +       local_irq_enable();
24111 +
24112 +       __tasklet_action(a, list);
24113 +}
24114 +
24115  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24116  {
24117         struct tasklet_struct *list;
24118 @@ -536,30 +1061,7 @@
24119         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24120         local_irq_enable();
24121  
24122 -       while (list) {
24123 -               struct tasklet_struct *t = list;
24124 -
24125 -               list = list->next;
24126 -
24127 -               if (tasklet_trylock(t)) {
24128 -                       if (!atomic_read(&t->count)) {
24129 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24130 -                                                       &t->state))
24131 -                                       BUG();
24132 -                               t->func(t->data);
24133 -                               tasklet_unlock(t);
24134 -                               continue;
24135 -                       }
24136 -                       tasklet_unlock(t);
24137 -               }
24138 -
24139 -               local_irq_disable();
24140 -               t->next = NULL;
24141 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
24142 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24143 -               __raise_softirq_irqoff(HI_SOFTIRQ);
24144 -               local_irq_enable();
24145 -       }
24146 +       __tasklet_action(a, list);
24147  }
24148  
24149  void tasklet_init(struct tasklet_struct *t,
24150 @@ -580,7 +1082,7 @@
24151  
24152         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24153                 do {
24154 -                       yield();
24155 +                       msleep(1);
24156                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24157         }
24158         tasklet_unlock_wait(t);
24159 @@ -588,57 +1090,6 @@
24160  }
24161  EXPORT_SYMBOL(tasklet_kill);
24162  
24163 -/*
24164 - * tasklet_hrtimer
24165 - */
24166 -
24167 -/*
24168 - * The trampoline is called when the hrtimer expires. It schedules a tasklet
24169 - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24170 - * hrtimer callback, but from softirq context.
24171 - */
24172 -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24173 -{
24174 -       struct tasklet_hrtimer *ttimer =
24175 -               container_of(timer, struct tasklet_hrtimer, timer);
24176 -
24177 -       tasklet_hi_schedule(&ttimer->tasklet);
24178 -       return HRTIMER_NORESTART;
24179 -}
24180 -
24181 -/*
24182 - * Helper function which calls the hrtimer callback from
24183 - * tasklet/softirq context
24184 - */
24185 -static void __tasklet_hrtimer_trampoline(unsigned long data)
24186 -{
24187 -       struct tasklet_hrtimer *ttimer = (void *)data;
24188 -       enum hrtimer_restart restart;
24189 -
24190 -       restart = ttimer->function(&ttimer->timer);
24191 -       if (restart != HRTIMER_NORESTART)
24192 -               hrtimer_restart(&ttimer->timer);
24193 -}
24194 -
24195 -/**
24196 - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24197 - * @ttimer:     tasklet_hrtimer which is initialized
24198 - * @function:   hrtimer callback function which gets called from softirq context
24199 - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24200 - * @mode:       hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24201 - */
24202 -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24203 -                         enum hrtimer_restart (*function)(struct hrtimer *),
24204 -                         clockid_t which_clock, enum hrtimer_mode mode)
24205 -{
24206 -       hrtimer_init(&ttimer->timer, which_clock, mode);
24207 -       ttimer->timer.function = __hrtimer_tasklet_trampoline;
24208 -       tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24209 -                    (unsigned long)ttimer);
24210 -       ttimer->function = function;
24211 -}
24212 -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24213 -
24214  void __init softirq_init(void)
24215  {
24216         int cpu;
24217 @@ -654,25 +1105,26 @@
24218         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24219  }
24220  
24221 -static int ksoftirqd_should_run(unsigned int cpu)
24222 -{
24223 -       return local_softirq_pending();
24224 -}
24225 -
24226 -static void run_ksoftirqd(unsigned int cpu)
24227 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24228 +void tasklet_unlock_wait(struct tasklet_struct *t)
24229  {
24230 -       local_irq_disable();
24231 -       if (local_softirq_pending()) {
24232 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24233                 /*
24234 -                * We can safely run softirq on inline stack, as we are not deep
24235 -                * in the task stack here.
24236 +                * Hack for now to avoid this busy-loop:
24237                  */
24238 -               __do_softirq();
24239 -               local_irq_enable();
24240 -               cond_resched_rcu_qs();
24241 -               return;
24242 +#ifdef CONFIG_PREEMPT_RT_FULL
24243 +               msleep(1);
24244 +#else
24245 +               barrier();
24246 +#endif
24247         }
24248 -       local_irq_enable();
24249 +}
24250 +EXPORT_SYMBOL(tasklet_unlock_wait);
24251 +#endif
24252 +
24253 +static int ksoftirqd_should_run(unsigned int cpu)
24254 +{
24255 +       return ksoftirqd_softirq_pending();
24256  }
24257  
24258  #ifdef CONFIG_HOTPLUG_CPU
24259 @@ -739,17 +1191,31 @@
24260  
24261  static struct smp_hotplug_thread softirq_threads = {
24262         .store                  = &ksoftirqd,
24263 +       .setup                  = ksoftirqd_set_sched_params,
24264         .thread_should_run      = ksoftirqd_should_run,
24265         .thread_fn              = run_ksoftirqd,
24266         .thread_comm            = "ksoftirqd/%u",
24267  };
24268  
24269 +#ifdef CONFIG_PREEMPT_RT_FULL
24270 +static struct smp_hotplug_thread softirq_timer_threads = {
24271 +       .store                  = &ktimer_softirqd,
24272 +       .setup                  = ktimer_softirqd_set_sched_params,
24273 +       .cleanup                = ktimer_softirqd_clr_sched_params,
24274 +       .thread_should_run      = ktimer_softirqd_should_run,
24275 +       .thread_fn              = run_ksoftirqd,
24276 +       .thread_comm            = "ktimersoftd/%u",
24277 +};
24278 +#endif
24279 +
24280  static __init int spawn_ksoftirqd(void)
24281  {
24282         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24283                                   takeover_tasklets);
24284         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24285 -
24286 +#ifdef CONFIG_PREEMPT_RT_FULL
24287 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24288 +#endif
24289         return 0;
24290  }
24291  early_initcall(spawn_ksoftirqd);
24292 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/stop_machine.c linux-4.14/kernel/stop_machine.c
24293 --- linux-4.14.orig/kernel/stop_machine.c       2018-09-05 11:03:22.000000000 +0200
24294 +++ linux-4.14/kernel/stop_machine.c    2018-09-05 11:05:07.000000000 +0200
24295 @@ -496,6 +496,8 @@
24296                 struct cpu_stop_done *done = work->done;
24297                 int ret;
24298  
24299 +               /* XXX */
24300 +
24301                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24302                 preempt_count_inc();
24303                 ret = fn(arg);
24304 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/alarmtimer.c linux-4.14/kernel/time/alarmtimer.c
24305 --- linux-4.14.orig/kernel/time/alarmtimer.c    2018-09-05 11:03:22.000000000 +0200
24306 +++ linux-4.14/kernel/time/alarmtimer.c 2018-09-05 11:05:07.000000000 +0200
24307 @@ -436,7 +436,7 @@
24308                 int ret = alarm_try_to_cancel(alarm);
24309                 if (ret >= 0)
24310                         return ret;
24311 -               cpu_relax();
24312 +               hrtimer_wait_for_timer(&alarm->timer);
24313         }
24314  }
24315  EXPORT_SYMBOL_GPL(alarm_cancel);
24316 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/hrtimer.c linux-4.14/kernel/time/hrtimer.c
24317 --- linux-4.14.orig/kernel/time/hrtimer.c       2018-09-05 11:03:22.000000000 +0200
24318 +++ linux-4.14/kernel/time/hrtimer.c    2018-09-05 11:05:07.000000000 +0200
24319 @@ -60,6 +60,15 @@
24320  #include "tick-internal.h"
24321  
24322  /*
24323 + * Masks for selecting the soft and hard context timers from
24324 + * cpu_base->active
24325 + */
24326 +#define MASK_SHIFT             (HRTIMER_BASE_MONOTONIC_SOFT)
24327 +#define HRTIMER_ACTIVE_HARD    ((1U << MASK_SHIFT) - 1)
24328 +#define HRTIMER_ACTIVE_SOFT    (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
24329 +#define HRTIMER_ACTIVE_ALL     (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
24330 +
24331 +/*
24332   * The timer bases:
24333   *
24334   * There are more clockids than hrtimer bases. Thus, we index
24335 @@ -70,7 +79,6 @@
24336  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
24337  {
24338         .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
24339 -       .seq = SEQCNT_ZERO(hrtimer_bases.seq),
24340         .clock_base =
24341         {
24342                 {
24343 @@ -93,6 +101,26 @@
24344                         .clockid = CLOCK_TAI,
24345                         .get_time = &ktime_get_clocktai,
24346                 },
24347 +               {
24348 +                       .index = HRTIMER_BASE_MONOTONIC_SOFT,
24349 +                       .clockid = CLOCK_MONOTONIC,
24350 +                       .get_time = &ktime_get,
24351 +               },
24352 +               {
24353 +                       .index = HRTIMER_BASE_REALTIME_SOFT,
24354 +                       .clockid = CLOCK_REALTIME,
24355 +                       .get_time = &ktime_get_real,
24356 +               },
24357 +               {
24358 +                       .index = HRTIMER_BASE_BOOTTIME_SOFT,
24359 +                       .clockid = CLOCK_BOOTTIME,
24360 +                       .get_time = &ktime_get_boottime,
24361 +               },
24362 +               {
24363 +                       .index = HRTIMER_BASE_TAI_SOFT,
24364 +                       .clockid = CLOCK_TAI,
24365 +                       .get_time = &ktime_get_clocktai,
24366 +               },
24367         }
24368  };
24369  
24370 @@ -118,7 +146,6 @@
24371   * timer->base->cpu_base
24372   */
24373  static struct hrtimer_cpu_base migration_cpu_base = {
24374 -       .seq = SEQCNT_ZERO(migration_cpu_base),
24375         .clock_base = { { .cpu_base = &migration_cpu_base, }, },
24376  };
24377  
24378 @@ -156,45 +183,33 @@
24379  }
24380  
24381  /*
24382 - * With HIGHRES=y we do not migrate the timer when it is expiring
24383 - * before the next event on the target cpu because we cannot reprogram
24384 - * the target cpu hardware and we would cause it to fire late.
24385 + * We do not migrate the timer when it is expiring before the next
24386 + * event on the target cpu. When high resolution is enabled, we cannot
24387 + * reprogram the target cpu hardware and we would cause it to fire
24388 + * late. To keep it simple, we handle the high resolution enabled and
24389 + * disabled cases similarly.
24390   *
24391   * Called with cpu_base->lock of target cpu held.
24392   */
24393  static int
24394  hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
24395  {
24396 -#ifdef CONFIG_HIGH_RES_TIMERS
24397         ktime_t expires;
24398  
24399 -       if (!new_base->cpu_base->hres_active)
24400 -               return 0;
24401 -
24402         expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
24403 -       return expires <= new_base->cpu_base->expires_next;
24404 -#else
24405 -       return 0;
24406 -#endif
24407 +       return expires < new_base->cpu_base->expires_next;
24408  }
24409  
24410 -#ifdef CONFIG_NO_HZ_COMMON
24411 -static inline
24412 -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24413 -                                        int pinned)
24414 -{
24415 -       if (pinned || !base->migration_enabled)
24416 -               return base;
24417 -       return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24418 -}
24419 -#else
24420  static inline
24421  struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
24422                                          int pinned)
24423  {
24424 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
24425 +       if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
24426 +               return &per_cpu(hrtimer_bases, get_nohz_timer_target());
24427 +#endif
24428         return base;
24429  }
24430 -#endif
24431  
24432  /*
24433   * We switch the timer base to a power-optimized selected CPU target,
24434 @@ -396,7 +411,8 @@
24435         debug_object_init(timer, &hrtimer_debug_descr);
24436  }
24437  
24438 -static inline void debug_hrtimer_activate(struct hrtimer *timer)
24439 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
24440 +                                         enum hrtimer_mode mode)
24441  {
24442         debug_object_activate(timer, &hrtimer_debug_descr);
24443  }
24444 @@ -429,8 +445,10 @@
24445  EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
24446  
24447  #else
24448 +
24449  static inline void debug_hrtimer_init(struct hrtimer *timer) { }
24450 -static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
24451 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
24452 +                                         enum hrtimer_mode mode) { }
24453  static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
24454  #endif
24455  
24456 @@ -442,10 +460,11 @@
24457         trace_hrtimer_init(timer, clockid, mode);
24458  }
24459  
24460 -static inline void debug_activate(struct hrtimer *timer)
24461 +static inline void debug_activate(struct hrtimer *timer,
24462 +                                 enum hrtimer_mode mode)
24463  {
24464 -       debug_hrtimer_activate(timer);
24465 -       trace_hrtimer_start(timer);
24466 +       debug_hrtimer_activate(timer, mode);
24467 +       trace_hrtimer_start(timer, mode);
24468  }
24469  
24470  static inline void debug_deactivate(struct hrtimer *timer)
24471 @@ -454,35 +473,43 @@
24472         trace_hrtimer_cancel(timer);
24473  }
24474  
24475 -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
24476 -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
24477 -                                            struct hrtimer *timer)
24478 +static struct hrtimer_clock_base *
24479 +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
24480  {
24481 -#ifdef CONFIG_HIGH_RES_TIMERS
24482 -       cpu_base->next_timer = timer;
24483 -#endif
24484 +       unsigned int idx;
24485 +
24486 +       if (!*active)
24487 +               return NULL;
24488 +
24489 +       idx = __ffs(*active);
24490 +       *active &= ~(1U << idx);
24491 +
24492 +       return &cpu_base->clock_base[idx];
24493  }
24494  
24495 -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
24496 +#define for_each_active_base(base, cpu_base, active)   \
24497 +       while ((base = __next_base((cpu_base), &(active))))
24498 +
24499 +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
24500 +                                        unsigned int active,
24501 +                                        ktime_t expires_next)
24502  {
24503 -       struct hrtimer_clock_base *base = cpu_base->clock_base;
24504 -       unsigned int active = cpu_base->active_bases;
24505 -       ktime_t expires, expires_next = KTIME_MAX;
24506 +       struct hrtimer_clock_base *base;
24507 +       ktime_t expires;
24508  
24509 -       hrtimer_update_next_timer(cpu_base, NULL);
24510 -       for (; active; base++, active >>= 1) {
24511 +       for_each_active_base(base, cpu_base, active) {
24512                 struct timerqueue_node *next;
24513                 struct hrtimer *timer;
24514  
24515 -               if (!(active & 0x01))
24516 -                       continue;
24517 -
24518                 next = timerqueue_getnext(&base->active);
24519                 timer = container_of(next, struct hrtimer, node);
24520                 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24521                 if (expires < expires_next) {
24522                         expires_next = expires;
24523 -                       hrtimer_update_next_timer(cpu_base, timer);
24524 +                       if (timer->is_soft)
24525 +                               cpu_base->softirq_next_timer = timer;
24526 +                       else
24527 +                               cpu_base->next_timer = timer;
24528                 }
24529         }
24530         /*
24531 @@ -494,7 +521,47 @@
24532                 expires_next = 0;
24533         return expires_next;
24534  }
24535 -#endif
24536 +
24537 +/*
24538 + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
24539 + * does not set cpu_base::*expires_next; that is done by hrtimer_reprogram.
24540 + *
24541 + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases;
24542 + * those timers will get run whenever the softirq gets handled, at the end of
24543 + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
24544 + *
24545 + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
24546 + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
24547 + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
24548 + *
24549 + * @active_mask must be one of:
24550 + *  - HRTIMER_ACTIVE_ALL,
24551 + *  - HRTIMER_ACTIVE_SOFT, or
24552 + *  - HRTIMER_ACTIVE_HARD.
24553 + */
24554 +static ktime_t
24555 +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
24556 +{
24557 +       unsigned int active;
24558 +       struct hrtimer *next_timer = NULL;
24559 +       ktime_t expires_next = KTIME_MAX;
24560 +
24561 +       if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
24562 +               active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
24563 +               cpu_base->softirq_next_timer = NULL;
24564 +               expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
24565 +
24566 +               next_timer = cpu_base->softirq_next_timer;
24567 +       }
24568 +
24569 +       if (active_mask & HRTIMER_ACTIVE_HARD) {
24570 +               active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
24571 +               cpu_base->next_timer = next_timer;
24572 +               expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
24573 +       }
24574 +
24575 +       return expires_next;
24576 +}
24577  
24578  static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
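Editor's note: __hrtimer_get_next_event() above computes the earliest expiry by scanning the soft clock bases first and then folding the hard bases into the same minimum. The sketch below mimics that two-pass minimum over an active bitmask; the base count, the 0x0f/0xf0 mask split and the sample expiry values are arbitrary placeholders, not the kernel's layout.

#include <stdint.h>
#include <stdio.h>

#define NR_BASES      8
#define ACTIVE_HARD   0x0fu            /* bases 0-3: hard (hardirq) context */
#define ACTIVE_SOFT   0xf0u            /* bases 4-7: softirq context */
#define EXPIRES_NONE  INT64_MAX

/* First expiry per clock base; EXPIRES_NONE means the base is empty. */
static int64_t first_expiry[NR_BASES];

/* Scan the bases selected by 'active' and fold in the earliest expiry,
 * mirroring __hrtimer_next_event_base(). */
static int64_t next_event_base(unsigned int active, int64_t expires_next)
{
        while (active) {
                unsigned int idx = __builtin_ctz(active);

                active &= ~(1u << idx);
                if (first_expiry[idx] < expires_next)
                        expires_next = first_expiry[idx];
        }
        return expires_next;
}

/* Earliest event over the requested mask, soft bases first, then hard. */
static int64_t next_event(unsigned int active_bases, unsigned int mask)
{
        int64_t expires = EXPIRES_NONE;

        if (mask & ACTIVE_SOFT)
                expires = next_event_base(active_bases & ACTIVE_SOFT, expires);
        if (mask & ACTIVE_HARD)
                expires = next_event_base(active_bases & ACTIVE_HARD, expires);
        return expires;
}

int main(void)
{
        for (int i = 0; i < NR_BASES; i++)
                first_expiry[i] = EXPIRES_NONE;
        first_expiry[1] = 5000;                 /* a hard base */
        first_expiry[5] = 3000;                 /* a soft base */

        printf("next hard-only event: %lld\n",
               (long long)next_event(0x22u, ACTIVE_HARD));
        printf("next overall event:   %lld\n",
               (long long)next_event(0x22u, ACTIVE_SOFT | ACTIVE_HARD));
        return 0;
}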
24579  {
24580 @@ -502,36 +569,14 @@
24581         ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
24582         ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
24583  
24584 -       return ktime_get_update_offsets_now(&base->clock_was_set_seq,
24585 +       ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
24586                                             offs_real, offs_boot, offs_tai);
24587 -}
24588 -
24589 -/* High resolution timer related functions */
24590 -#ifdef CONFIG_HIGH_RES_TIMERS
24591 -
24592 -/*
24593 - * High resolution timer enabled ?
24594 - */
24595 -static bool hrtimer_hres_enabled __read_mostly  = true;
24596 -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24597 -EXPORT_SYMBOL_GPL(hrtimer_resolution);
24598 -
24599 -/*
24600 - * Enable / Disable high resolution mode
24601 - */
24602 -static int __init setup_hrtimer_hres(char *str)
24603 -{
24604 -       return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
24605 -}
24606  
24607 -__setup("highres=", setup_hrtimer_hres);
24608 +       base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
24609 +       base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
24610 +       base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
24611  
24612 -/*
24613 - * hrtimer_high_res_enabled - query, if the highres mode is enabled
24614 - */
24615 -static inline int hrtimer_is_hres_enabled(void)
24616 -{
24617 -       return hrtimer_hres_enabled;
24618 +       return now;
24619  }
24620  
24621  /*
24622 @@ -539,7 +584,8 @@
24623   */
24624  static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
24625  {
24626 -       return cpu_base->hres_active;
24627 +       return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
24628 +               cpu_base->hres_active : 0;
24629  }
24630  
24631  static inline int hrtimer_hres_active(void)
24632 @@ -557,10 +603,23 @@
24633  {
24634         ktime_t expires_next;
24635  
24636 -       if (!cpu_base->hres_active)
24637 -               return;
24638 +       /*
24639 +        * Find the current next expiration time.
24640 +        */
24641 +       expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
24642  
24643 -       expires_next = __hrtimer_get_next_event(cpu_base);
24644 +       if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
24645 +               /*
24646 +                * When the softirq is activated, hrtimer has to be
24647 +                * When the softirq is activated, the hrtimer has to be
24648 +                * programmed with the first hard hrtimer because the soft
24649 +                */
24650 +               if (cpu_base->softirq_activated)
24651 +                       expires_next = __hrtimer_get_next_event(cpu_base,
24652 +                                                               HRTIMER_ACTIVE_HARD);
24653 +               else
24654 +                       cpu_base->softirq_expires_next = expires_next;
24655 +       }
24656  
24657         if (skip_equal && expires_next == cpu_base->expires_next)
24658                 return;
24659 @@ -568,6 +627,9 @@
24660         cpu_base->expires_next = expires_next;
24661  
24662         /*
24663 +        * If hres is not active, the hardware does not have to be
24664 +        * reprogrammed yet.
24665 +        *
24666          * If a hang was detected in the last timer interrupt then we
24667          * leave the hang delay active in the hardware. We want the
24668          * system to make progress. That also prevents the following
24669 @@ -581,83 +643,38 @@
24670          * set. So we'd effectively block all timers until the T2 event
24671          * fires.
24672          */
24673 -       if (cpu_base->hang_detected)
24674 +       if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24675                 return;
24676  
24677         tick_program_event(cpu_base->expires_next, 1);
24678  }
24679  
24680 +/* High resolution timer related functions */
24681 +#ifdef CONFIG_HIGH_RES_TIMERS
24682 +
24683  /*
24684 - * When a timer is enqueued and expires earlier than the already enqueued
24685 - * timers, we have to check, whether it expires earlier than the timer for
24686 - * which the clock event device was armed.
24687 - *
24688 - * Called with interrupts disabled and base->cpu_base.lock held
24689 + * High resolution timer enabled ?
24690   */
24691 -static void hrtimer_reprogram(struct hrtimer *timer,
24692 -                             struct hrtimer_clock_base *base)
24693 -{
24694 -       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24695 -       ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24696 -
24697 -       WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24698 -
24699 -       /*
24700 -        * If the timer is not on the current cpu, we cannot reprogram
24701 -        * the other cpus clock event device.
24702 -        */
24703 -       if (base->cpu_base != cpu_base)
24704 -               return;
24705 -
24706 -       /*
24707 -        * If the hrtimer interrupt is running, then it will
24708 -        * reevaluate the clock bases and reprogram the clock event
24709 -        * device. The callbacks are always executed in hard interrupt
24710 -        * context so we don't need an extra check for a running
24711 -        * callback.
24712 -        */
24713 -       if (cpu_base->in_hrtirq)
24714 -               return;
24715 -
24716 -       /*
24717 -        * CLOCK_REALTIME timer might be requested with an absolute
24718 -        * expiry time which is less than base->offset. Set it to 0.
24719 -        */
24720 -       if (expires < 0)
24721 -               expires = 0;
24722 -
24723 -       if (expires >= cpu_base->expires_next)
24724 -               return;
24725 -
24726 -       /* Update the pointer to the next expiring timer */
24727 -       cpu_base->next_timer = timer;
24728 -
24729 -       /*
24730 -        * If a hang was detected in the last timer interrupt then we
24731 -        * do not schedule a timer which is earlier than the expiry
24732 -        * which we enforced in the hang detection. We want the system
24733 -        * to make progress.
24734 -        */
24735 -       if (cpu_base->hang_detected)
24736 -               return;
24737 +static bool hrtimer_hres_enabled __read_mostly  = true;
24738 +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
24739 +EXPORT_SYMBOL_GPL(hrtimer_resolution);
24740  
24741 -       /*
24742 -        * Program the timer hardware. We enforce the expiry for
24743 -        * events which are already in the past.
24744 -        */
24745 -       cpu_base->expires_next = expires;
24746 -       tick_program_event(expires, 1);
24747 +/*
24748 + * Enable / Disable high resolution mode
24749 + */
24750 +static int __init setup_hrtimer_hres(char *str)
24751 +{
24752 +       return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
24753  }
24754  
24755 +__setup("highres=", setup_hrtimer_hres);
24756 +
24757  /*
24758 - * Initialize the high resolution related parts of cpu_base
24759 + * hrtimer_high_res_enabled - query, if the highres mode is enabled
24760   */
24761 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
24762 +static inline int hrtimer_is_hres_enabled(void)
24763  {
24764 -       base->expires_next = KTIME_MAX;
24765 -       base->hang_detected = 0;
24766 -       base->hres_active = 0;
24767 -       base->next_timer = NULL;
24768 +       return hrtimer_hres_enabled;
24769  }
24770  
24771  /*
24772 @@ -669,7 +686,7 @@
24773  {
24774         struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
24775  
24776 -       if (!base->hres_active)
24777 +       if (!__hrtimer_hres_active(base))
24778                 return;
24779  
24780         raw_spin_lock(&base->lock);
24781 @@ -698,6 +715,29 @@
24782         retrigger_next_event(NULL);
24783  }
24784  
24785 +#ifdef CONFIG_PREEMPT_RT_FULL
24786 +
24787 +static struct swork_event clock_set_delay_work;
24788 +
24789 +static void run_clock_set_delay(struct swork_event *event)
24790 +{
24791 +       clock_was_set();
24792 +}
24793 +
24794 +void clock_was_set_delayed(void)
24795 +{
24796 +       swork_queue(&clock_set_delay_work);
24797 +}
24798 +
24799 +static __init int create_clock_set_delay_thread(void)
24800 +{
24801 +       WARN_ON(swork_get());
24802 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
24803 +       return 0;
24804 +}
24805 +early_initcall(create_clock_set_delay_thread);
24806 +#else /* PREEMPT_RT_FULL */
24807 +
24808  static void clock_was_set_work(struct work_struct *work)
24809  {
24810         clock_was_set();
24811 @@ -713,26 +753,106 @@
24812  {
24813         schedule_work(&hrtimer_work);
24814  }
24815 +#endif
24816  
24817  #else
24818  
24819 -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
24820 -static inline int hrtimer_hres_active(void) { return 0; }
24821  static inline int hrtimer_is_hres_enabled(void) { return 0; }
24822  static inline void hrtimer_switch_to_hres(void) { }
24823 -static inline void
24824 -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
24825 -static inline int hrtimer_reprogram(struct hrtimer *timer,
24826 -                                   struct hrtimer_clock_base *base)
24827 -{
24828 -       return 0;
24829 -}
24830 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
24831  static inline void retrigger_next_event(void *arg) { }
24832  
24833  #endif /* CONFIG_HIGH_RES_TIMERS */
24834  
24835  /*
24836 + * When a timer is enqueued and expires earlier than the already enqueued
24837 + * timers, we have to check whether it expires earlier than the timer for
24838 + * which the clock event device was armed.
24839 + *
24840 + * Called with interrupts disabled and base->cpu_base.lock held
24841 + */
24842 +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
24843 +{
24844 +       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
24845 +       struct hrtimer_clock_base *base = timer->base;
24846 +       ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
24847 +
24848 +       WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
24849 +
24850 +       /*
24851 +        * CLOCK_REALTIME timer might be requested with an absolute
24852 +        * expiry time which is less than base->offset. Set it to 0.
24853 +        */
24854 +       if (expires < 0)
24855 +               expires = 0;
24856 +
24857 +       if (timer->is_soft) {
24858 +               /*
24859 +                * soft hrtimer could be started on a remote CPU. In this
24860 +                * case softirq_expires_next needs to be updated on the
24861 +                * remote CPU. The soft hrtimer will not expire before the
24862 +                * first hard hrtimer on the remote CPU -
24863 +                * hrtimer_check_target() prevents this case.
24864 +                */
24865 +               struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
24866 +
24867 +               if (timer_cpu_base->softirq_activated)
24868 +                       return;
24869 +
24870 +               if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
24871 +                       return;
24872 +
24873 +               timer_cpu_base->softirq_next_timer = timer;
24874 +               timer_cpu_base->softirq_expires_next = expires;
24875 +
24876 +               if (!ktime_before(expires, timer_cpu_base->expires_next) ||
24877 +                   !reprogram)
24878 +                       return;
24879 +       }
24880 +
24881 +       /*
24882 +        * If the timer is not on the current cpu, we cannot reprogram
24883 +        * the other CPU's clock event device.
24884 +        */
24885 +       if (base->cpu_base != cpu_base)
24886 +               return;
24887 +
24888 +       /*
24889 +        * If the hrtimer interrupt is running, then it will
24890 +        * reevaluate the clock bases and reprogram the clock event
24891 +        * device. The callbacks are always executed in hard interrupt
24892 +        * context so we don't need an extra check for a running
24893 +        * callback.
24894 +        */
24895 +       if (cpu_base->in_hrtirq)
24896 +               return;
24897 +
24898 +       if (expires >= cpu_base->expires_next)
24899 +               return;
24900 +
24901 +       /* Update the pointer to the next expiring timer */
24902 +       cpu_base->next_timer = timer;
24903 +       cpu_base->expires_next = expires;
24904 +
24905 +       /*
24906 +        * If hres is not active, the hardware does not have to be
24907 +        * programmed yet.
24908 +        *
24909 +        * If a hang was detected in the last timer interrupt then we
24910 +        * do not schedule a timer which is earlier than the expiry
24911 +        * which we enforced in the hang detection. We want the system
24912 +        * to make progress.
24913 +        */
24914 +       if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
24915 +               return;
24916 +
24917 +       /*
24918 +        * Program the timer hardware. We enforce the expiry for
24919 +        * events which are already in the past.
24920 +        */
24921 +       tick_program_event(expires, 1);
24922 +}
24923 +
24924 +/*
24925   * Clock realtime was set
24926   *
24927   * Change the offset of the realtime clock vs. the monotonic
24928 @@ -830,6 +950,33 @@
24929  }
24930  EXPORT_SYMBOL_GPL(hrtimer_forward);
24931  
24932 +#ifdef CONFIG_PREEMPT_RT_BASE
24933 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
24934 +
24935 +/**
24936 + * hrtimer_wait_for_timer - Wait for a running timer
24937 + *
24938 + * @timer:     timer to wait for
24939 + *
24940 + * The function waits, in case the timer's callback function is
24941 + * currently executing, on the waitqueue of the timer base. The
24942 + * waitqueue is woken up after the timer callback function has
24943 + * finished execution.
24944 + */
24945 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
24946 +{
24947 +       struct hrtimer_clock_base *base = timer->base;
24948 +
24949 +       if (base && base->cpu_base &&
24950 +           base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
24951 +               wait_event(base->cpu_base->wait,
24952 +                               !(hrtimer_callback_running(timer)));
24953 +}
24954 +
24955 +#else
24956 +# define wake_up_timer_waiters(b)      do { } while (0)
24957 +#endif
24958 +
24959  /*
24960   * enqueue_hrtimer - internal function to (re)start a timer
24961   *
24962 @@ -839,9 +986,10 @@
24963   * Returns 1 when the new timer is the leftmost timer in the tree.
24964   */
24965  static int enqueue_hrtimer(struct hrtimer *timer,
24966 -                          struct hrtimer_clock_base *base)
24967 +                          struct hrtimer_clock_base *base,
24968 +                          enum hrtimer_mode mode)
24969  {
24970 -       debug_activate(timer);
24971 +       debug_activate(timer, mode);
24972  
24973         base->cpu_base->active_bases |= 1 << base->index;
24974  
24975 @@ -874,7 +1022,6 @@
24976         if (!timerqueue_del(&base->active, &timer->node))
24977                 cpu_base->active_bases &= ~(1 << base->index);
24978  
24979 -#ifdef CONFIG_HIGH_RES_TIMERS
24980         /*
24981          * Note: If reprogram is false we do not update
24982          * cpu_base->next_timer. This happens when we remove the first
24983 @@ -885,7 +1032,6 @@
24984          */
24985         if (reprogram && timer == cpu_base->next_timer)
24986                 hrtimer_force_reprogram(cpu_base, 1);
24987 -#endif
24988  }
24989  
24990  /*
24991 @@ -934,22 +1080,36 @@
24992         return tim;
24993  }
24994  
24995 -/**
24996 - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
24997 - * @timer:     the timer to be added
24998 - * @tim:       expiry time
24999 - * @delta_ns:  "slack" range for the timer
25000 - * @mode:      expiry mode: absolute (HRTIMER_MODE_ABS) or
25001 - *             relative (HRTIMER_MODE_REL)
25002 - */
25003 -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25004 -                           u64 delta_ns, const enum hrtimer_mode mode)
25005 +static void
25006 +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
25007  {
25008 -       struct hrtimer_clock_base *base, *new_base;
25009 -       unsigned long flags;
25010 -       int leftmost;
25011 +       ktime_t expires;
25012  
25013 -       base = lock_hrtimer_base(timer, &flags);
25014 +       /*
25015 +        * Find the next SOFT expiration.
25016 +        */
25017 +       expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
25018 +
25019 +       /*
25020 +        * Reprogramming needs to be triggered, even if the next soft
25021 +        * hrtimer expires at the same time as the next hard
25022 +        * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25023 +        */
25024 +       if (expires == KTIME_MAX)
25025 +               return;
25026 +
25027 +       /*
25028 +        * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25029 +        * cpu_base->*expires_next is only set by hrtimer_reprogram()
25030 +        */
25031 +       hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
25032 +}
25033 +
25034 +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25035 +                                   u64 delta_ns, const enum hrtimer_mode mode,
25036 +                                   struct hrtimer_clock_base *base)
25037 +{
25038 +       struct hrtimer_clock_base *new_base;
25039  
25040         /* Remove an active timer from the queue: */
25041         remove_hrtimer(timer, base, true);
25042 @@ -964,21 +1124,37 @@
25043         /* Switch the timer base, if necessary: */
25044         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25045  
25046 -       leftmost = enqueue_hrtimer(timer, new_base);
25047 -       if (!leftmost)
25048 -               goto unlock;
25049 +       return enqueue_hrtimer(timer, new_base, mode);
25050 +}
25051 +
25052 +/**
25053 + * hrtimer_start_range_ns - (re)start an hrtimer
25054 + * @timer:     the timer to be added
25055 + * @tim:       expiry time
25056 + * @delta_ns:  "slack" range for the timer
25057 + * @mode:      timer mode: absolute (HRTIMER_MODE_ABS) or
25058 + *             relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25059 + *             softirq-based mode is considered for debugging purposes only!
25060 + */
25061 +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25062 +                           u64 delta_ns, const enum hrtimer_mode mode)
25063 +{
25064 +       struct hrtimer_clock_base *base;
25065 +       unsigned long flags;
25066 +
25067 +       /*
25068 +        * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25069 +        * match.
25070 +        */
25071 +#ifndef CONFIG_PREEMPT_RT_BASE
25072 +       WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
25073 +#endif
25074 +
25075 +       base = lock_hrtimer_base(timer, &flags);
25076 +
25077 +       if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25078 +               hrtimer_reprogram(timer, true);
25079  
25080 -       if (!hrtimer_is_hres_active(timer)) {
25081 -               /*
25082 -                * Kick to reschedule the next tick to handle the new timer
25083 -                * on dynticks target.
25084 -                */
25085 -               if (new_base->cpu_base->nohz_active)
25086 -                       wake_up_nohz_cpu(new_base->cpu_base->cpu);
25087 -       } else {
25088 -               hrtimer_reprogram(timer, new_base);
25089 -       }
25090 -unlock:
25091         unlock_hrtimer_base(timer, &flags);
25092  }
25093  EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
25094 @@ -1035,7 +1211,7 @@
25095  
25096                 if (ret >= 0)
25097                         return ret;
25098 -               cpu_relax();
25099 +               hrtimer_wait_for_timer(timer);
25100         }
25101  }
25102  EXPORT_SYMBOL_GPL(hrtimer_cancel);
25103 @@ -1076,7 +1252,7 @@
25104         raw_spin_lock_irqsave(&cpu_base->lock, flags);
25105  
25106         if (!__hrtimer_hres_active(cpu_base))
25107 -               expires = __hrtimer_get_next_event(cpu_base);
25108 +               expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25109  
25110         raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25111  
25112 @@ -1099,8 +1275,16 @@
25113  static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25114                            enum hrtimer_mode mode)
25115  {
25116 -       struct hrtimer_cpu_base *cpu_base;
25117 +       bool softtimer;
25118         int base;
25119 +       struct hrtimer_cpu_base *cpu_base;
25120 +
25121 +       softtimer = !!(mode & HRTIMER_MODE_SOFT);
25122 +#ifdef CONFIG_PREEMPT_RT_FULL
25123 +       if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25124 +               softtimer = true;
25125 +#endif
25126 +       base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25127  
25128         memset(timer, 0, sizeof(struct hrtimer));
25129  
25130 @@ -1114,7 +1298,8 @@
25131         if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25132                 clock_id = CLOCK_MONOTONIC;
25133  
25134 -       base = hrtimer_clockid_to_base(clock_id);
25135 +       base += hrtimer_clockid_to_base(clock_id);
25136 +       timer->is_soft = softtimer;
25137         timer->base = &cpu_base->clock_base[base];
25138         timerqueue_init(&timer->node);
25139  }
25140 @@ -1123,7 +1308,13 @@
25141   * hrtimer_init - initialize a timer to the given clock
25142   * @timer:     the timer to be initialized
25143   * @clock_id:  the clock to be used
25144 - * @mode:      timer mode abs/rel
25145 + * @mode:       The modes which are relevant for initialization:
25146 + *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25147 + *              HRTIMER_MODE_REL_SOFT
25148 + *
25149 + *              The PINNED variants of the above can be handed in,
25150 + *              but the PINNED bit is ignored as pinning happens
25151 + *              when the hrtimer is started
25152   */
25153  void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25154                   enum hrtimer_mode mode)
25155 @@ -1142,19 +1333,19 @@
25156   */
25157  bool hrtimer_active(const struct hrtimer *timer)
25158  {
25159 -       struct hrtimer_cpu_base *cpu_base;
25160 +       struct hrtimer_clock_base *base;
25161         unsigned int seq;
25162  
25163         do {
25164 -               cpu_base = READ_ONCE(timer->base->cpu_base);
25165 -               seq = raw_read_seqcount_begin(&cpu_base->seq);
25166 +               base = READ_ONCE(timer->base);
25167 +               seq = raw_read_seqcount_begin(&base->seq);
25168  
25169                 if (timer->state != HRTIMER_STATE_INACTIVE ||
25170 -                   cpu_base->running == timer)
25171 +                   base->running == timer)
25172                         return true;
25173  
25174 -       } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25175 -                cpu_base != READ_ONCE(timer->base->cpu_base));
25176 +       } while (read_seqcount_retry(&base->seq, seq) ||
25177 +                base != READ_ONCE(timer->base));
25178  
25179         return false;
25180  }
25181 @@ -1180,7 +1371,8 @@
25182  
25183  static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25184                           struct hrtimer_clock_base *base,
25185 -                         struct hrtimer *timer, ktime_t *now)
25186 +                         struct hrtimer *timer, ktime_t *now,
25187 +                         unsigned long flags)
25188  {
25189         enum hrtimer_restart (*fn)(struct hrtimer *);
25190         int restart;
25191 @@ -1188,16 +1380,16 @@
25192         lockdep_assert_held(&cpu_base->lock);
25193  
25194         debug_deactivate(timer);
25195 -       cpu_base->running = timer;
25196 +       base->running = timer;
25197  
25198         /*
25199          * Separate the ->running assignment from the ->state assignment.
25200          *
25201          * As with a regular write barrier, this ensures the read side in
25202 -        * hrtimer_active() cannot observe cpu_base->running == NULL &&
25203 +        * hrtimer_active() cannot observe base->running == NULL &&
25204          * timer->state == INACTIVE.
25205          */
25206 -       raw_write_seqcount_barrier(&cpu_base->seq);
25207 +       raw_write_seqcount_barrier(&base->seq);
25208  
25209         __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25210         fn = timer->function;
25211 @@ -1211,15 +1403,15 @@
25212                 timer->is_rel = false;
25213  
25214         /*
25215 -        * Because we run timers from hardirq context, there is no chance
25216 -        * they get migrated to another cpu, therefore its safe to unlock
25217 -        * the timer base.
25218 +        * The timer is marked as running in the cpu base, so it is
25219 +        * protected against migration to a different CPU even if the lock
25220 +        * is dropped.
25221          */
25222 -       raw_spin_unlock(&cpu_base->lock);
25223 +       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25224         trace_hrtimer_expire_entry(timer, now);
25225         restart = fn(timer);
25226         trace_hrtimer_expire_exit(timer);
25227 -       raw_spin_lock(&cpu_base->lock);
25228 +       raw_spin_lock_irq(&cpu_base->lock);
25229  
25230         /*
25231          * Note: We clear the running state after enqueue_hrtimer and
25232 @@ -1232,33 +1424,31 @@
25233          */
25234         if (restart != HRTIMER_NORESTART &&
25235             !(timer->state & HRTIMER_STATE_ENQUEUED))
25236 -               enqueue_hrtimer(timer, base);
25237 +               enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25238  
25239         /*
25240          * Separate the ->running assignment from the ->state assignment.
25241          *
25242          * As with a regular write barrier, this ensures the read side in
25243 -        * hrtimer_active() cannot observe cpu_base->running == NULL &&
25244 +        * hrtimer_active() cannot observe base->running.timer == NULL &&
25245          * timer->state == INACTIVE.
25246          */
25247 -       raw_write_seqcount_barrier(&cpu_base->seq);
25248 +       raw_write_seqcount_barrier(&base->seq);
25249  
25250 -       WARN_ON_ONCE(cpu_base->running != timer);
25251 -       cpu_base->running = NULL;
25252 +       WARN_ON_ONCE(base->running != timer);
25253 +       base->running = NULL;
25254  }
25255  
25256 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25257 +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25258 +                                unsigned long flags, unsigned int active_mask)
25259  {
25260 -       struct hrtimer_clock_base *base = cpu_base->clock_base;
25261 -       unsigned int active = cpu_base->active_bases;
25262 +       struct hrtimer_clock_base *base;
25263 +       unsigned int active = cpu_base->active_bases & active_mask;
25264  
25265 -       for (; active; base++, active >>= 1) {
25266 +       for_each_active_base(base, cpu_base, active) {
25267                 struct timerqueue_node *node;
25268                 ktime_t basenow;
25269  
25270 -               if (!(active & 0x01))
25271 -                       continue;
25272 -
25273                 basenow = ktime_add(now, base->offset);
25274  
25275                 while ((node = timerqueue_getnext(&base->active))) {
25276 @@ -1281,11 +1471,29 @@
25277                         if (basenow < hrtimer_get_softexpires_tv64(timer))
25278                                 break;
25279  
25280 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
25281 +                       __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25282                 }
25283         }
25284  }
25285  
25286 +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
25287 +{
25288 +       struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25289 +       unsigned long flags;
25290 +       ktime_t now;
25291 +
25292 +       raw_spin_lock_irqsave(&cpu_base->lock, flags);
25293 +
25294 +       now = hrtimer_update_base(cpu_base);
25295 +       __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25296 +
25297 +       cpu_base->softirq_activated = 0;
25298 +       hrtimer_update_softirq_timer(cpu_base, true);
25299 +
25300 +       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25301 +       wake_up_timer_waiters(cpu_base);
25302 +}
25303 +
25304  #ifdef CONFIG_HIGH_RES_TIMERS
25305  
25306  /*
25307 @@ -1296,13 +1504,14 @@
25308  {
25309         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25310         ktime_t expires_next, now, entry_time, delta;
25311 +       unsigned long flags;
25312         int retries = 0;
25313  
25314         BUG_ON(!cpu_base->hres_active);
25315         cpu_base->nr_events++;
25316         dev->next_event = KTIME_MAX;
25317  
25318 -       raw_spin_lock(&cpu_base->lock);
25319 +       raw_spin_lock_irqsave(&cpu_base->lock, flags);
25320         entry_time = now = hrtimer_update_base(cpu_base);
25321  retry:
25322         cpu_base->in_hrtirq = 1;
25323 @@ -1315,17 +1524,23 @@
25324          */
25325         cpu_base->expires_next = KTIME_MAX;
25326  
25327 -       __hrtimer_run_queues(cpu_base, now);
25328 +       if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25329 +               cpu_base->softirq_expires_next = KTIME_MAX;
25330 +               cpu_base->softirq_activated = 1;
25331 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25332 +       }
25333 +
25334 +       __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
25335  
25336         /* Reevaluate the clock bases for the next expiry */
25337 -       expires_next = __hrtimer_get_next_event(cpu_base);
25338 +       expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25339         /*
25340          * Store the new expiry value so the migration code can verify
25341          * against it.
25342          */
25343         cpu_base->expires_next = expires_next;
25344         cpu_base->in_hrtirq = 0;
25345 -       raw_spin_unlock(&cpu_base->lock);
25346 +       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25347  
25348         /* Reprogramming necessary ? */
25349         if (!tick_program_event(expires_next, 0)) {
25350 @@ -1346,7 +1561,7 @@
25351          * Acquire base lock for updating the offsets and retrieving
25352          * the current time.
25353          */
25354 -       raw_spin_lock(&cpu_base->lock);
25355 +       raw_spin_lock_irqsave(&cpu_base->lock, flags);
25356         now = hrtimer_update_base(cpu_base);
25357         cpu_base->nr_retries++;
25358         if (++retries < 3)
25359 @@ -1359,7 +1574,8 @@
25360          */
25361         cpu_base->nr_hangs++;
25362         cpu_base->hang_detected = 1;
25363 -       raw_spin_unlock(&cpu_base->lock);
25364 +       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25365 +
25366         delta = ktime_sub(now, entry_time);
25367         if ((unsigned int)delta > cpu_base->max_hang_time)
25368                 cpu_base->max_hang_time = (unsigned int) delta;
25369 @@ -1401,6 +1617,7 @@
25370  void hrtimer_run_queues(void)
25371  {
25372         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25373 +       unsigned long flags;
25374         ktime_t now;
25375  
25376         if (__hrtimer_hres_active(cpu_base))
25377 @@ -1418,10 +1635,17 @@
25378                 return;
25379         }
25380  
25381 -       raw_spin_lock(&cpu_base->lock);
25382 +       raw_spin_lock_irqsave(&cpu_base->lock, flags);
25383         now = hrtimer_update_base(cpu_base);
25384 -       __hrtimer_run_queues(cpu_base, now);
25385 -       raw_spin_unlock(&cpu_base->lock);
25386 +
25387 +       if (!ktime_before(now, cpu_base->softirq_expires_next)) {
25388 +               cpu_base->softirq_expires_next = KTIME_MAX;
25389 +               cpu_base->softirq_activated = 1;
25390 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25391 +       }
25392 +
25393 +       __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
25394 +       raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25395  }
25396  
25397  /*
25398 @@ -1440,13 +1664,65 @@
25399         return HRTIMER_NORESTART;
25400  }
25401  
25402 -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25403 +#ifdef CONFIG_PREEMPT_RT_FULL
25404 +static bool task_is_realtime(struct task_struct *tsk)
25405  {
25406 +       int policy = tsk->policy;
25407 +
25408 +       if (policy == SCHED_FIFO || policy == SCHED_RR)
25409 +               return true;
25410 +       if (policy == SCHED_DEADLINE)
25411 +               return true;
25412 +       return false;
25413 +}
25414 +#endif
25415 +
25416 +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
25417 +                                  clockid_t clock_id,
25418 +                                  enum hrtimer_mode mode,
25419 +                                  struct task_struct *task)
25420 +{
25421 +#ifdef CONFIG_PREEMPT_RT_FULL
25422 +       if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
25423 +               if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
25424 +                       mode |= HRTIMER_MODE_HARD;
25425 +               else
25426 +                       mode |= HRTIMER_MODE_SOFT;
25427 +       }
25428 +#endif
25429 +       __hrtimer_init(&sl->timer, clock_id, mode);
25430         sl->timer.function = hrtimer_wakeup;
25431         sl->task = task;
25432  }
25433 +
25434 +/**
25435 + * hrtimer_init_sleeper - initialize sleeper to the given clock
25436 + * @sl:                sleeper to be initialized
25437 + * @clock_id:  the clock to be used
25438 + * @mode:      timer mode abs/rel
25439 + * @task:      the task to wake up
25440 + */
25441 +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
25442 +                         enum hrtimer_mode mode, struct task_struct *task)
25443 +{
25444 +       debug_init(&sl->timer, clock_id, mode);
25445 +       __hrtimer_init_sleeper(sl, clock_id, mode, task);
25446 +
25447 +}
25448  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25449  
25450 +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
25451 +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
25452 +                                  clockid_t clock_id,
25453 +                                  enum hrtimer_mode mode,
25454 +                                  struct task_struct *task)
25455 +{
25456 +       debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
25457 +       __hrtimer_init_sleeper(sl, clock_id, mode, task);
25458 +}
25459 +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
25460 +#endif
25461 +
25462  int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
25463  {
25464         switch(restart->nanosleep.type) {
25465 @@ -1470,8 +1746,6 @@
25466  {
25467         struct restart_block *restart;
25468  
25469 -       hrtimer_init_sleeper(t, current);
25470 -
25471         do {
25472                 set_current_state(TASK_INTERRUPTIBLE);
25473                 hrtimer_start_expires(&t->timer, mode);
25474 @@ -1508,10 +1782,9 @@
25475         struct hrtimer_sleeper t;
25476         int ret;
25477  
25478 -       hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
25479 -                               HRTIMER_MODE_ABS);
25480 +       hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
25481 +                                     HRTIMER_MODE_ABS, current);
25482         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25483 -
25484         ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
25485         destroy_hrtimer_on_stack(&t.timer);
25486         return ret;
25487 @@ -1529,7 +1802,7 @@
25488         if (dl_task(current) || rt_task(current))
25489                 slack = 0;
25490  
25491 -       hrtimer_init_on_stack(&t.timer, clockid, mode);
25492 +       hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
25493         hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
25494         ret = do_nanosleep(&t, mode);
25495         if (ret != -ERESTART_RESTARTBLOCK)
25496 @@ -1585,6 +1858,27 @@
25497  }
25498  #endif
25499  
25500 +#ifdef CONFIG_PREEMPT_RT_FULL
25501 +/*
25502 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25503 + */
25504 +void cpu_chill(void)
25505 +{
25506 +       ktime_t chill_time;
25507 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25508 +
25509 +       chill_time = ktime_set(0, NSEC_PER_MSEC);
25510 +       set_current_state(TASK_UNINTERRUPTIBLE);
25511 +       current->flags |= PF_NOFREEZE;
25512 +       sleeping_lock_inc();
25513 +       schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
25514 +       sleeping_lock_dec();
25515 +       if (!freeze_flag)
25516 +               current->flags &= ~PF_NOFREEZE;
25517 +}
25518 +EXPORT_SYMBOL(cpu_chill);
25519 +#endif
25520 +
25521  /*
25522   * Functions related to boot-time initialization:
25523   */
25524 @@ -1598,9 +1892,17 @@
25525                 timerqueue_init_head(&cpu_base->clock_base[i].active);
25526         }
25527  
25528 -       cpu_base->active_bases = 0;
25529         cpu_base->cpu = cpu;
25530 -       hrtimer_init_hres(cpu_base);
25531 +       cpu_base->active_bases = 0;
25532 +       cpu_base->hres_active = 0;
25533 +       cpu_base->hang_detected = 0;
25534 +       cpu_base->next_timer = NULL;
25535 +       cpu_base->softirq_next_timer = NULL;
25536 +       cpu_base->expires_next = KTIME_MAX;
25537 +       cpu_base->softirq_expires_next = KTIME_MAX;
25538 +#ifdef CONFIG_PREEMPT_RT_BASE
25539 +       init_waitqueue_head(&cpu_base->wait);
25540 +#endif
25541         return 0;
25542  }
25543  
25544 @@ -1632,7 +1934,7 @@
25545                  * sort out already expired timers and reprogram the
25546                  * event device.
25547                  */
25548 -               enqueue_hrtimer(timer, new_base);
25549 +               enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
25550         }
25551  }
25552  
25553 @@ -1644,6 +1946,12 @@
25554         BUG_ON(cpu_online(scpu));
25555         tick_cancel_sched_timer(scpu);
25556  
25557 +       /*
25558 +        * This BH disable ensures that raise_softirq_irqoff() does
25559 +        * not wake up ksoftirqd (and acquire the pi-lock) while
25560 +        * holding the cpu_base lock.
25561 +        */
25562 +       local_bh_disable();
25563         local_irq_disable();
25564         old_base = &per_cpu(hrtimer_bases, scpu);
25565         new_base = this_cpu_ptr(&hrtimer_bases);
25566 @@ -1659,12 +1967,19 @@
25567                                      &new_base->clock_base[i]);
25568         }
25569  
25570 +       /*
25571 +        * The migration might have changed the first expiring softirq
25572 +        * timer on this CPU. Update it.
25573 +        */
25574 +       hrtimer_update_softirq_timer(new_base, false);
25575 +
25576         raw_spin_unlock(&old_base->lock);
25577         raw_spin_unlock(&new_base->lock);
25578  
25579         /* Check, if we got expired work to do */
25580         __hrtimer_peek_ahead_timers();
25581         local_irq_enable();
25582 +       local_bh_enable();
25583         return 0;
25584  }
25585  
25586 @@ -1673,18 +1988,19 @@
25587  void __init hrtimers_init(void)
25588  {
25589         hrtimers_prepare_cpu(smp_processor_id());
25590 +       open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
25591  }
25592  
25593  /**
25594   * schedule_hrtimeout_range_clock - sleep until timeout
25595   * @expires:   timeout value (ktime_t)
25596   * @delta:     slack in expires timeout (ktime_t)
25597 - * @mode:      timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25598 - * @clock:     timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
25599 + * @mode:      timer mode
25600 + * @clock_id:  timer clock to be used
25601   */
25602  int __sched
25603  schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
25604 -                              const enum hrtimer_mode mode, int clock)
25605 +                              const enum hrtimer_mode mode, clockid_t clock_id)
25606  {
25607         struct hrtimer_sleeper t;
25608  
25609 @@ -1705,11 +2021,9 @@
25610                 return -EINTR;
25611         }
25612  
25613 -       hrtimer_init_on_stack(&t.timer, clock, mode);
25614 +       hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
25615         hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
25616  
25617 -       hrtimer_init_sleeper(&t, current);
25618 -
25619         hrtimer_start_expires(&t.timer, mode);
25620  
25621         if (likely(t.task))
25622 @@ -1727,7 +2041,7 @@
25623   * schedule_hrtimeout_range - sleep until timeout
25624   * @expires:   timeout value (ktime_t)
25625   * @delta:     slack in expires timeout (ktime_t)
25626 - * @mode:      timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25627 + * @mode:      timer mode
25628   *
25629   * Make the current task sleep until the given expiry time has
25630   * elapsed. The routine will return immediately unless
25631 @@ -1766,7 +2080,7 @@
25632  /**
25633   * schedule_hrtimeout - sleep until timeout
25634   * @expires:   timeout value (ktime_t)
25635 - * @mode:      timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
25636 + * @mode:      timer mode
25637   *
25638   * Make the current task sleep until the given expiry time has
25639   * elapsed. The routine will return immediately unless
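The hrtimer.c changes above split every clock into a hard and a soft base, expire the soft bases from HRTIMER_SOFTIRQ, and extend hrtimer_init_sleeper() to take a clock id and mode. A minimal usage sketch under that reworked API follows; the timer name, callback and 10 ms period are illustrative assumptions, not part of the patch:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;	/* hypothetical example timer */

static enum hrtimer_restart demo_fn(struct hrtimer *t)
{
	/*
	 * With a *_SOFT mode the callback runs from HRTIMER_SOFTIRQ;
	 * with a *_HARD mode it runs in hard interrupt context.
	 */
	hrtimer_forward_now(t, ms_to_ktime(10));
	return HRTIMER_RESTART;
}

static void demo_start(void)
{
	/* Queue on the soft base; expiry is deferred to the softirq. */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	demo_timer.function = demo_fn;
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
}

Note that on PREEMPT_RT_FULL the __hrtimer_init() hunk above pushes a plain HRTIMER_MODE_REL/ABS timer onto the soft base unless HRTIMER_MODE_HARD is requested explicitly.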
25640 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/itimer.c linux-4.14/kernel/time/itimer.c
25641 --- linux-4.14.orig/kernel/time/itimer.c        2017-11-12 19:46:13.000000000 +0100
25642 +++ linux-4.14/kernel/time/itimer.c     2018-09-05 11:05:07.000000000 +0200
25643 @@ -214,6 +214,7 @@
25644                 /* We are sharing ->siglock with it_real_fn() */
25645                 if (hrtimer_try_to_cancel(timer) < 0) {
25646                         spin_unlock_irq(&tsk->sighand->siglock);
25647 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
25648                         goto again;
25649                 }
25650                 expires = timeval_to_ktime(value->it_value);
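The itimer hunk above inserts hrtimer_wait_for_timer() into the usual try-to-cancel retry: a negative return from hrtimer_try_to_cancel() means the callback is currently running, so on RT the task sleeps on the timer base's waitqueue instead of spinning under the siglock. Schematically (a sketch only, with the surrounding lock drop/retake omitted):

	for (;;) {
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			break;		/* 1: cancelled, 0: was not queued */
		/*
		 * ret == -1: the callback is running. Wait for it to
		 * finish before retrying; hrtimer_wait_for_timer()
		 * blocks on cpu_base->wait (see the hrtimer.c hunks above).
		 */
		hrtimer_wait_for_timer(timer);
	}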
25651 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/jiffies.c linux-4.14/kernel/time/jiffies.c
25652 --- linux-4.14.orig/kernel/time/jiffies.c       2017-11-12 19:46:13.000000000 +0100
25653 +++ linux-4.14/kernel/time/jiffies.c    2018-09-05 11:05:07.000000000 +0200
25654 @@ -74,7 +74,8 @@
25655         .max_cycles     = 10,
25656  };
25657  
25658 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25659 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25660 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25661  
25662  #if (BITS_PER_LONG < 64)
25663  u64 get_jiffies_64(void)
25664 @@ -83,9 +84,9 @@
25665         u64 ret;
25666  
25667         do {
25668 -               seq = read_seqbegin(&jiffies_lock);
25669 +               seq = read_seqcount_begin(&jiffies_seq);
25670                 ret = jiffies_64;
25671 -       } while (read_seqretry(&jiffies_lock, seq));
25672 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25673         return ret;
25674  }
25675  EXPORT_SYMBOL(get_jiffies_64);
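The jiffies.c and tick-common.c hunks replace the jiffies_lock seqlock with a raw spinlock for writer serialization plus a bare seqcount for readers, so the write side never takes a sleeping lock on RT. The two sides pair up as in this sketch (the helper names are illustrative; only the locking pattern is taken from the patch):

/* Reader side: lockless retry loop on jiffies_seq. */
static u64 read_jiffies64(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		val = jiffies_64;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	return val;
}

/* Writer side: raw spinlock orders writers, seqcount fences readers. */
static void advance_jiffies64(u64 ticks)
{
	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);
	jiffies_64 += ticks;
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
}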
25676 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-cpu-timers.c linux-4.14/kernel/time/posix-cpu-timers.c
25677 --- linux-4.14.orig/kernel/time/posix-cpu-timers.c      2017-11-12 19:46:13.000000000 +0100
25678 +++ linux-4.14/kernel/time/posix-cpu-timers.c   2018-09-05 11:05:07.000000000 +0200
25679 @@ -3,8 +3,10 @@
25680   * Implement CPU time clocks for the POSIX clock interface.
25681   */
25682  
25683 +#include <uapi/linux/sched/types.h>
25684  #include <linux/sched/signal.h>
25685  #include <linux/sched/cputime.h>
25686 +#include <linux/sched/rt.h>
25687  #include <linux/posix-timers.h>
25688  #include <linux/errno.h>
25689  #include <linux/math64.h>
25690 @@ -14,6 +16,7 @@
25691  #include <linux/tick.h>
25692  #include <linux/workqueue.h>
25693  #include <linux/compat.h>
25694 +#include <linux/smpboot.h>
25695  
25696  #include "posix-timers.h"
25697  
25698 @@ -603,7 +606,7 @@
25699         /*
25700          * Disarm any old timer after extracting its expiry time.
25701          */
25702 -       WARN_ON_ONCE(!irqs_disabled());
25703 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25704  
25705         ret = 0;
25706         old_incr = timer->it.cpu.incr;
25707 @@ -1034,7 +1037,7 @@
25708         /*
25709          * Now re-arm for the new expiry time.
25710          */
25711 -       WARN_ON_ONCE(!irqs_disabled());
25712 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25713         arm_timer(timer);
25714  unlock:
25715         unlock_task_sighand(p, &flags);
25716 @@ -1119,13 +1122,13 @@
25717   * already updated our counts.  We need to check if any timers fire now.
25718   * Interrupts are disabled.
25719   */
25720 -void run_posix_cpu_timers(struct task_struct *tsk)
25721 +static void __run_posix_cpu_timers(struct task_struct *tsk)
25722  {
25723         LIST_HEAD(firing);
25724         struct k_itimer *timer, *next;
25725         unsigned long flags;
25726  
25727 -       WARN_ON_ONCE(!irqs_disabled());
25728 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25729  
25730         /*
25731          * The fast path checks that there are no expired thread or thread
25732 @@ -1179,6 +1182,152 @@
25733         }
25734  }
25735  
25736 +#ifdef CONFIG_PREEMPT_RT_BASE
25737 +#include <linux/kthread.h>
25738 +#include <linux/cpu.h>
25739 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25740 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25741 +DEFINE_PER_CPU(bool, posix_timer_th_active);
25742 +
25743 +static void posix_cpu_kthread_fn(unsigned int cpu)
25744 +{
25745 +       struct task_struct *tsk = NULL;
25746 +       struct task_struct *next = NULL;
25747 +
25748 +       BUG_ON(per_cpu(posix_timer_task, cpu) != current);
25749 +
25750 +       /* grab task list */
25751 +       raw_local_irq_disable();
25752 +       tsk = per_cpu(posix_timer_tasklist, cpu);
25753 +       per_cpu(posix_timer_tasklist, cpu) = NULL;
25754 +       raw_local_irq_enable();
25755 +
25756 +       /* it's possible the list is empty, just return */
25757 +       if (!tsk)
25758 +               return;
25759 +
25760 +       /* Process task list */
25761 +       while (1) {
25762 +               /* save next */
25763 +               next = tsk->posix_timer_list;
25764 +
25765 +               /* run the task's timers, clear its list pointer and
25766 +                * drop the reference
25767 +                */
25768 +               __run_posix_cpu_timers(tsk);
25769 +               tsk->posix_timer_list = NULL;
25770 +               put_task_struct(tsk);
25771 +
25772 +               /* check if this is the last on the list */
25773 +               if (next == tsk)
25774 +                       break;
25775 +               tsk = next;
25776 +       }
25777 +}
25778 +
25779 +static inline int __fastpath_timer_check(struct task_struct *tsk)
25780 +{
25781 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
25782 +       if (unlikely(tsk->exit_state))
25783 +               return 0;
25784 +
25785 +       if (!task_cputime_zero(&tsk->cputime_expires))
25786 +               return 1;
25787 +
25788 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
25789 +               return 1;
25790 +
25791 +       return 0;
25792 +}
25793 +
25794 +void run_posix_cpu_timers(struct task_struct *tsk)
25795 +{
25796 +       unsigned int cpu = smp_processor_id();
25797 +       struct task_struct *tasklist;
25798 +
25799 +       BUG_ON(!irqs_disabled());
25800 +
25801 +       if (per_cpu(posix_timer_th_active, cpu) != true)
25802 +               return;
25803 +
25804 +       /* get per-cpu references */
25805 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
25806 +
25807 +       /* check to see if we're already queued */
25808 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25809 +               get_task_struct(tsk);
25810 +               if (tasklist) {
25811 +                       tsk->posix_timer_list = tasklist;
25812 +               } else {
25813 +                       /*
25814 +                        * The list is terminated by a self-pointing
25815 +                        * task_struct
25816 +                        */
25817 +                       tsk->posix_timer_list = tsk;
25818 +               }
25819 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
25820 +
25821 +               wake_up_process(per_cpu(posix_timer_task, cpu));
25822 +       }
25823 +}
25824 +
25825 +static int posix_cpu_kthread_should_run(unsigned int cpu)
25826 +{
25827 +       return __this_cpu_read(posix_timer_tasklist) != NULL;
25828 +}
25829 +
25830 +static void posix_cpu_kthread_park(unsigned int cpu)
25831 +{
25832 +       this_cpu_write(posix_timer_th_active, false);
25833 +}
25834 +
25835 +static void posix_cpu_kthread_unpark(unsigned int cpu)
25836 +{
25837 +       this_cpu_write(posix_timer_th_active, true);
25838 +}
25839 +
25840 +static void posix_cpu_kthread_setup(unsigned int cpu)
25841 +{
25842 +       struct sched_param sp;
25843 +
25844 +       sp.sched_priority = MAX_RT_PRIO - 1;
25845 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
25846 +       posix_cpu_kthread_unpark(cpu);
25847 +}
25848 +
25849 +static struct smp_hotplug_thread posix_cpu_thread = {
25850 +       .store                  = &posix_timer_task,
25851 +       .thread_should_run      = posix_cpu_kthread_should_run,
25852 +       .thread_fn              = posix_cpu_kthread_fn,
25853 +       .thread_comm            = "posixcputmr/%u",
25854 +       .setup                  = posix_cpu_kthread_setup,
25855 +       .park                   = posix_cpu_kthread_park,
25856 +       .unpark                 = posix_cpu_kthread_unpark,
25857 +};
25858 +
25859 +static int __init posix_cpu_thread_init(void)
25860 +{
25861 +       /* Start one for boot CPU. */
25862 +       unsigned long cpu;
25863 +       int ret;
25864 +
25865 +       /* init the per-cpu posix_timer_tasklist pointers */
25866 +       for_each_possible_cpu(cpu)
25867 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25868 +
25869 +       ret = smpboot_register_percpu_thread(&posix_cpu_thread);
25870 +       WARN_ON(ret);
25871 +
25872 +       return 0;
25873 +}
25874 +early_initcall(posix_cpu_thread_init);
25875 +#else /* CONFIG_PREEMPT_RT_BASE */
25876 +void run_posix_cpu_timers(struct task_struct *tsk)
25877 +{
25878 +       __run_posix_cpu_timers(tsk);
25879 +}
25880 +#endif /* CONFIG_PREEMPT_RT_BASE */
25881 +
25882  /*
25883   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25884   * The tsk->sighand->siglock must be held by the caller.
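On PREEMPT_RT_BASE the hunk above defers expiry processing to a per-CPU posixcputmr/%u kthread; run_posix_cpu_timers() only links the task into a per-CPU list headed by posix_timer_tasklist. That list is terminated by a self-pointing task_struct rather than NULL, so an empty slot (NULL) stays distinguishable from a queued tail. A minimal walker over such a list, mirroring posix_cpu_kthread_fn() (illustrative only):

static void walk_self_terminated(struct task_struct *head)
{
	struct task_struct *tsk = head, *next;

	while (tsk) {
		next = tsk->posix_timer_list;	/* save before clearing */

		/* ... process tsk's expired CPU timers here ... */
		tsk->posix_timer_list = NULL;
		put_task_struct(tsk);		/* drop the queueing reference */

		if (next == tsk)		/* tail points to itself: done */
			break;
		tsk = next;
	}
}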
25885 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/posix-timers.c linux-4.14/kernel/time/posix-timers.c
25886 --- linux-4.14.orig/kernel/time/posix-timers.c  2018-09-05 11:03:22.000000000 +0200
25887 +++ linux-4.14/kernel/time/posix-timers.c       2018-09-05 11:05:07.000000000 +0200
25888 @@ -434,6 +434,7 @@
25889  static struct pid *good_sigevent(sigevent_t * event)
25890  {
25891         struct task_struct *rtn = current->group_leader;
25892 +       int sig = event->sigev_signo;
25893  
25894         switch (event->sigev_notify) {
25895         case SIGEV_SIGNAL | SIGEV_THREAD_ID:
25896 @@ -443,7 +444,8 @@
25897                 /* FALLTHRU */
25898         case SIGEV_SIGNAL:
25899         case SIGEV_THREAD:
25900 -               if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
25901 +               if (sig <= 0 || sig > SIGRTMAX ||
25902 +                   sig_kernel_only(sig) || sig_kernel_coredump(sig))
25903                         return NULL;
25904                 /* FALLTHRU */
25905         case SIGEV_NONE:
25906 @@ -469,7 +471,7 @@
25907  
25908  static void k_itimer_rcu_free(struct rcu_head *head)
25909  {
25910 -       struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
25911 +       struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
25912  
25913         kmem_cache_free(posix_timers_cache, tmr);
25914  }
25915 @@ -486,7 +488,7 @@
25916         }
25917         put_pid(tmr->it_pid);
25918         sigqueue_free(tmr->sigq);
25919 -       call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
25920 +       call_rcu(&tmr->rcu, k_itimer_rcu_free);
25921  }
25922  
25923  static int common_timer_create(struct k_itimer *new_timer)
25924 @@ -825,6 +827,22 @@
25925                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
25926  }
25927  
25928 +/*
25929 + * Protected by RCU!
25930 + */
25931 +static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
25932 +{
25933 +#ifdef CONFIG_PREEMPT_RT_FULL
25934 +       if (kc->timer_arm == common_hrtimer_arm)
25935 +               hrtimer_wait_for_timer(&timr->it.real.timer);
25936 +       else if (kc == &alarm_clock)
25937 +               hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
25938 +       else
25939 +               /* FIXME: Whacky hack for posix-cpu-timers */
25940 +               schedule_timeout(1);
25941 +#endif
25942 +}
25943 +
25944  static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
25945  {
25946         return hrtimer_try_to_cancel(&timr->it.real.timer);
25947 @@ -889,6 +907,7 @@
25948         if (!timr)
25949                 return -EINVAL;
25950  
25951 +       rcu_read_lock();
25952         kc = timr->kclock;
25953         if (WARN_ON_ONCE(!kc || !kc->timer_set))
25954                 error = -EINVAL;
25955 @@ -897,9 +916,12 @@
25956  
25957         unlock_timer(timr, flag);
25958         if (error == TIMER_RETRY) {
25959 +               timer_wait_for_callback(kc, timr);
25960                 old_spec64 = NULL;      // We already got the old time...
25961 +               rcu_read_unlock();
25962                 goto retry;
25963         }
25964 +       rcu_read_unlock();
25965  
25966         return error;
25967  }
25968 @@ -981,10 +1003,15 @@
25969         if (!timer)
25970                 return -EINVAL;
25971  
25972 +       rcu_read_lock();
25973         if (timer_delete_hook(timer) == TIMER_RETRY) {
25974                 unlock_timer(timer, flags);
25975 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25976 +                                       timer);
25977 +               rcu_read_unlock();
25978                 goto retry_delete;
25979         }
25980 +       rcu_read_unlock();
25981  
25982         spin_lock(&current->sighand->siglock);
25983         list_del(&timer->list);
25984 @@ -1010,8 +1037,18 @@
25985  retry_delete:
25986         spin_lock_irqsave(&timer->it_lock, flags);
25987  
25988 +       /* On RT we can race with a deletion */
25989 +       if (!timer->it_signal) {
25990 +               unlock_timer(timer, flags);
25991 +               return;
25992 +       }
25993 +
25994         if (timer_delete_hook(timer) == TIMER_RETRY) {
25995 +               rcu_read_lock();
25996                 unlock_timer(timer, flags);
25997 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25998 +                                       timer);
25999 +               rcu_read_unlock();
26000                 goto retry_delete;
26001         }
26002         list_del(&timer->list);
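The posix-timers hunks make the TIMER_RETRY paths RT-safe: when the callback is still running, the timer lock is dropped, the k_itimer is kept alive under rcu_read_lock() (it is freed via call_rcu(), see k_itimer_rcu_free() above), the caller waits in timer_wait_for_callback() and then retries. The itimer_delete() shape of that loop, condensed into a sketch:

retry_delete:
	spin_lock_irqsave(&timer->it_lock, flags);

	if (timer_delete_hook(timer) == TIMER_RETRY) {
		/*
		 * The callback is running. Pin the k_itimer via RCU so it
		 * cannot be freed, drop the lock, wait for the callback to
		 * finish and try again.
		 */
		rcu_read_lock();
		unlock_timer(timer, flags);
		timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
					timer);
		rcu_read_unlock();
		goto retry_delete;
	}
	/* ... timer is now stopped; proceed with list removal ... */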
26003 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c linux-4.14/kernel/time/tick-broadcast-hrtimer.c
26004 --- linux-4.14.orig/kernel/time/tick-broadcast-hrtimer.c        2017-11-12 19:46:13.000000000 +0100
26005 +++ linux-4.14/kernel/time/tick-broadcast-hrtimer.c     2018-09-05 11:05:07.000000000 +0200
26006 @@ -106,7 +106,7 @@
26007  
26008  void tick_setup_hrtimer_broadcast(void)
26009  {
26010 -       hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26011 +       hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26012         bctimer.function = bc_handler;
26013         clockevents_register_device(&ce_broadcast_hrtimer);
26014  }
26015 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-common.c linux-4.14/kernel/time/tick-common.c
26016 --- linux-4.14.orig/kernel/time/tick-common.c   2017-11-12 19:46:13.000000000 +0100
26017 +++ linux-4.14/kernel/time/tick-common.c        2018-09-05 11:05:07.000000000 +0200
26018 @@ -79,13 +79,15 @@
26019  static void tick_periodic(int cpu)
26020  {
26021         if (tick_do_timer_cpu == cpu) {
26022 -               write_seqlock(&jiffies_lock);
26023 +               raw_spin_lock(&jiffies_lock);
26024 +               write_seqcount_begin(&jiffies_seq);
26025  
26026                 /* Keep track of the next tick event */
26027                 tick_next_period = ktime_add(tick_next_period, tick_period);
26028  
26029                 do_timer(1);
26030 -               write_sequnlock(&jiffies_lock);
26031 +               write_seqcount_end(&jiffies_seq);
26032 +               raw_spin_unlock(&jiffies_lock);
26033                 update_wall_time();
26034         }
26035  
26036 @@ -157,9 +159,9 @@
26037                 ktime_t next;
26038  
26039                 do {
26040 -                       seq = read_seqbegin(&jiffies_lock);
26041 +                       seq = read_seqcount_begin(&jiffies_seq);
26042                         next = tick_next_period;
26043 -               } while (read_seqretry(&jiffies_lock, seq));
26044 +               } while (read_seqcount_retry(&jiffies_seq, seq));
26045  
26046                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26047  
26048 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-internal.h linux-4.14/kernel/time/tick-internal.h
26049 --- linux-4.14.orig/kernel/time/tick-internal.h 2017-11-12 19:46:13.000000000 +0100
26050 +++ linux-4.14/kernel/time/tick-internal.h      2018-09-05 11:05:07.000000000 +0200
26051 @@ -150,16 +150,15 @@
26052  
26053  #ifdef CONFIG_NO_HZ_COMMON
26054  extern unsigned long tick_nohz_active;
26055 -#else
26056 +extern void timers_update_nohz(void);
26057 +# ifdef CONFIG_SMP
26058 +extern struct static_key_false timers_migration_enabled;
26059 +# endif
26060 +#else /* CONFIG_NO_HZ_COMMON */
26061 +static inline void timers_update_nohz(void) { }
26062  #define tick_nohz_active (0)
26063  #endif
26064  
26065 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26066 -extern void timers_update_migration(bool update_nohz);
26067 -#else
26068 -static inline void timers_update_migration(bool update_nohz) { }
26069 -#endif
26070 -
26071  DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26072  
26073  extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
26074 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/tick-sched.c linux-4.14/kernel/time/tick-sched.c
26075 --- linux-4.14.orig/kernel/time/tick-sched.c    2018-09-05 11:03:22.000000000 +0200
26076 +++ linux-4.14/kernel/time/tick-sched.c 2018-09-05 11:05:07.000000000 +0200
26077 @@ -66,7 +66,8 @@
26078                 return;
26079  
26080         /* Reevaluate with jiffies_lock held */
26081 -       write_seqlock(&jiffies_lock);
26082 +       raw_spin_lock(&jiffies_lock);
26083 +       write_seqcount_begin(&jiffies_seq);
26084  
26085         delta = ktime_sub(now, last_jiffies_update);
26086         if (delta >= tick_period) {
26087 @@ -89,10 +90,12 @@
26088                 /* Keep the tick_next_period variable up to date */
26089                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26090         } else {
26091 -               write_sequnlock(&jiffies_lock);
26092 +               write_seqcount_end(&jiffies_seq);
26093 +               raw_spin_unlock(&jiffies_lock);
26094                 return;
26095         }
26096 -       write_sequnlock(&jiffies_lock);
26097 +       write_seqcount_end(&jiffies_seq);
26098 +       raw_spin_unlock(&jiffies_lock);
26099         update_wall_time();
26100  }
26101  
26102 @@ -103,12 +106,14 @@
26103  {
26104         ktime_t period;
26105  
26106 -       write_seqlock(&jiffies_lock);
26107 +       raw_spin_lock(&jiffies_lock);
26108 +       write_seqcount_begin(&jiffies_seq);
26109         /* Did we start the jiffies update yet ? */
26110         if (last_jiffies_update == 0)
26111                 last_jiffies_update = tick_next_period;
26112         period = last_jiffies_update;
26113 -       write_sequnlock(&jiffies_lock);
26114 +       write_seqcount_end(&jiffies_seq);
26115 +       raw_spin_unlock(&jiffies_lock);
26116         return period;
26117  }
26118  
26119 @@ -225,6 +230,7 @@
26120  
26121  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26122         .func = nohz_full_kick_func,
26123 +       .flags = IRQ_WORK_HARD_IRQ,
26124  };
26125  
26126  /*
26127 @@ -689,10 +695,10 @@
26128  
26129         /* Read jiffies and the time when jiffies were updated last */
26130         do {
26131 -               seq = read_seqbegin(&jiffies_lock);
26132 +               seq = read_seqcount_begin(&jiffies_seq);
26133                 basemono = last_jiffies_update;
26134                 basejiff = jiffies;
26135 -       } while (read_seqretry(&jiffies_lock, seq));
26136 +       } while (read_seqcount_retry(&jiffies_seq, seq));
26137         ts->last_jiffies = basejiff;
26138  
26139         /*
26140 @@ -906,14 +912,7 @@
26141                 return false;
26142  
26143         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26144 -               static int ratelimit;
26145 -
26146 -               if (ratelimit < 10 &&
26147 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26148 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
26149 -                               (unsigned int) local_softirq_pending());
26150 -                       ratelimit++;
26151 -               }
26152 +               softirq_check_pending_idle();
26153                 return false;
26154         }
26155  
26156 @@ -1132,7 +1131,7 @@
26157         ts->nohz_mode = mode;
26158         /* One update is enough */
26159         if (!test_and_set_bit(0, &tick_nohz_active))
26160 -               timers_update_migration(true);
26161 +               timers_update_nohz();
26162  }
26163  
26164  /**
26165 @@ -1250,7 +1249,7 @@
26166         /*
26167          * Emulate tick processing via per-CPU hrtimers:
26168          */
26169 -       hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26170 +       hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26171         ts->sched_timer.function = tick_sched_timer;
26172  
26173         /* Get the next period (per-CPU) */
26174 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.c linux-4.14/kernel/time/timekeeping.c
26175 --- linux-4.14.orig/kernel/time/timekeeping.c   2017-11-12 19:46:13.000000000 +0100
26176 +++ linux-4.14/kernel/time/timekeeping.c        2018-09-05 11:05:07.000000000 +0200
26177 @@ -2326,8 +2326,10 @@
26178   */
26179  void xtime_update(unsigned long ticks)
26180  {
26181 -       write_seqlock(&jiffies_lock);
26182 +       raw_spin_lock(&jiffies_lock);
26183 +       write_seqcount_begin(&jiffies_seq);
26184         do_timer(ticks);
26185 -       write_sequnlock(&jiffies_lock);
26186 +       write_seqcount_end(&jiffies_seq);
26187 +       raw_spin_unlock(&jiffies_lock);
26188         update_wall_time();
26189  }
26190 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timekeeping.h linux-4.14/kernel/time/timekeeping.h
26191 --- linux-4.14.orig/kernel/time/timekeeping.h   2017-11-12 19:46:13.000000000 +0100
26192 +++ linux-4.14/kernel/time/timekeeping.h        2018-09-05 11:05:07.000000000 +0200
26193 @@ -18,7 +18,8 @@
26194  extern void do_timer(unsigned long ticks);
26195  extern void update_wall_time(void);
26196  
26197 -extern seqlock_t jiffies_lock;
26198 +extern raw_spinlock_t jiffies_lock;
26199 +extern seqcount_t jiffies_seq;
26200  
26201  #define CS_NAME_LEN    32
26202  
26203 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/time/timer.c linux-4.14/kernel/time/timer.c
26204 --- linux-4.14.orig/kernel/time/timer.c 2018-09-05 11:03:22.000000000 +0200
26205 +++ linux-4.14/kernel/time/timer.c      2018-09-05 11:05:07.000000000 +0200
26206 @@ -44,6 +44,7 @@
26207  #include <linux/sched/debug.h>
26208  #include <linux/slab.h>
26209  #include <linux/compat.h>
26210 +#include <linux/swait.h>
26211  
26212  #include <linux/uaccess.h>
26213  #include <asm/unistd.h>
26214 @@ -197,11 +198,12 @@
26215  struct timer_base {
26216         raw_spinlock_t          lock;
26217         struct timer_list       *running_timer;
26218 +#ifdef CONFIG_PREEMPT_RT_FULL
26219 +       struct swait_queue_head wait_for_running_timer;
26220 +#endif
26221         unsigned long           clk;
26222         unsigned long           next_expiry;
26223         unsigned int            cpu;
26224 -       bool                    migration_enabled;
26225 -       bool                    nohz_active;
26226         bool                    is_idle;
26227         bool                    must_forward_clk;
26228         DECLARE_BITMAP(pending_map, WHEEL_SIZE);
26229 @@ -210,45 +212,73 @@
26230  
26231  static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
26232  
26233 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26234 +#ifdef CONFIG_NO_HZ_COMMON
26235 +
26236 +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26237 +static DEFINE_MUTEX(timer_keys_mutex);
26238 +
26239 +static struct swork_event timer_update_swork;
26240 +
26241 +#ifdef CONFIG_SMP
26242  unsigned int sysctl_timer_migration = 1;
26243  
26244 -void timers_update_migration(bool update_nohz)
26245 +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
26246 +
26247 +static void timers_update_migration(void)
26248  {
26249         bool on = sysctl_timer_migration && tick_nohz_active;
26250 -       unsigned int cpu;
26251  
26252 -       /* Avoid the loop, if nothing to update */
26253 -       if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26254 -               return;
26255 +       if (on)
26256 +               static_branch_enable(&timers_migration_enabled);
26257 +       else
26258 +               static_branch_disable(&timers_migration_enabled);
26259 +}
26260 +#else
26261 +static inline void timers_update_migration(void) { }
26262 +#endif /* !CONFIG_SMP */
26263  
26264 -       for_each_possible_cpu(cpu) {
26265 -               per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26266 -               per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26267 -               per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26268 -               if (!update_nohz)
26269 -                       continue;
26270 -               per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26271 -               per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26272 -               per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26273 -       }
26274 +static void timer_update_keys(struct swork_event *event)
26275 +{
26276 +       mutex_lock(&timer_keys_mutex);
26277 +       timers_update_migration();
26278 +       static_branch_enable(&timers_nohz_active);
26279 +       mutex_unlock(&timer_keys_mutex);
26280  }
26281  
26282 +void timers_update_nohz(void)
26283 +{
26284 +       swork_queue(&timer_update_swork);
26285 +}
26286 +
26287 +static __init int hrtimer_init_thread(void)
26288 +{
26289 +       WARN_ON(swork_get());
26290 +       INIT_SWORK(&timer_update_swork, timer_update_keys);
26291 +       return 0;
26292 +}
26293 +early_initcall(hrtimer_init_thread);
26294 +
26295  int timer_migration_handler(struct ctl_table *table, int write,
26296                             void __user *buffer, size_t *lenp,
26297                             loff_t *ppos)
26298  {
26299 -       static DEFINE_MUTEX(mutex);
26300         int ret;
26301  
26302 -       mutex_lock(&mutex);
26303 +       mutex_lock(&timer_keys_mutex);
26304         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
26305         if (!ret && write)
26306 -               timers_update_migration(false);
26307 -       mutex_unlock(&mutex);
26308 +               timers_update_migration();
26309 +       mutex_unlock(&timer_keys_mutex);
26310         return ret;
26311  }
26312 -#endif
26313 +
26314 +static inline bool is_timers_nohz_active(void)
26315 +{
26316 +       return static_branch_unlikely(&timers_nohz_active);
26317 +}
26318 +#else
26319 +static inline bool is_timers_nohz_active(void) { return false; }
26320 +#endif /* NO_HZ_COMMON */
26321  
26322  static unsigned long round_jiffies_common(unsigned long j, int cpu,
26323                 bool force_up)
26324 @@ -534,7 +564,7 @@
26325  static void
26326  trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
26327  {
26328 -       if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
26329 +       if (!is_timers_nohz_active())
26330                 return;
26331  
26332         /*
26333 @@ -840,21 +870,20 @@
26334         return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
26335  }
26336  
26337 -#ifdef CONFIG_NO_HZ_COMMON
26338  static inline struct timer_base *
26339  get_target_base(struct timer_base *base, unsigned tflags)
26340  {
26341 -#ifdef CONFIG_SMP
26342 -       if ((tflags & TIMER_PINNED) || !base->migration_enabled)
26343 -               return get_timer_this_cpu_base(tflags);
26344 -       return get_timer_cpu_base(tflags, get_nohz_timer_target());
26345 -#else
26346 -       return get_timer_this_cpu_base(tflags);
26347 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26348 +       if (static_branch_unlikely(&timers_migration_enabled) &&
26349 +           !(tflags & TIMER_PINNED))
26350 +               return get_timer_cpu_base(tflags, get_nohz_timer_target());
26351  #endif
26352 +       return get_timer_this_cpu_base(tflags);
26353  }
26354  
26355  static inline void forward_timer_base(struct timer_base *base)
26356  {
26357 +#ifdef CONFIG_NO_HZ_COMMON
26358         unsigned long jnow;
26359  
26360         /*
26361 @@ -878,16 +907,8 @@
26362                 base->clk = jnow;
26363         else
26364                 base->clk = base->next_expiry;
26365 -}
26366 -#else
26367 -static inline struct timer_base *
26368 -get_target_base(struct timer_base *base, unsigned tflags)
26369 -{
26370 -       return get_timer_this_cpu_base(tflags);
26371 -}
26372 -
26373 -static inline void forward_timer_base(struct timer_base *base) { }
26374  #endif
26375 +}
26376  
26377  
26378  /*
26379 @@ -1130,6 +1151,33 @@
26380  }
26381  EXPORT_SYMBOL_GPL(add_timer_on);
26382  
26383 +#ifdef CONFIG_PREEMPT_RT_FULL
26384 +/*
26385 + * Wait for a running timer
26386 + */
26387 +static void wait_for_running_timer(struct timer_list *timer)
26388 +{
26389 +       struct timer_base *base;
26390 +       u32 tf = timer->flags;
26391 +
26392 +       if (tf & TIMER_MIGRATING)
26393 +               return;
26394 +
26395 +       base = get_timer_base(tf);
26396 +       swait_event(base->wait_for_running_timer,
26397 +                   base->running_timer != timer);
26398 +}
26399 +
26400 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
26401 +#else
26402 +static inline void wait_for_running_timer(struct timer_list *timer)
26403 +{
26404 +       cpu_relax();
26405 +}
26406 +
26407 +# define wakeup_timer_waiters(b)       do { } while (0)
26408 +#endif
26409 +
26410  /**
26411   * del_timer - deactivate a timer.
26412   * @timer: the timer to be deactivated
26413 @@ -1185,7 +1233,7 @@
26414  }
26415  EXPORT_SYMBOL(try_to_del_timer_sync);
26416  
26417 -#ifdef CONFIG_SMP
26418 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26419  /**
26420   * del_timer_sync - deactivate a timer and wait for the handler to finish.
26421   * @timer: the timer to be deactivated
26422 @@ -1245,7 +1293,7 @@
26423                 int ret = try_to_del_timer_sync(timer);
26424                 if (ret >= 0)
26425                         return ret;
26426 -               cpu_relax();
26427 +               wait_for_running_timer(timer);
26428         }
26429  }
26430  EXPORT_SYMBOL(del_timer_sync);
26431 @@ -1309,13 +1357,16 @@
26432                 fn = timer->function;
26433                 data = timer->data;
26434  
26435 -               if (timer->flags & TIMER_IRQSAFE) {
26436 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
26437 +                   timer->flags & TIMER_IRQSAFE) {
26438                         raw_spin_unlock(&base->lock);
26439                         call_timer_fn(timer, fn, data);
26440 +                       base->running_timer = NULL;
26441                         raw_spin_lock(&base->lock);
26442                 } else {
26443                         raw_spin_unlock_irq(&base->lock);
26444                         call_timer_fn(timer, fn, data);
26445 +                       base->running_timer = NULL;
26446                         raw_spin_lock_irq(&base->lock);
26447                 }
26448         }
26449 @@ -1584,13 +1635,13 @@
26450  
26451         /* Note: this timer irq context must be accounted for as well. */
26452         account_process_tick(p, user_tick);
26453 +       scheduler_tick();
26454         run_local_timers();
26455         rcu_check_callbacks(user_tick);
26456 -#ifdef CONFIG_IRQ_WORK
26457 +#if defined(CONFIG_IRQ_WORK)
26458         if (in_irq())
26459                 irq_work_tick();
26460  #endif
26461 -       scheduler_tick();
26462         if (IS_ENABLED(CONFIG_POSIX_TIMERS))
26463                 run_posix_cpu_timers(p);
26464  }
26465 @@ -1617,8 +1668,8 @@
26466                 while (levels--)
26467                         expire_timers(base, heads + levels);
26468         }
26469 -       base->running_timer = NULL;
26470         raw_spin_unlock_irq(&base->lock);
26471 +       wakeup_timer_waiters(base);
26472  }
26473  
26474  /*
26475 @@ -1628,6 +1679,7 @@
26476  {
26477         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
26478  
26479 +       irq_work_tick_soft();
26480         /*
26481          * must_forward_clk must be cleared before running timers so that any
26482          * timer functions that call mod_timer will not try to forward the
26483 @@ -1864,6 +1916,9 @@
26484                 base->cpu = cpu;
26485                 raw_spin_lock_init(&base->lock);
26486                 base->clk = jiffies;
26487 +#ifdef CONFIG_PREEMPT_RT_FULL
26488 +               init_swait_queue_head(&base->wait_for_running_timer);
26489 +#endif
26490         }
26491  }
26492  
26493 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/Kconfig linux-4.14/kernel/trace/Kconfig
26494 --- linux-4.14.orig/kernel/trace/Kconfig        2018-09-05 11:03:22.000000000 +0200
26495 +++ linux-4.14/kernel/trace/Kconfig     2018-09-05 11:05:07.000000000 +0200
26496 @@ -585,7 +585,10 @@
26497           event activity as an initial guide for further investigation
26498           using more advanced tools.
26499  
26500 -         See Documentation/trace/events.txt.
26501 +         Inter-event tracing of quantities such as latencies is also
26502 +         supported using hist triggers under this option.
26503 +
26504 +         See Documentation/trace/histogram.txt.
26505           If in doubt, say N.
26506  
26507  config MMIOTRACE_TEST
26508 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/ring_buffer.c linux-4.14/kernel/trace/ring_buffer.c
26509 --- linux-4.14.orig/kernel/trace/ring_buffer.c  2018-09-05 11:03:22.000000000 +0200
26510 +++ linux-4.14/kernel/trace/ring_buffer.c       2018-09-05 11:05:07.000000000 +0200
26511 @@ -41,6 +41,8 @@
26512                          RINGBUF_TYPE_PADDING);
26513         trace_seq_printf(s, "\ttime_extend : type == %d\n",
26514                          RINGBUF_TYPE_TIME_EXTEND);
26515 +       trace_seq_printf(s, "\ttime_stamp : type == %d\n",
26516 +                        RINGBUF_TYPE_TIME_STAMP);
26517         trace_seq_printf(s, "\tdata max type_len  == %d\n",
26518                          RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26519  
26520 @@ -140,12 +142,15 @@
26521  
26522  enum {
26523         RB_LEN_TIME_EXTEND = 8,
26524 -       RB_LEN_TIME_STAMP = 16,
26525 +       RB_LEN_TIME_STAMP =  8,
26526  };
26527  
26528  #define skip_time_extend(event) \
26529         ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
26530  
26531 +#define extended_time(event) \
26532 +       (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
26533 +
26534  static inline int rb_null_event(struct ring_buffer_event *event)
26535  {
26536         return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
26537 @@ -209,7 +214,7 @@
26538  {
26539         unsigned len = 0;
26540  
26541 -       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26542 +       if (extended_time(event)) {
26543                 /* time extends include the data event after it */
26544                 len = RB_LEN_TIME_EXTEND;
26545                 event = skip_time_extend(event);
26546 @@ -231,7 +236,7 @@
26547  {
26548         unsigned length;
26549  
26550 -       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26551 +       if (extended_time(event))
26552                 event = skip_time_extend(event);
26553  
26554         length = rb_event_length(event);
26555 @@ -248,7 +253,7 @@
26556  static __always_inline void *
26557  rb_event_data(struct ring_buffer_event *event)
26558  {
26559 -       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26560 +       if (extended_time(event))
26561                 event = skip_time_extend(event);
26562         BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
26563         /* If length is in len field, then array[0] has the data */
26564 @@ -275,6 +280,27 @@
26565  #define TS_MASK                ((1ULL << TS_SHIFT) - 1)
26566  #define TS_DELTA_TEST  (~TS_MASK)
26567  
26568 +/**
26569 + * ring_buffer_event_time_stamp - return the event's extended timestamp
26570 + * @event: the event to get the timestamp of
26571 + *
26572 + * Returns the extended timestamp associated with a data event.
26573 + * An extended time_stamp is a 64-bit timestamp represented
26574 + * internally in a special way that makes the best use of space
26575 + * contained within a ring buffer event.  This function decodes
26576 + * it and maps it to a straight u64 value.
26577 + */
26578 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
26579 +{
26580 +       u64 ts;
26581 +
26582 +       ts = event->array[0];
26583 +       ts <<= TS_SHIFT;
26584 +       ts += event->time_delta;
26585 +
26586 +       return ts;
26587 +}
26588 +
26589  /* Flag when events were overwritten */
26590  #define RB_MISSED_EVENTS       (1 << 31)
26591  /* Missed count stored at end */
26592 @@ -451,6 +477,7 @@
26593         struct buffer_page              *reader_page;
26594         unsigned long                   lost_events;
26595         unsigned long                   last_overrun;
26596 +       unsigned long                   nest;
26597         local_t                         entries_bytes;
26598         local_t                         entries;
26599         local_t                         overrun;
26600 @@ -488,6 +515,7 @@
26601         u64                             (*clock)(void);
26602  
26603         struct rb_irq_work              irq_work;
26604 +       bool                            time_stamp_abs;
26605  };
26606  
26607  struct ring_buffer_iter {
26608 @@ -1387,6 +1415,16 @@
26609         buffer->clock = clock;
26610  }
26611  
26612 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
26613 +{
26614 +       buffer->time_stamp_abs = abs;
26615 +}
26616 +
26617 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
26618 +{
26619 +       return buffer->time_stamp_abs;
26620 +}
26621 +
26622  static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
26623  
26624  static inline unsigned long rb_page_entries(struct buffer_page *bpage)
26625 @@ -2217,12 +2255,15 @@
26626  
26627  /* Slow path, do not inline */
26628  static noinline struct ring_buffer_event *
26629 -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
26630 +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
26631  {
26632 -       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26633 +       if (abs)
26634 +               event->type_len = RINGBUF_TYPE_TIME_STAMP;
26635 +       else
26636 +               event->type_len = RINGBUF_TYPE_TIME_EXTEND;
26637  
26638 -       /* Not the first event on the page? */
26639 -       if (rb_event_index(event)) {
26640 +       /* Not the first event on the page, or not delta? */
26641 +       if (abs || rb_event_index(event)) {
26642                 event->time_delta = delta & TS_MASK;
26643                 event->array[0] = delta >> TS_SHIFT;
26644         } else {
26645 @@ -2265,7 +2306,9 @@
26646          * add it to the start of the resevered space.
26647          */
26648         if (unlikely(info->add_timestamp)) {
26649 -               event = rb_add_time_stamp(event, delta);
26650 +               bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
26651 +
26652 +               event = rb_add_time_stamp(event, info->delta, abs);
26653                 length -= RB_LEN_TIME_EXTEND;
26654                 delta = 0;
26655         }
26656 @@ -2453,7 +2496,7 @@
26657  
26658  static inline void rb_event_discard(struct ring_buffer_event *event)
26659  {
26660 -       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
26661 +       if (extended_time(event))
26662                 event = skip_time_extend(event);
26663  
26664         /* array[0] holds the actual length for the discarded event */
26665 @@ -2497,10 +2540,11 @@
26666                         cpu_buffer->write_stamp =
26667                                 cpu_buffer->commit_page->page->time_stamp;
26668                 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
26669 -                       delta = event->array[0];
26670 -                       delta <<= TS_SHIFT;
26671 -                       delta += event->time_delta;
26672 +                       delta = ring_buffer_event_time_stamp(event);
26673                         cpu_buffer->write_stamp += delta;
26674 +               } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
26675 +                       delta = ring_buffer_event_time_stamp(event);
26676 +                       cpu_buffer->write_stamp = delta;
26677                 } else
26678                         cpu_buffer->write_stamp += event->time_delta;
26679         }
26680 @@ -2583,22 +2627,19 @@
26681  trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
26682  {
26683         unsigned int val = cpu_buffer->current_context;
26684 +       unsigned long pc = preempt_count();
26685         int bit;
26686  
26687 -       if (in_interrupt()) {
26688 -               if (in_nmi())
26689 -                       bit = RB_CTX_NMI;
26690 -               else if (in_irq())
26691 -                       bit = RB_CTX_IRQ;
26692 -               else
26693 -                       bit = RB_CTX_SOFTIRQ;
26694 -       } else
26695 +       if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
26696                 bit = RB_CTX_NORMAL;
26697 +       else
26698 +               bit = pc & NMI_MASK ? RB_CTX_NMI :
26699 +                       pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
26700  
26701 -       if (unlikely(val & (1 << bit)))
26702 +       if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
26703                 return 1;
26704  
26705 -       val |= (1 << bit);
26706 +       val |= (1 << (bit + cpu_buffer->nest));
26707         cpu_buffer->current_context = val;
26708  
26709         return 0;
26710 @@ -2607,7 +2648,57 @@
26711  static __always_inline void
26712  trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
26713  {
26714 -       cpu_buffer->current_context &= cpu_buffer->current_context - 1;
26715 +       cpu_buffer->current_context &=
26716 +               cpu_buffer->current_context - (1 << cpu_buffer->nest);
26717 +}
26718 +
26719 +/* The recursive locking above uses 4 bits */
26720 +#define NESTED_BITS 4
26721 +
26722 +/**
26723 + * ring_buffer_nest_start - Allow to trace while nested
26724 + * @buffer: The ring buffer to modify
26725 + *
26726 + * The ring buffer has a safety mechanism to prevent recursion.
26727 + * But there may be a case where a trace needs to be done while
26728 + * tracing something else. In this case, calling this function
26729 + * will allow this function to nest within a currently active
26730 + * ring_buffer_lock_reserve().
26731 + *
26732 + * Call this function before calling another ring_buffer_lock_reserve() and
26733 + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
26734 + */
26735 +void ring_buffer_nest_start(struct ring_buffer *buffer)
26736 +{
26737 +       struct ring_buffer_per_cpu *cpu_buffer;
26738 +       int cpu;
26739 +
26740 +       /* Enabled by ring_buffer_nest_end() */
26741 +       preempt_disable_notrace();
26742 +       cpu = raw_smp_processor_id();
26743 +       cpu_buffer = buffer->buffers[cpu];
26744 +       /* This is the shift value for the above recursive locking */
26745 +       cpu_buffer->nest += NESTED_BITS;
26746 +}
26747 +
26748 +/**
26749 + * ring_buffer_nest_end - Allow to trace while nested
26750 + * @buffer: The ring buffer to modify
26751 + *
26752 + * Must be called after ring_buffer_nest_start() and after the
26753 + * ring_buffer_unlock_commit().
26754 + */
26755 +void ring_buffer_nest_end(struct ring_buffer *buffer)
26756 +{
26757 +       struct ring_buffer_per_cpu *cpu_buffer;
26758 +       int cpu;
26759 +
26760 +       /* disabled by ring_buffer_nest_start() */
26761 +       cpu = raw_smp_processor_id();
26762 +       cpu_buffer = buffer->buffers[cpu];
26763 +       /* This is the shift value for the above recursive locking */
26764 +       cpu_buffer->nest -= NESTED_BITS;
26765 +       preempt_enable_notrace();
26766  }
26767  
26768  /**
26769 @@ -2683,7 +2774,7 @@
26770          * If this is the first commit on the page, then it has the same
26771          * timestamp as the page itself.
26772          */
26773 -       if (!tail)
26774 +       if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
26775                 info->delta = 0;
26776  
26777         /* See if we shot pass the end of this buffer page */
26778 @@ -2760,8 +2851,11 @@
26779         /* make sure this diff is calculated here */
26780         barrier();
26781  
26782 -       /* Did the write stamp get updated already? */
26783 -       if (likely(info.ts >= cpu_buffer->write_stamp)) {
26784 +       if (ring_buffer_time_stamp_abs(buffer)) {
26785 +               info.delta = info.ts;
26786 +               rb_handle_timestamp(cpu_buffer, &info);
26787 +       } else /* Did the write stamp get updated already? */
26788 +               if (likely(info.ts >= cpu_buffer->write_stamp)) {
26789                 info.delta = diff;
26790                 if (unlikely(test_time_stamp(info.delta)))
26791                         rb_handle_timestamp(cpu_buffer, &info);
26792 @@ -3459,14 +3553,13 @@
26793                 return;
26794  
26795         case RINGBUF_TYPE_TIME_EXTEND:
26796 -               delta = event->array[0];
26797 -               delta <<= TS_SHIFT;
26798 -               delta += event->time_delta;
26799 +               delta = ring_buffer_event_time_stamp(event);
26800                 cpu_buffer->read_stamp += delta;
26801                 return;
26802  
26803         case RINGBUF_TYPE_TIME_STAMP:
26804 -               /* FIXME: not implemented */
26805 +               delta = ring_buffer_event_time_stamp(event);
26806 +               cpu_buffer->read_stamp = delta;
26807                 return;
26808  
26809         case RINGBUF_TYPE_DATA:
26810 @@ -3490,14 +3583,13 @@
26811                 return;
26812  
26813         case RINGBUF_TYPE_TIME_EXTEND:
26814 -               delta = event->array[0];
26815 -               delta <<= TS_SHIFT;
26816 -               delta += event->time_delta;
26817 +               delta = ring_buffer_event_time_stamp(event);
26818                 iter->read_stamp += delta;
26819                 return;
26820  
26821         case RINGBUF_TYPE_TIME_STAMP:
26822 -               /* FIXME: not implemented */
26823 +               delta = ring_buffer_event_time_stamp(event);
26824 +               iter->read_stamp = delta;
26825                 return;
26826  
26827         case RINGBUF_TYPE_DATA:
26828 @@ -3721,6 +3813,8 @@
26829         struct buffer_page *reader;
26830         int nr_loops = 0;
26831  
26832 +       if (ts)
26833 +               *ts = 0;
26834   again:
26835         /*
26836          * We repeat when a time extend is encountered.
26837 @@ -3757,12 +3851,17 @@
26838                 goto again;
26839  
26840         case RINGBUF_TYPE_TIME_STAMP:
26841 -               /* FIXME: not implemented */
26842 +               if (ts) {
26843 +                       *ts = ring_buffer_event_time_stamp(event);
26844 +                       ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26845 +                                                        cpu_buffer->cpu, ts);
26846 +               }
26847 +               /* Internal data, OK to advance */
26848                 rb_advance_reader(cpu_buffer);
26849                 goto again;
26850  
26851         case RINGBUF_TYPE_DATA:
26852 -               if (ts) {
26853 +               if (ts && !(*ts)) {
26854                         *ts = cpu_buffer->read_stamp + event->time_delta;
26855                         ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26856                                                          cpu_buffer->cpu, ts);
26857 @@ -3787,6 +3886,9 @@
26858         struct ring_buffer_event *event;
26859         int nr_loops = 0;
26860  
26861 +       if (ts)
26862 +               *ts = 0;
26863 +
26864         cpu_buffer = iter->cpu_buffer;
26865         buffer = cpu_buffer->buffer;
26866  
26867 @@ -3839,12 +3941,17 @@
26868                 goto again;
26869  
26870         case RINGBUF_TYPE_TIME_STAMP:
26871 -               /* FIXME: not implemented */
26872 +               if (ts) {
26873 +                       *ts = ring_buffer_event_time_stamp(event);
26874 +                       ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
26875 +                                                        cpu_buffer->cpu, ts);
26876 +               }
26877 +               /* Internal data, OK to advance */
26878                 rb_advance_iter(iter);
26879                 goto again;
26880  
26881         case RINGBUF_TYPE_DATA:
26882 -               if (ts) {
26883 +               if (ts && !(*ts)) {
26884                         *ts = iter->read_stamp + event->time_delta;
26885                         ring_buffer_normalize_time_stamp(buffer,
26886                                                          cpu_buffer->cpu, ts);
26887 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.c linux-4.14/kernel/trace/trace.c
26888 --- linux-4.14.orig/kernel/trace/trace.c        2018-09-05 11:03:22.000000000 +0200
26889 +++ linux-4.14/kernel/trace/trace.c     2018-09-05 11:05:07.000000000 +0200
26890 @@ -1170,6 +1170,14 @@
26891         ARCH_TRACE_CLOCKS
26892  };
26893  
26894 +bool trace_clock_in_ns(struct trace_array *tr)
26895 +{
26896 +       if (trace_clocks[tr->clock_id].in_ns)
26897 +               return true;
26898 +
26899 +       return false;
26900 +}
26901 +
26902  /*
26903   * trace_parser_get_init - gets the buffer for trace parser
26904   */
26905 @@ -2127,6 +2135,7 @@
26906         struct task_struct *tsk = current;
26907  
26908         entry->preempt_count            = pc & 0xff;
26909 +       entry->preempt_lazy_count       = preempt_lazy_count();
26910         entry->pid                      = (tsk) ? tsk->pid : 0;
26911         entry->flags =
26912  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
26913 @@ -2137,8 +2146,11 @@
26914                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
26915                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
26916                 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
26917 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
26918 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
26919 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
26920                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
26921 +
26922 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
26923  }
26924  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
26925  
26926 @@ -2275,7 +2287,7 @@
26927  
26928         *current_rb = trace_file->tr->trace_buffer.buffer;
26929  
26930 -       if ((trace_file->flags &
26931 +       if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
26932              (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
26933             (entry = this_cpu_read(trace_buffered_event))) {
26934                 /* Try to use the per cpu buffer first */
26935 @@ -3342,14 +3354,17 @@
26936  
26937  static void print_lat_help_header(struct seq_file *m)
26938  {
26939 -       seq_puts(m, "#                  _------=> CPU#            \n"
26940 -                   "#                 / _-----=> irqs-off        \n"
26941 -                   "#                | / _----=> need-resched    \n"
26942 -                   "#                || / _---=> hardirq/softirq \n"
26943 -                   "#                ||| / _--=> preempt-depth   \n"
26944 -                   "#                |||| /     delay            \n"
26945 -                   "#  cmd     pid   ||||| time  |   caller      \n"
26946 -                   "#     \\   /      |||||  \\    |   /         \n");
26947 +       seq_puts(m, "#                  _--------=> CPU#              \n"
26948 +                   "#                 / _-------=> irqs-off          \n"
26949 +                   "#                | / _------=> need-resched      \n"
26950 +                   "#                || / _-----=> need-resched_lazy \n"
26951 +                   "#                ||| / _----=> hardirq/softirq   \n"
26952 +                   "#                |||| / _---=> preempt-depth     \n"
26953 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
26954 +                   "#                |||||| / _-=> migrate-disable   \n"
26955 +                   "#                ||||||| /     delay             \n"
26956 +                   "# cmd     pid    |||||||| time   |  caller       \n"
26957 +                   "#     \\   /      ||||||||   \\    |  /            \n");
26958  }
26959  
26960  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
26961 @@ -3385,15 +3400,17 @@
26962                    tgid ? tgid_space : space);
26963         seq_printf(m, "#                          %s / _----=> need-resched\n",
26964                    tgid ? tgid_space : space);
26965 -       seq_printf(m, "#                          %s| / _---=> hardirq/softirq\n",
26966 +       seq_printf(m, "#                          %s| /  _----=> need-resched_lazy\n",
26967                    tgid ? tgid_space : space);
26968 -       seq_printf(m, "#                          %s|| / _--=> preempt-depth\n",
26969 +       seq_printf(m, "#                          %s|| / _---=> hardirq/softirq\n",
26970                    tgid ? tgid_space : space);
26971 -       seq_printf(m, "#                          %s||| /     delay\n",
26972 +       seq_printf(m, "#                          %s||| / _--=> preempt-depth\n",
26973                    tgid ? tgid_space : space);
26974 -       seq_printf(m, "#           TASK-PID %sCPU#  ||||    TIMESTAMP  FUNCTION\n",
26975 +       seq_printf(m, "#                          %s|||| /     delay\n",
26976 +                  tgid ? tgid_space : space);
26977 +       seq_printf(m, "#           TASK-PID %sCPU#  |||||    TIMESTAMP  FUNCTION\n",
26978                    tgid ? "   TGID   " : space);
26979 -       seq_printf(m, "#              | |   %s  |   ||||       |         |\n",
26980 +       seq_printf(m, "#              | |   %s  |   |||||       |         |\n",
26981                    tgid ? "     |    " : space);
26982  }
26983  
26984 @@ -4531,6 +4548,9 @@
26985  #ifdef CONFIG_X86_64
26986         "     x86-tsc:   TSC cycle counter\n"
26987  #endif
26988 +       "\n  timestamp_mode\t-view the mode used to timestamp events\n"
26989 +       "       delta:   Delta difference against a buffer-wide timestamp\n"
26990 +       "    absolute:   Absolute (standalone) timestamp\n"
26991         "\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
26992         "\n  trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
26993         "  tracing_cpumask\t- Limit which CPUs to trace\n"
26994 @@ -4707,8 +4727,9 @@
26995         "\t            .sym        display an address as a symbol\n"
26996         "\t            .sym-offset display an address as a symbol and offset\n"
26997         "\t            .execname   display a common_pid as a program name\n"
26998 -       "\t            .syscall    display a syscall id as a syscall name\n\n"
26999 -       "\t            .log2       display log2 value rather than raw number\n\n"
27000 +       "\t            .syscall    display a syscall id as a syscall name\n"
27001 +       "\t            .log2       display log2 value rather than raw number\n"
27002 +       "\t            .usecs      display a common_timestamp in microseconds\n\n"
27003         "\t    The 'pause' parameter can be used to pause an existing hist\n"
27004         "\t    trigger or to start a hist trigger but not log any events\n"
27005         "\t    until told to do so.  'continue' can be used to start or\n"
27006 @@ -6218,7 +6239,7 @@
27007         return 0;
27008  }
27009  
27010 -static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27011 +int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27012  {
27013         int i;
27014  
27015 @@ -6298,6 +6319,71 @@
27016         return ret;
27017  }
27018  
27019 +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27020 +{
27021 +       struct trace_array *tr = m->private;
27022 +
27023 +       mutex_lock(&trace_types_lock);
27024 +
27025 +       if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27026 +               seq_puts(m, "delta [absolute]\n");
27027 +       else
27028 +               seq_puts(m, "[delta] absolute\n");
27029 +
27030 +       mutex_unlock(&trace_types_lock);
27031 +
27032 +       return 0;
27033 +}
27034 +
27035 +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27036 +{
27037 +       struct trace_array *tr = inode->i_private;
27038 +       int ret;
27039 +
27040 +       if (tracing_disabled)
27041 +               return -ENODEV;
27042 +
27043 +       if (trace_array_get(tr))
27044 +               return -ENODEV;
27045 +
27046 +       ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27047 +       if (ret < 0)
27048 +               trace_array_put(tr);
27049 +
27050 +       return ret;
27051 +}
27052 +
27053 +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27054 +{
27055 +       int ret = 0;
27056 +
27057 +       mutex_lock(&trace_types_lock);
27058 +
27059 +       if (abs && tr->time_stamp_abs_ref++)
27060 +               goto out;
27061 +
27062 +       if (!abs) {
27063 +               if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27064 +                       ret = -EINVAL;
27065 +                       goto out;
27066 +               }
27067 +
27068 +               if (--tr->time_stamp_abs_ref)
27069 +                       goto out;
27070 +       }
27071 +
27072 +       ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27073 +
27074 +#ifdef CONFIG_TRACER_MAX_TRACE
27075 +       if (tr->max_buffer.buffer)
27076 +               ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
27077 +#endif
27078 + out:
27079 +       mutex_unlock(&trace_types_lock);
27080 +
27081 +       return ret;
27082 +}
27083 +
27084  struct ftrace_buffer_info {
27085         struct trace_iterator   iter;
27086         void                    *spare;
27087 @@ -6545,6 +6631,13 @@
27088         .write          = tracing_clock_write,
27089  };
27090  
27091 +static const struct file_operations trace_time_stamp_mode_fops = {
27092 +       .open           = tracing_time_stamp_mode_open,
27093 +       .read           = seq_read,
27094 +       .llseek         = seq_lseek,
27095 +       .release        = tracing_single_release_tr,
27096 +};
27097 +
27098  #ifdef CONFIG_TRACER_SNAPSHOT
27099  static const struct file_operations snapshot_fops = {
27100         .open           = tracing_snapshot_open,
27101 @@ -7682,6 +7775,7 @@
27102         struct trace_array *tr;
27103         int ret;
27104  
27105 +       mutex_lock(&event_mutex);
27106         mutex_lock(&trace_types_lock);
27107  
27108         ret = -EEXIST;
27109 @@ -7714,6 +7808,7 @@
27110  
27111         INIT_LIST_HEAD(&tr->systems);
27112         INIT_LIST_HEAD(&tr->events);
27113 +       INIT_LIST_HEAD(&tr->hist_vars);
27114  
27115         if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27116                 goto out_free_tr;
27117 @@ -7737,6 +7832,7 @@
27118         list_add(&tr->list, &ftrace_trace_arrays);
27119  
27120         mutex_unlock(&trace_types_lock);
27121 +       mutex_unlock(&event_mutex);
27122  
27123         return 0;
27124  
27125 @@ -7748,6 +7844,7 @@
27126  
27127   out_unlock:
27128         mutex_unlock(&trace_types_lock);
27129 +       mutex_unlock(&event_mutex);
27130  
27131         return ret;
27132  
27133 @@ -7760,6 +7857,7 @@
27134         int ret;
27135         int i;
27136  
27137 +       mutex_lock(&event_mutex);
27138         mutex_lock(&trace_types_lock);
27139  
27140         ret = -ENODEV;
27141 @@ -7805,6 +7903,7 @@
27142  
27143   out_unlock:
27144         mutex_unlock(&trace_types_lock);
27145 +       mutex_unlock(&event_mutex);
27146  
27147         return ret;
27148  }
27149 @@ -7862,6 +7961,9 @@
27150         trace_create_file("tracing_on", 0644, d_tracer,
27151                           tr, &rb_simple_fops);
27152  
27153 +       trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27154 +                         &trace_time_stamp_mode_fops);
27155 +
27156         create_trace_options_dir(tr);
27157  
27158  #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
27159 @@ -8271,6 +8373,92 @@
27160  }
27161  EXPORT_SYMBOL_GPL(ftrace_dump);
27162  
27163 +int trace_run_command(const char *buf, int (*createfn)(int, char **))
27164 +{
27165 +       char **argv;
27166 +       int argc, ret;
27167 +
27168 +       argc = 0;
27169 +       ret = 0;
27170 +       argv = argv_split(GFP_KERNEL, buf, &argc);
27171 +       if (!argv)
27172 +               return -ENOMEM;
27173 +
27174 +       if (argc)
27175 +               ret = createfn(argc, argv);
27176 +
27177 +       argv_free(argv);
27178 +
27179 +       return ret;
27180 +}
27181 +
27182 +#define WRITE_BUFSIZE  4096
27183 +
27184 +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27185 +                               size_t count, loff_t *ppos,
27186 +                               int (*createfn)(int, char **))
27187 +{
27188 +       char *kbuf, *buf, *tmp;
27189 +       int ret = 0;
27190 +       size_t done = 0;
27191 +       size_t size;
27192 +
27193 +       kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27194 +       if (!kbuf)
27195 +               return -ENOMEM;
27196 +
27197 +       while (done < count) {
27198 +               size = count - done;
27199 +
27200 +               if (size >= WRITE_BUFSIZE)
27201 +                       size = WRITE_BUFSIZE - 1;
27202 +
27203 +               if (copy_from_user(kbuf, buffer + done, size)) {
27204 +                       ret = -EFAULT;
27205 +                       goto out;
27206 +               }
27207 +               kbuf[size] = '\0';
27208 +               buf = kbuf;
27209 +               do {
27210 +                       tmp = strchr(buf, '\n');
27211 +                       if (tmp) {
27212 +                               *tmp = '\0';
27213 +                               size = tmp - buf + 1;
27214 +                       } else {
27215 +                               size = strlen(buf);
27216 +                               if (done + size < count) {
27217 +                                       if (buf != kbuf)
27218 +                                               break;
27219 +                                       /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27220 +                                       pr_warn("Line length is too long: Should be less than %d\n",
27221 +                                               WRITE_BUFSIZE - 2);
27222 +                                       ret = -EINVAL;
27223 +                                       goto out;
27224 +                               }
27225 +                       }
27226 +                       done += size;
27227 +
27228 +                       /* Remove comments */
27229 +                       tmp = strchr(buf, '#');
27230 +
27231 +                       if (tmp)
27232 +                               *tmp = '\0';
27233 +
27234 +                       ret = trace_run_command(buf, createfn);
27235 +                       if (ret)
27236 +                               goto out;
27237 +                       buf += size;
27238 +
27239 +               } while (done < count);
27240 +       }
27241 +       ret = done;
27242 +
27243 +out:
27244 +       kfree(kbuf);
27245 +
27246 +       return ret;
27247 +}
27248 +
27249  __init static int tracer_alloc_buffers(void)
27250  {
27251         int ring_buf_size;
27252 @@ -8371,6 +8559,7 @@
27253  
27254         INIT_LIST_HEAD(&global_trace.systems);
27255         INIT_LIST_HEAD(&global_trace.events);
27256 +       INIT_LIST_HEAD(&global_trace.hist_vars);
27257         list_add(&global_trace.list, &ftrace_trace_arrays);
27258  
27259         apply_trace_boot_options();
27260 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events.c linux-4.14/kernel/trace/trace_events.c
27261 --- linux-4.14.orig/kernel/trace/trace_events.c 2018-09-05 11:03:22.000000000 +0200
27262 +++ linux-4.14/kernel/trace/trace_events.c      2018-09-05 11:05:07.000000000 +0200
27263 @@ -187,6 +187,8 @@
27264         __common_field(unsigned char, flags);
27265         __common_field(unsigned char, preempt_count);
27266         __common_field(int, pid);
27267 +       __common_field(unsigned short, migrate_disable);
27268 +       __common_field(unsigned short, padding);
27269  
27270         return ret;
27271  }
27272 @@ -1406,8 +1408,8 @@
27273                 return -ENODEV;
27274  
27275         /* Make sure the system still exists */
27276 -       mutex_lock(&trace_types_lock);
27277         mutex_lock(&event_mutex);
27278 +       mutex_lock(&trace_types_lock);
27279         list_for_each_entry(tr, &ftrace_trace_arrays, list) {
27280                 list_for_each_entry(dir, &tr->systems, list) {
27281                         if (dir == inode->i_private) {
27282 @@ -1421,8 +1423,8 @@
27283                 }
27284         }
27285   exit_loop:
27286 -       mutex_unlock(&event_mutex);
27287         mutex_unlock(&trace_types_lock);
27288 +       mutex_unlock(&event_mutex);
27289  
27290         if (!system)
27291                 return -ENODEV;
27292 @@ -2308,15 +2310,15 @@
27293  int trace_add_event_call(struct trace_event_call *call)
27294  {
27295         int ret;
27296 -       mutex_lock(&trace_types_lock);
27297         mutex_lock(&event_mutex);
27298 +       mutex_lock(&trace_types_lock);
27299  
27300         ret = __register_event(call, NULL);
27301         if (ret >= 0)
27302                 __add_event_to_tracers(call);
27303  
27304 -       mutex_unlock(&event_mutex);
27305         mutex_unlock(&trace_types_lock);
27306 +       mutex_unlock(&event_mutex);
27307         return ret;
27308  }
27309  
27310 @@ -2370,13 +2372,13 @@
27311  {
27312         int ret;
27313  
27314 -       mutex_lock(&trace_types_lock);
27315         mutex_lock(&event_mutex);
27316 +       mutex_lock(&trace_types_lock);
27317         down_write(&trace_event_sem);
27318         ret = probe_remove_event_call(call);
27319         up_write(&trace_event_sem);
27320 -       mutex_unlock(&event_mutex);
27321         mutex_unlock(&trace_types_lock);
27322 +       mutex_unlock(&event_mutex);
27323  
27324         return ret;
27325  }
27326 @@ -2438,8 +2440,8 @@
27327  {
27328         struct module *mod = data;
27329  
27330 -       mutex_lock(&trace_types_lock);
27331         mutex_lock(&event_mutex);
27332 +       mutex_lock(&trace_types_lock);
27333         switch (val) {
27334         case MODULE_STATE_COMING:
27335                 trace_module_add_events(mod);
27336 @@ -2448,8 +2450,8 @@
27337                 trace_module_remove_events(mod);
27338                 break;
27339         }
27340 -       mutex_unlock(&event_mutex);
27341         mutex_unlock(&trace_types_lock);
27342 +       mutex_unlock(&event_mutex);
27343  
27344         return 0;
27345  }
27346 @@ -2964,24 +2966,24 @@
27347   * creates the event hierachry in the @parent/events directory.
27348   *
27349   * Returns 0 on success.
27350 + *
27351 + * Must be called with event_mutex held.
27352   */
27353  int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
27354  {
27355         int ret;
27356  
27357 -       mutex_lock(&event_mutex);
27358 +       lockdep_assert_held(&event_mutex);
27359  
27360         ret = create_event_toplevel_files(parent, tr);
27361         if (ret)
27362 -               goto out_unlock;
27363 +               goto out;
27364  
27365         down_write(&trace_event_sem);
27366         __trace_add_event_dirs(tr);
27367         up_write(&trace_event_sem);
27368  
27369 - out_unlock:
27370 -       mutex_unlock(&event_mutex);
27371 -
27372 + out:
27373         return ret;
27374  }
27375  
27376 @@ -3010,9 +3012,10 @@
27377         return ret;
27378  }
27379  
27380 +/* Must be called with event_mutex held */
27381  int event_trace_del_tracer(struct trace_array *tr)
27382  {
27383 -       mutex_lock(&event_mutex);
27384 +       lockdep_assert_held(&event_mutex);
27385  
27386         /* Disable any event triggers and associated soft-disabled events */
27387         clear_event_triggers(tr);
27388 @@ -3033,8 +3036,6 @@
27389  
27390         tr->event_dir = NULL;
27391  
27392 -       mutex_unlock(&event_mutex);
27393 -
27394         return 0;
27395  }
27396  
27397 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_hist.c linux-4.14/kernel/trace/trace_events_hist.c
27398 --- linux-4.14.orig/kernel/trace/trace_events_hist.c    2018-09-05 11:03:22.000000000 +0200
27399 +++ linux-4.14/kernel/trace/trace_events_hist.c 2018-09-05 11:05:07.000000000 +0200
27400 @@ -20,13 +20,39 @@
27401  #include <linux/slab.h>
27402  #include <linux/stacktrace.h>
27403  #include <linux/rculist.h>
27404 +#include <linux/tracefs.h>
27405  
27406  #include "tracing_map.h"
27407  #include "trace.h"
27408  
27409 +#define SYNTH_SYSTEM           "synthetic"
27410 +#define SYNTH_FIELDS_MAX       16
27411 +
27412 +#define STR_VAR_LEN_MAX                32 /* must be multiple of sizeof(u64) */
27413 +
27414  struct hist_field;
27415  
27416 -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
27417 +typedef u64 (*hist_field_fn_t) (struct hist_field *field,
27418 +                               struct tracing_map_elt *elt,
27419 +                               struct ring_buffer_event *rbe,
27420 +                               void *event);
27421 +
27422 +#define HIST_FIELD_OPERANDS_MAX        2
27423 +#define HIST_FIELDS_MAX                (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
27424 +#define HIST_ACTIONS_MAX       8
27425 +
27426 +enum field_op_id {
27427 +       FIELD_OP_NONE,
27428 +       FIELD_OP_PLUS,
27429 +       FIELD_OP_MINUS,
27430 +       FIELD_OP_UNARY_MINUS,
27431 +};
27432 +
27433 +struct hist_var {
27434 +       char                            *name;
27435 +       struct hist_trigger_data        *hist_data;
27436 +       unsigned int                    idx;
27437 +};
27438  
27439  struct hist_field {
27440         struct ftrace_event_field       *field;
27441 @@ -34,26 +60,50 @@
27442         hist_field_fn_t                 fn;
27443         unsigned int                    size;
27444         unsigned int                    offset;
27445 +       unsigned int                    is_signed;
27446 +       const char                      *type;
27447 +       struct hist_field               *operands[HIST_FIELD_OPERANDS_MAX];
27448 +       struct hist_trigger_data        *hist_data;
27449 +       struct hist_var                 var;
27450 +       enum field_op_id                operator;
27451 +       char                            *system;
27452 +       char                            *event_name;
27453 +       char                            *name;
27454 +       unsigned int                    var_idx;
27455 +       unsigned int                    var_ref_idx;
27456 +       bool                            read_once;
27457  };
27458  
27459 -static u64 hist_field_none(struct hist_field *field, void *event)
27460 +static u64 hist_field_none(struct hist_field *field,
27461 +                          struct tracing_map_elt *elt,
27462 +                          struct ring_buffer_event *rbe,
27463 +                          void *event)
27464  {
27465         return 0;
27466  }
27467  
27468 -static u64 hist_field_counter(struct hist_field *field, void *event)
27469 +static u64 hist_field_counter(struct hist_field *field,
27470 +                             struct tracing_map_elt *elt,
27471 +                             struct ring_buffer_event *rbe,
27472 +                             void *event)
27473  {
27474         return 1;
27475  }
27476  
27477 -static u64 hist_field_string(struct hist_field *hist_field, void *event)
27478 +static u64 hist_field_string(struct hist_field *hist_field,
27479 +                            struct tracing_map_elt *elt,
27480 +                            struct ring_buffer_event *rbe,
27481 +                            void *event)
27482  {
27483         char *addr = (char *)(event + hist_field->field->offset);
27484  
27485         return (u64)(unsigned long)addr;
27486  }
27487  
27488 -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
27489 +static u64 hist_field_dynstring(struct hist_field *hist_field,
27490 +                               struct tracing_map_elt *elt,
27491 +                               struct ring_buffer_event *rbe,
27492 +                               void *event)
27493  {
27494         u32 str_item = *(u32 *)(event + hist_field->field->offset);
27495         int str_loc = str_item & 0xffff;
27496 @@ -62,22 +112,74 @@
27497         return (u64)(unsigned long)addr;
27498  }
27499  
27500 -static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
27501 +static u64 hist_field_pstring(struct hist_field *hist_field,
27502 +                             struct tracing_map_elt *elt,
27503 +                             struct ring_buffer_event *rbe,
27504 +                             void *event)
27505  {
27506         char **addr = (char **)(event + hist_field->field->offset);
27507  
27508         return (u64)(unsigned long)*addr;
27509  }
27510  
27511 -static u64 hist_field_log2(struct hist_field *hist_field, void *event)
27512 +static u64 hist_field_log2(struct hist_field *hist_field,
27513 +                          struct tracing_map_elt *elt,
27514 +                          struct ring_buffer_event *rbe,
27515 +                          void *event)
27516  {
27517 -       u64 val = *(u64 *)(event + hist_field->field->offset);
27518 +       struct hist_field *operand = hist_field->operands[0];
27519 +
27520 +       u64 val = operand->fn(operand, elt, rbe, event);
27521  
27522         return (u64) ilog2(roundup_pow_of_two(val));
27523  }
27524  
27525 +static u64 hist_field_plus(struct hist_field *hist_field,
27526 +                          struct tracing_map_elt *elt,
27527 +                          struct ring_buffer_event *rbe,
27528 +                          void *event)
27529 +{
27530 +       struct hist_field *operand1 = hist_field->operands[0];
27531 +       struct hist_field *operand2 = hist_field->operands[1];
27532 +
27533 +       u64 val1 = operand1->fn(operand1, elt, rbe, event);
27534 +       u64 val2 = operand2->fn(operand2, elt, rbe, event);
27535 +
27536 +       return val1 + val2;
27537 +}
27538 +
27539 +static u64 hist_field_minus(struct hist_field *hist_field,
27540 +                           struct tracing_map_elt *elt,
27541 +                           struct ring_buffer_event *rbe,
27542 +                           void *event)
27543 +{
27544 +       struct hist_field *operand1 = hist_field->operands[0];
27545 +       struct hist_field *operand2 = hist_field->operands[1];
27546 +
27547 +       u64 val1 = operand1->fn(operand1, elt, rbe, event);
27548 +       u64 val2 = operand2->fn(operand2, elt, rbe, event);
27549 +
27550 +       return val1 - val2;
27551 +}
27552 +
27553 +static u64 hist_field_unary_minus(struct hist_field *hist_field,
27554 +                                 struct tracing_map_elt *elt,
27555 +                                 struct ring_buffer_event *rbe,
27556 +                                 void *event)
27557 +{
27558 +       struct hist_field *operand = hist_field->operands[0];
27559 +
27560 +       s64 sval = (s64)operand->fn(operand, elt, rbe, event);
27561 +       u64 val = (u64)-sval;
27562 +
27563 +       return val;
27564 +}
27565 +
27566  #define DEFINE_HIST_FIELD_FN(type)                                     \
27567 -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
27568 +       static u64 hist_field_##type(struct hist_field *hist_field,     \
27569 +                                    struct tracing_map_elt *elt,       \
27570 +                                    struct ring_buffer_event *rbe,     \
27571 +                                    void *event)                       \
27572  {                                                                      \
27573         type *addr = (type *)(event + hist_field->field->offset);       \
27574                                                                         \
27575 @@ -110,16 +212,29 @@
27576  #define HIST_KEY_SIZE_MAX      (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
27577  
27578  enum hist_field_flags {
27579 -       HIST_FIELD_FL_HITCOUNT          = 1,
27580 -       HIST_FIELD_FL_KEY               = 2,
27581 -       HIST_FIELD_FL_STRING            = 4,
27582 -       HIST_FIELD_FL_HEX               = 8,
27583 -       HIST_FIELD_FL_SYM               = 16,
27584 -       HIST_FIELD_FL_SYM_OFFSET        = 32,
27585 -       HIST_FIELD_FL_EXECNAME          = 64,
27586 -       HIST_FIELD_FL_SYSCALL           = 128,
27587 -       HIST_FIELD_FL_STACKTRACE        = 256,
27588 -       HIST_FIELD_FL_LOG2              = 512,
27589 +       HIST_FIELD_FL_HITCOUNT          = 1 << 0,
27590 +       HIST_FIELD_FL_KEY               = 1 << 1,
27591 +       HIST_FIELD_FL_STRING            = 1 << 2,
27592 +       HIST_FIELD_FL_HEX               = 1 << 3,
27593 +       HIST_FIELD_FL_SYM               = 1 << 4,
27594 +       HIST_FIELD_FL_SYM_OFFSET        = 1 << 5,
27595 +       HIST_FIELD_FL_EXECNAME          = 1 << 6,
27596 +       HIST_FIELD_FL_SYSCALL           = 1 << 7,
27597 +       HIST_FIELD_FL_STACKTRACE        = 1 << 8,
27598 +       HIST_FIELD_FL_LOG2              = 1 << 9,
27599 +       HIST_FIELD_FL_TIMESTAMP         = 1 << 10,
27600 +       HIST_FIELD_FL_TIMESTAMP_USECS   = 1 << 11,
27601 +       HIST_FIELD_FL_VAR               = 1 << 12,
27602 +       HIST_FIELD_FL_EXPR              = 1 << 13,
27603 +       HIST_FIELD_FL_VAR_REF           = 1 << 14,
27604 +       HIST_FIELD_FL_CPU               = 1 << 15,
27605 +       HIST_FIELD_FL_ALIAS             = 1 << 16,
27606 +};
27607 +
27608 +struct var_defs {
27609 +       unsigned int    n_vars;
27610 +       char            *name[TRACING_MAP_VARS_MAX];
27611 +       char            *expr[TRACING_MAP_VARS_MAX];
27612  };
27613  
27614  struct hist_trigger_attrs {
27615 @@ -127,25 +242,1474 @@
27616         char            *vals_str;
27617         char            *sort_key_str;
27618         char            *name;
27619 +       char            *clock;
27620         bool            pause;
27621         bool            cont;
27622         bool            clear;
27623 +       bool            ts_in_usecs;
27624         unsigned int    map_bits;
27625 +
27626 +       char            *assignment_str[TRACING_MAP_VARS_MAX];
27627 +       unsigned int    n_assignments;
27628 +
27629 +       char            *action_str[HIST_ACTIONS_MAX];
27630 +       unsigned int    n_actions;
27631 +
27632 +       struct var_defs var_defs;
27633 +};
27634 +
27635 +struct field_var {
27636 +       struct hist_field       *var;
27637 +       struct hist_field       *val;
27638 +};
27639 +
27640 +struct field_var_hist {
27641 +       struct hist_trigger_data        *hist_data;
27642 +       char                            *cmd;
27643  };
27644  
27645  struct hist_trigger_data {
27646 -       struct hist_field               *fields[TRACING_MAP_FIELDS_MAX];
27647 +       struct hist_field               *fields[HIST_FIELDS_MAX];
27648         unsigned int                    n_vals;
27649         unsigned int                    n_keys;
27650         unsigned int                    n_fields;
27651 +       unsigned int                    n_vars;
27652         unsigned int                    key_size;
27653         struct tracing_map_sort_key     sort_keys[TRACING_MAP_SORT_KEYS_MAX];
27654         unsigned int                    n_sort_keys;
27655         struct trace_event_file         *event_file;
27656         struct hist_trigger_attrs       *attrs;
27657         struct tracing_map              *map;
27658 +       bool                            enable_timestamps;
27659 +       bool                            remove;
27660 +       struct hist_field               *var_refs[TRACING_MAP_VARS_MAX];
27661 +       unsigned int                    n_var_refs;
27662 +
27663 +       struct action_data              *actions[HIST_ACTIONS_MAX];
27664 +       unsigned int                    n_actions;
27665 +
27666 +       struct hist_field               *synth_var_refs[SYNTH_FIELDS_MAX];
27667 +       unsigned int                    n_synth_var_refs;
27668 +       struct field_var                *field_vars[SYNTH_FIELDS_MAX];
27669 +       unsigned int                    n_field_vars;
27670 +       unsigned int                    n_field_var_str;
27671 +       struct field_var_hist           *field_var_hists[SYNTH_FIELDS_MAX];
27672 +       unsigned int                    n_field_var_hists;
27673 +
27674 +       struct field_var                *max_vars[SYNTH_FIELDS_MAX];
27675 +       unsigned int                    n_max_vars;
27676 +       unsigned int                    n_max_var_str;
27677 +};
27678 +
27679 +struct synth_field {
27680 +       char *type;
27681 +       char *name;
27682 +       size_t size;
27683 +       bool is_signed;
27684 +       bool is_string;
27685 +};
27686 +
27687 +struct synth_event {
27688 +       struct list_head                        list;
27689 +       int                                     ref;
27690 +       char                                    *name;
27691 +       struct synth_field                      **fields;
27692 +       unsigned int                            n_fields;
27693 +       unsigned int                            n_u64;
27694 +       struct trace_event_class                class;
27695 +       struct trace_event_call                 call;
27696 +       struct tracepoint                       *tp;
27697 +};
27698 +
27699 +struct action_data;
27700 +
27701 +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
27702 +                            struct tracing_map_elt *elt, void *rec,
27703 +                            struct ring_buffer_event *rbe,
27704 +                            struct action_data *data, u64 *var_ref_vals);
27705 +
27706 +struct action_data {
27707 +       action_fn_t             fn;
27708 +       unsigned int            n_params;
27709 +       char                    *params[SYNTH_FIELDS_MAX];
27710 +
27711 +       union {
27712 +               struct {
27713 +                       unsigned int            var_ref_idx;
27714 +                       char                    *match_event;
27715 +                       char                    *match_event_system;
27716 +                       char                    *synth_event_name;
27717 +                       struct synth_event      *synth_event;
27718 +               } onmatch;
27719 +
27720 +               struct {
27721 +                       char                    *var_str;
27722 +                       char                    *fn_name;
27723 +                       unsigned int            max_var_ref_idx;
27724 +                       struct hist_field       *max_var;
27725 +                       struct hist_field       *var;
27726 +               } onmax;
27727 +       };
27728 +};
27729 +
27730 +
27731 +static char last_hist_cmd[MAX_FILTER_STR_VAL];
27732 +static char hist_err_str[MAX_FILTER_STR_VAL];
27733 +
27734 +static void last_cmd_set(char *str)
27735 +{
27736 +       if (!str)
27737 +               return;
27738 +
27739 +       strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
27740 +}
27741 +
27742 +static void hist_err(char *str, char *var)
27743 +{
27744 +       int maxlen = MAX_FILTER_STR_VAL - 1;
27745 +
27746 +       if (!str)
27747 +               return;
27748 +
27749 +       if (strlen(hist_err_str))
27750 +               return;
27751 +
27752 +       if (!var)
27753 +               var = "";
27754 +
27755 +       if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
27756 +               return;
27757 +
27758 +       strcat(hist_err_str, str);
27759 +       strcat(hist_err_str, var);
27760 +}
27761 +
27762 +static void hist_err_event(char *str, char *system, char *event, char *var)
27763 +{
27764 +       char err[MAX_FILTER_STR_VAL];
27765 +
27766 +       if (system && var)
27767 +               snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
27768 +       else if (system)
27769 +               snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
27770 +       else
27771 +               strncpy(err, var, MAX_FILTER_STR_VAL);
27772 +
27773 +       hist_err(str, err);
27774 +}
27775 +
27776 +static void hist_err_clear(void)
27777 +{
27778 +       hist_err_str[0] = '\0';
27779 +}
27780 +
27781 +static bool have_hist_err(void)
27782 +{
27783 +       if (strlen(hist_err_str))
27784 +               return true;
27785 +
27786 +       return false;
27787 +}
27788 +
27789 +static LIST_HEAD(synth_event_list);
27790 +static DEFINE_MUTEX(synth_event_mutex);
27791 +
27792 +struct synth_trace_event {
27793 +       struct trace_entry      ent;
27794 +       u64                     fields[];
27795 +};
27796 +
27797 +static int synth_event_define_fields(struct trace_event_call *call)
27798 +{
27799 +       struct synth_trace_event trace;
27800 +       int offset = offsetof(typeof(trace), fields);
27801 +       struct synth_event *event = call->data;
27802 +       unsigned int i, size, n_u64;
27803 +       char *name, *type;
27804 +       bool is_signed;
27805 +       int ret = 0;
27806 +
27807 +       for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
27808 +               size = event->fields[i]->size;
27809 +               is_signed = event->fields[i]->is_signed;
27810 +               type = event->fields[i]->type;
27811 +               name = event->fields[i]->name;
27812 +               ret = trace_define_field(call, type, name, offset, size,
27813 +                                        is_signed, FILTER_OTHER);
27814 +               if (ret)
27815 +                       break;
27816 +
27817 +               if (event->fields[i]->is_string) {
27818 +                       offset += STR_VAR_LEN_MAX;
27819 +                       n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27820 +               } else {
27821 +                       offset += sizeof(u64);
27822 +                       n_u64++;
27823 +               }
27824 +       }
27825 +
27826 +       event->n_u64 = n_u64;
27827 +
27828 +       return ret;
27829 +}
27830 +
27831 +static bool synth_field_signed(char *type)
27832 +{
27833 +       if (strncmp(type, "u", 1) == 0)
27834 +               return false;
27835 +
27836 +       return true;
27837 +}
27838 +
27839 +static int synth_field_is_string(char *type)
27840 +{
27841 +       if (strstr(type, "char[") != NULL)
27842 +               return true;
27843 +
27844 +       return false;
27845 +}
27846 +
27847 +static int synth_field_string_size(char *type)
27848 +{
27849 +       char buf[4], *end, *start;
27850 +       unsigned int len;
27851 +       int size, err;
27852 +
27853 +       start = strstr(type, "char[");
27854 +       if (start == NULL)
27855 +               return -EINVAL;
27856 +       start += strlen("char[");
27857 +
27858 +       end = strchr(type, ']');
27859 +       if (!end || end < start)
27860 +               return -EINVAL;
27861 +
27862 +       len = end - start;
27863 +       if (len > 3)
27864 +               return -EINVAL;
27865 +
27866 +       strncpy(buf, start, len);
27867 +       buf[len] = '\0';
27868 +
27869 +       err = kstrtouint(buf, 0, &size);
27870 +       if (err)
27871 +               return err;
27872 +
27873 +       if (size > STR_VAR_LEN_MAX)
27874 +               return -EINVAL;
27875 +
27876 +       return size;
27877 +}
27878 +
27879 +static int synth_field_size(char *type)
27880 +{
27881 +       int size = 0;
27882 +
27883 +       if (strcmp(type, "s64") == 0)
27884 +               size = sizeof(s64);
27885 +       else if (strcmp(type, "u64") == 0)
27886 +               size = sizeof(u64);
27887 +       else if (strcmp(type, "s32") == 0)
27888 +               size = sizeof(s32);
27889 +       else if (strcmp(type, "u32") == 0)
27890 +               size = sizeof(u32);
27891 +       else if (strcmp(type, "s16") == 0)
27892 +               size = sizeof(s16);
27893 +       else if (strcmp(type, "u16") == 0)
27894 +               size = sizeof(u16);
27895 +       else if (strcmp(type, "s8") == 0)
27896 +               size = sizeof(s8);
27897 +       else if (strcmp(type, "u8") == 0)
27898 +               size = sizeof(u8);
27899 +       else if (strcmp(type, "char") == 0)
27900 +               size = sizeof(char);
27901 +       else if (strcmp(type, "unsigned char") == 0)
27902 +               size = sizeof(unsigned char);
27903 +       else if (strcmp(type, "int") == 0)
27904 +               size = sizeof(int);
27905 +       else if (strcmp(type, "unsigned int") == 0)
27906 +               size = sizeof(unsigned int);
27907 +       else if (strcmp(type, "long") == 0)
27908 +               size = sizeof(long);
27909 +       else if (strcmp(type, "unsigned long") == 0)
27910 +               size = sizeof(unsigned long);
27911 +       else if (strcmp(type, "pid_t") == 0)
27912 +               size = sizeof(pid_t);
27913 +       else if (synth_field_is_string(type))
27914 +               size = synth_field_string_size(type);
27915 +
27916 +       return size;
27917 +}
27918 +
27919 +static const char *synth_field_fmt(char *type)
27920 +{
27921 +       const char *fmt = "%llu";
27922 +
27923 +       if (strcmp(type, "s64") == 0)
27924 +               fmt = "%lld";
27925 +       else if (strcmp(type, "u64") == 0)
27926 +               fmt = "%llu";
27927 +       else if (strcmp(type, "s32") == 0)
27928 +               fmt = "%d";
27929 +       else if (strcmp(type, "u32") == 0)
27930 +               fmt = "%u";
27931 +       else if (strcmp(type, "s16") == 0)
27932 +               fmt = "%d";
27933 +       else if (strcmp(type, "u16") == 0)
27934 +               fmt = "%u";
27935 +       else if (strcmp(type, "s8") == 0)
27936 +               fmt = "%d";
27937 +       else if (strcmp(type, "u8") == 0)
27938 +               fmt = "%u";
27939 +       else if (strcmp(type, "char") == 0)
27940 +               fmt = "%d";
27941 +       else if (strcmp(type, "unsigned char") == 0)
27942 +               fmt = "%u";
27943 +       else if (strcmp(type, "int") == 0)
27944 +               fmt = "%d";
27945 +       else if (strcmp(type, "unsigned int") == 0)
27946 +               fmt = "%u";
27947 +       else if (strcmp(type, "long") == 0)
27948 +               fmt = "%ld";
27949 +       else if (strcmp(type, "unsigned long") == 0)
27950 +               fmt = "%lu";
27951 +       else if (strcmp(type, "pid_t") == 0)
27952 +               fmt = "%d";
27953 +       else if (synth_field_is_string(type))
27954 +               fmt = "%s";
27955 +
27956 +       return fmt;
27957 +}
27958 +
27959 +static enum print_line_t print_synth_event(struct trace_iterator *iter,
27960 +                                          int flags,
27961 +                                          struct trace_event *event)
27962 +{
27963 +       struct trace_array *tr = iter->tr;
27964 +       struct trace_seq *s = &iter->seq;
27965 +       struct synth_trace_event *entry;
27966 +       struct synth_event *se;
27967 +       unsigned int i, n_u64;
27968 +       char print_fmt[32];
27969 +       const char *fmt;
27970 +
27971 +       entry = (struct synth_trace_event *)iter->ent;
27972 +       se = container_of(event, struct synth_event, call.event);
27973 +
27974 +       trace_seq_printf(s, "%s: ", se->name);
27975 +
27976 +       for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
27977 +               if (trace_seq_has_overflowed(s))
27978 +                       goto end;
27979 +
27980 +               fmt = synth_field_fmt(se->fields[i]->type);
27981 +
27982 +               /* parameter types */
27983 +               if (tr->trace_flags & TRACE_ITER_VERBOSE)
27984 +                       trace_seq_printf(s, "%s ", fmt);
27985 +
27986 +               snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
27987 +
27988 +               /* parameter values */
27989 +               if (se->fields[i]->is_string) {
27990 +                       trace_seq_printf(s, print_fmt, se->fields[i]->name,
27991 +                                        (char *)&entry->fields[n_u64],
27992 +                                        i == se->n_fields - 1 ? "" : " ");
27993 +                       n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
27994 +               } else {
27995 +                       trace_seq_printf(s, print_fmt, se->fields[i]->name,
27996 +                                        entry->fields[n_u64],
27997 +                                        i == se->n_fields - 1 ? "" : " ");
27998 +                       n_u64++;
27999 +               }
28000 +       }
28001 +end:
28002 +       trace_seq_putc(s, '\n');
28003 +
28004 +       return trace_handle_return(s);
28005 +}
28006 +
28007 +static struct trace_event_functions synth_event_funcs = {
28008 +       .trace          = print_synth_event
28009 +};
28010 +
28011 +static notrace void trace_event_raw_event_synth(void *__data,
28012 +                                               u64 *var_ref_vals,
28013 +                                               unsigned int var_ref_idx)
28014 +{
28015 +       struct trace_event_file *trace_file = __data;
28016 +       struct synth_trace_event *entry;
28017 +       struct trace_event_buffer fbuffer;
28018 +       struct ring_buffer *buffer;
28019 +       struct synth_event *event;
28020 +       unsigned int i, n_u64;
28021 +       int fields_size = 0;
28022 +
28023 +       event = trace_file->event_call->data;
28024 +
28025 +       if (trace_trigger_soft_disabled(trace_file))
28026 +               return;
28027 +
28028 +       fields_size = event->n_u64 * sizeof(u64);
28029 +
28030 +       /*
28031 +        * Avoid ring buffer recursion detection, as this event
28032 +        * is being performed within another event.
28033 +        */
28034 +       buffer = trace_file->tr->trace_buffer.buffer;
28035 +       ring_buffer_nest_start(buffer);
28036 +
28037 +       entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28038 +                                          sizeof(*entry) + fields_size);
28039 +       if (!entry)
28040 +               goto out;
28041 +
28042 +       for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28043 +               if (event->fields[i]->is_string) {
28044 +                       char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28045 +                       char *str_field = (char *)&entry->fields[n_u64];
28046 +
28047 +                       strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28048 +                       n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28049 +               } else {
28050 +                       entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
28051 +                       n_u64++;
28052 +               }
28053 +       }
28054 +
28055 +       trace_event_buffer_commit(&fbuffer);
28056 +out:
28057 +       ring_buffer_nest_end(buffer);
28058 +}
28059 +
28060 +static void free_synth_event_print_fmt(struct trace_event_call *call)
28061 +{
28062 +       if (call) {
28063 +               kfree(call->print_fmt);
28064 +               call->print_fmt = NULL;
28065 +       }
28066 +}
28067 +
28068 +static int __set_synth_event_print_fmt(struct synth_event *event,
28069 +                                      char *buf, int len)
28070 +{
28071 +       const char *fmt;
28072 +       int pos = 0;
28073 +       int i;
28074 +
28075 +       /* When len=0, we just calculate the needed length */
28076 +#define LEN_OR_ZERO (len ? len - pos : 0)
28077 +
28078 +       pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
28079 +       for (i = 0; i < event->n_fields; i++) {
28080 +               fmt = synth_field_fmt(event->fields[i]->type);
28081 +               pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
28082 +                               event->fields[i]->name, fmt,
28083 +                               i == event->n_fields - 1 ? "" : ", ");
28084 +       }
28085 +       pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
28086 +
28087 +       for (i = 0; i < event->n_fields; i++) {
28088 +               pos += snprintf(buf + pos, LEN_OR_ZERO,
28089 +                               ", REC->%s", event->fields[i]->name);
28090 +       }
28091 +
28092 +#undef LEN_OR_ZERO
28093 +
28094 +       /* return the length of print_fmt */
28095 +       return pos;
28096 +}
28097 +
28098 +static int set_synth_event_print_fmt(struct trace_event_call *call)
28099 +{
28100 +       struct synth_event *event = call->data;
28101 +       char *print_fmt;
28102 +       int len;
28103 +
28104 +       /* First: called with 0 length to calculate the needed length */
28105 +       len = __set_synth_event_print_fmt(event, NULL, 0);
28106 +
28107 +       print_fmt = kmalloc(len + 1, GFP_KERNEL);
28108 +       if (!print_fmt)
28109 +               return -ENOMEM;
28110 +
28111 +       /* Second: actually write the @print_fmt */
28112 +       __set_synth_event_print_fmt(event, print_fmt, len + 1);
28113 +       call->print_fmt = print_fmt;
28114 +
28115 +       return 0;
28116 +}
28117 +
28118 +static void free_synth_field(struct synth_field *field)
28119 +{
28120 +       kfree(field->type);
28121 +       kfree(field->name);
28122 +       kfree(field);
28123 +}
28124 +
28125 +static struct synth_field *parse_synth_field(char *field_type,
28126 +                                            char *field_name)
28127 +{
28128 +       struct synth_field *field;
28129 +       int len, ret = 0;
28130 +       char *array;
28131 +
28132 +       if (field_type[0] == ';')
28133 +               field_type++;
28134 +
28135 +       len = strlen(field_name);
28136 +       if (field_name[len - 1] == ';')
28137 +               field_name[len - 1] = '\0';
28138 +
28139 +       field = kzalloc(sizeof(*field), GFP_KERNEL);
28140 +       if (!field)
28141 +               return ERR_PTR(-ENOMEM);
28142 +
28143 +       len = strlen(field_type) + 1;
28144 +       array = strchr(field_name, '[');
28145 +       if (array)
28146 +               len += strlen(array);
28147 +       field->type = kzalloc(len, GFP_KERNEL);
28148 +       if (!field->type) {
28149 +               ret = -ENOMEM;
28150 +               goto free;
28151 +       }
28152 +       strcat(field->type, field_type);
28153 +       if (array) {
28154 +               strcat(field->type, array);
28155 +               *array = '\0';
28156 +       }
28157 +
28158 +       field->size = synth_field_size(field->type);
28159 +       if (!field->size) {
28160 +               ret = -EINVAL;
28161 +               goto free;
28162 +       }
28163 +
28164 +       if (synth_field_is_string(field->type))
28165 +               field->is_string = true;
28166 +
28167 +       field->is_signed = synth_field_signed(field->type);
28168 +
28169 +       field->name = kstrdup(field_name, GFP_KERNEL);
28170 +       if (!field->name) {
28171 +               ret = -ENOMEM;
28172 +               goto free;
28173 +       }
28174 + out:
28175 +       return field;
28176 + free:
28177 +       free_synth_field(field);
28178 +       field = ERR_PTR(ret);
28179 +       goto out;
28180 +}
28181 +
28182 +static void free_synth_tracepoint(struct tracepoint *tp)
28183 +{
28184 +       if (!tp)
28185 +               return;
28186 +
28187 +       kfree(tp->name);
28188 +       kfree(tp);
28189 +}
28190 +
28191 +static struct tracepoint *alloc_synth_tracepoint(char *name)
28192 +{
28193 +       struct tracepoint *tp;
28194 +
28195 +       tp = kzalloc(sizeof(*tp), GFP_KERNEL);
28196 +       if (!tp)
28197 +               return ERR_PTR(-ENOMEM);
28198 +
28199 +       tp->name = kstrdup(name, GFP_KERNEL);
28200 +       if (!tp->name) {
28201 +               kfree(tp);
28202 +               return ERR_PTR(-ENOMEM);
28203 +       }
28204 +
28205 +       return tp;
28206 +}
28207 +
28208 +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
28209 +                                   unsigned int var_ref_idx);
28210 +
28211 +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
28212 +                              unsigned int var_ref_idx)
28213 +{
28214 +       struct tracepoint *tp = event->tp;
28215 +
28216 +       if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
28217 +               struct tracepoint_func *probe_func_ptr;
28218 +               synth_probe_func_t probe_func;
28219 +               void *__data;
28220 +
28221 +               if (!(cpu_online(raw_smp_processor_id())))
28222 +                       return;
28223 +
28224 +               probe_func_ptr = rcu_dereference_sched((tp)->funcs);
28225 +               if (probe_func_ptr) {
28226 +                       do {
28227 +                               probe_func = probe_func_ptr->func;
28228 +                               __data = probe_func_ptr->data;
28229 +                               probe_func(__data, var_ref_vals, var_ref_idx);
28230 +                       } while ((++probe_func_ptr)->func);
28231 +               }
28232 +       }
28233 +}
28234 +
28235 +static struct synth_event *find_synth_event(const char *name)
28236 +{
28237 +       struct synth_event *event;
28238 +
28239 +       list_for_each_entry(event, &synth_event_list, list) {
28240 +               if (strcmp(event->name, name) == 0)
28241 +                       return event;
28242 +       }
28243 +
28244 +       return NULL;
28245 +}
28246 +
28247 +static int register_synth_event(struct synth_event *event)
28248 +{
28249 +       struct trace_event_call *call = &event->call;
28250 +       int ret = 0;
28251 +
28252 +       event->call.class = &event->class;
28253 +       event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
28254 +       if (!event->class.system) {
28255 +               ret = -ENOMEM;
28256 +               goto out;
28257 +       }
28258 +
28259 +       event->tp = alloc_synth_tracepoint(event->name);
28260 +       if (IS_ERR(event->tp)) {
28261 +               ret = PTR_ERR(event->tp);
28262 +               event->tp = NULL;
28263 +               goto out;
28264 +       }
28265 +
28266 +       INIT_LIST_HEAD(&call->class->fields);
28267 +       call->event.funcs = &synth_event_funcs;
28268 +       call->class->define_fields = synth_event_define_fields;
28269 +
28270 +       ret = register_trace_event(&call->event);
28271 +       if (!ret) {
28272 +               ret = -ENODEV;
28273 +               goto out;
28274 +       }
28275 +       call->flags = TRACE_EVENT_FL_TRACEPOINT;
28276 +       call->class->reg = trace_event_reg;
28277 +       call->class->probe = trace_event_raw_event_synth;
28278 +       call->data = event;
28279 +       call->tp = event->tp;
28280 +
28281 +       ret = trace_add_event_call(call);
28282 +       if (ret) {
28283 +               pr_warn("Failed to register synthetic event: %s\n",
28284 +                       trace_event_name(call));
28285 +               goto err;
28286 +       }
28287 +
28288 +       ret = set_synth_event_print_fmt(call);
28289 +       if (ret < 0) {
28290 +               trace_remove_event_call(call);
28291 +               goto err;
28292 +       }
28293 + out:
28294 +       return ret;
28295 + err:
28296 +       unregister_trace_event(&call->event);
28297 +       goto out;
28298 +}
28299 +
28300 +static int unregister_synth_event(struct synth_event *event)
28301 +{
28302 +       struct trace_event_call *call = &event->call;
28303 +       int ret;
28304 +
28305 +       ret = trace_remove_event_call(call);
28306 +
28307 +       return ret;
28308 +}
28309 +
28310 +static void free_synth_event(struct synth_event *event)
28311 +{
28312 +       unsigned int i;
28313 +
28314 +       if (!event)
28315 +               return;
28316 +
28317 +       for (i = 0; i < event->n_fields; i++)
28318 +               free_synth_field(event->fields[i]);
28319 +
28320 +       kfree(event->fields);
28321 +       kfree(event->name);
28322 +       kfree(event->class.system);
28323 +       free_synth_tracepoint(event->tp);
28324 +       free_synth_event_print_fmt(&event->call);
28325 +       kfree(event);
28326 +}
28327 +
28328 +static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
28329 +                                            struct synth_field **fields)
28330 +{
28331 +       struct synth_event *event;
28332 +       unsigned int i;
28333 +
28334 +       event = kzalloc(sizeof(*event), GFP_KERNEL);
28335 +       if (!event) {
28336 +               event = ERR_PTR(-ENOMEM);
28337 +               goto out;
28338 +       }
28339 +
28340 +       event->name = kstrdup(event_name, GFP_KERNEL);
28341 +       if (!event->name) {
28342 +               kfree(event);
28343 +               event = ERR_PTR(-ENOMEM);
28344 +               goto out;
28345 +       }
28346 +
28347 +       event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
28348 +       if (!event->fields) {
28349 +               free_synth_event(event);
28350 +               event = ERR_PTR(-ENOMEM);
28351 +               goto out;
28352 +       }
28353 +
28354 +       for (i = 0; i < n_fields; i++)
28355 +               event->fields[i] = fields[i];
28356 +
28357 +       event->n_fields = n_fields;
28358 + out:
28359 +       return event;
28360 +}
28361 +
28362 +static void action_trace(struct hist_trigger_data *hist_data,
28363 +                        struct tracing_map_elt *elt, void *rec,
28364 +                        struct ring_buffer_event *rbe,
28365 +                        struct action_data *data, u64 *var_ref_vals)
28366 +{
28367 +       struct synth_event *event = data->onmatch.synth_event;
28368 +
28369 +       trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
28370 +}
28371 +
28372 +struct hist_var_data {
28373 +       struct list_head list;
28374 +       struct hist_trigger_data *hist_data;
28375 +};
28376 +
28377 +static void add_or_delete_synth_event(struct synth_event *event, int delete)
28378 +{
28379 +       if (delete)
28380 +               free_synth_event(event);
28381 +       else {
28382 +               mutex_lock(&synth_event_mutex);
28383 +               if (!find_synth_event(event->name))
28384 +                       list_add(&event->list, &synth_event_list);
28385 +               else
28386 +                       free_synth_event(event);
28387 +               mutex_unlock(&synth_event_mutex);
28388 +       }
28389 +}
28390 +
28391 +static int create_synth_event(int argc, char **argv)
28392 +{
28393 +       struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
28394 +       struct synth_event *event = NULL;
28395 +       bool delete_event = false;
28396 +       int i, n_fields = 0, ret = 0;
28397 +       char *name;
28398 +
28399 +       mutex_lock(&synth_event_mutex);
28400 +
28401 +       /*
28402 +        * Argument syntax:
28403 +        *  - Add synthetic event: <event_name> field[;field] ...
28404 +        *  - Remove synthetic event: !<event_name> field[;field] ...
28405 +        *      where 'field' = type field_name
28406 +        */
28407 +       if (argc < 1) {
28408 +               ret = -EINVAL;
28409 +               goto out;
28410 +       }
28411 +
28412 +       name = argv[0];
28413 +       if (name[0] == '!') {
28414 +               delete_event = true;
28415 +               name++;
28416 +       }
28417 +
28418 +       event = find_synth_event(name);
28419 +       if (event) {
28420 +               if (delete_event) {
28421 +                       if (event->ref) {
28422 +                               event = NULL;
28423 +                               ret = -EBUSY;
28424 +                               goto out;
28425 +                       }
28426 +                       list_del(&event->list);
28427 +                       goto out;
28428 +               }
28429 +               event = NULL;
28430 +               ret = -EEXIST;
28431 +               goto out;
28432 +       } else if (delete_event)
28433 +               goto out;
28434 +
28435 +       if (argc < 2) {
28436 +               ret = -EINVAL;
28437 +               goto out;
28438 +       }
28439 +
28440 +       for (i = 1; i < argc - 1; i++) {
28441 +               if (strcmp(argv[i], ";") == 0)
28442 +                       continue;
28443 +               if (n_fields == SYNTH_FIELDS_MAX) {
28444 +                       ret = -EINVAL;
28445 +                       goto err;
28446 +               }
28447 +
28448 +               field = parse_synth_field(argv[i], argv[i + 1]);
28449 +               if (IS_ERR(field)) {
28450 +                       ret = PTR_ERR(field);
28451 +                       goto err;
28452 +               }
28453 +               fields[n_fields] = field;
28454 +               i++; n_fields++;
28455 +       }
28456 +
28457 +       if (i < argc) {
28458 +               ret = -EINVAL;
28459 +               goto err;
28460 +       }
28461 +
28462 +       event = alloc_synth_event(name, n_fields, fields);
28463 +       if (IS_ERR(event)) {
28464 +               ret = PTR_ERR(event);
28465 +               event = NULL;
28466 +               goto err;
28467 +       }
28468 + out:
28469 +       mutex_unlock(&synth_event_mutex);
28470 +
28471 +       if (event) {
28472 +               if (delete_event) {
28473 +                       ret = unregister_synth_event(event);
28474 +                       add_or_delete_synth_event(event, !ret);
28475 +               } else {
28476 +                       ret = register_synth_event(event);
28477 +                       add_or_delete_synth_event(event, ret);
28478 +               }
28479 +       }
28480 +
28481 +       return ret;
28482 + err:
28483 +       mutex_unlock(&synth_event_mutex);
28484 +
28485 +       for (i = 0; i < n_fields; i++)
28486 +               free_synth_field(fields[i]);
28487 +       free_synth_event(event);
28488 +
28489 +       return ret;
28490 +}
28491 +
28492 +static int release_all_synth_events(void)
28493 +{
28494 +       struct list_head release_events;
28495 +       struct synth_event *event, *e;
28496 +       int ret = 0;
28497 +
28498 +       INIT_LIST_HEAD(&release_events);
28499 +
28500 +       mutex_lock(&synth_event_mutex);
28501 +
28502 +       list_for_each_entry(event, &synth_event_list, list) {
28503 +               if (event->ref) {
28504 +                       mutex_unlock(&synth_event_mutex);
28505 +                       return -EBUSY;
28506 +               }
28507 +       }
28508 +
28509 +       list_splice_init(&event->list, &release_events);
28510 +
28511 +       mutex_unlock(&synth_event_mutex);
28512 +
28513 +       list_for_each_entry_safe(event, e, &release_events, list) {
28514 +               list_del(&event->list);
28515 +
28516 +               ret = unregister_synth_event(event);
28517 +               add_or_delete_synth_event(event, !ret);
28518 +       }
28519 +
28520 +       return ret;
28521 +}
28522 +
28523 +
28524 +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
28525 +{
28526 +       mutex_lock(&synth_event_mutex);
28527 +
28528 +       return seq_list_start(&synth_event_list, *pos);
28529 +}
28530 +
28531 +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
28532 +{
28533 +       return seq_list_next(v, &synth_event_list, pos);
28534 +}
28535 +
28536 +static void synth_events_seq_stop(struct seq_file *m, void *v)
28537 +{
28538 +       mutex_unlock(&synth_event_mutex);
28539 +}
28540 +
28541 +static int synth_events_seq_show(struct seq_file *m, void *v)
28542 +{
28543 +       struct synth_field *field;
28544 +       struct synth_event *event = v;
28545 +       unsigned int i;
28546 +
28547 +       seq_printf(m, "%s\t", event->name);
28548 +
28549 +       for (i = 0; i < event->n_fields; i++) {
28550 +               field = event->fields[i];
28551 +
28552 +               /* parameter values */
28553 +               seq_printf(m, "%s %s%s", field->type, field->name,
28554 +                          i == event->n_fields - 1 ? "" : "; ");
28555 +       }
28556 +
28557 +       seq_putc(m, '\n');
28558 +
28559 +       return 0;
28560 +}
28561 +
28562 +static const struct seq_operations synth_events_seq_op = {
28563 +       .start  = synth_events_seq_start,
28564 +       .next   = synth_events_seq_next,
28565 +       .stop   = synth_events_seq_stop,
28566 +       .show   = synth_events_seq_show
28567 +};
28568 +
28569 +static int synth_events_open(struct inode *inode, struct file *file)
28570 +{
28571 +       int ret;
28572 +
28573 +       if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
28574 +               ret = release_all_synth_events();
28575 +               if (ret < 0)
28576 +                       return ret;
28577 +       }
28578 +
28579 +       return seq_open(file, &synth_events_seq_op);
28580 +}
28581 +
28582 +static ssize_t synth_events_write(struct file *file,
28583 +                                 const char __user *buffer,
28584 +                                 size_t count, loff_t *ppos)
28585 +{
28586 +       return trace_parse_run_command(file, buffer, count, ppos,
28587 +                                      create_synth_event);
28588 +}
28589 +
28590 +static const struct file_operations synth_events_fops = {
28591 +       .open           = synth_events_open,
28592 +       .write          = synth_events_write,
28593 +       .read           = seq_read,
28594 +       .llseek         = seq_lseek,
28595 +       .release        = seq_release,
28596 +};
28597 +
28598 +static u64 hist_field_timestamp(struct hist_field *hist_field,
28599 +                               struct tracing_map_elt *elt,
28600 +                               struct ring_buffer_event *rbe,
28601 +                               void *event)
28602 +{
28603 +       struct hist_trigger_data *hist_data = hist_field->hist_data;
28604 +       struct trace_array *tr = hist_data->event_file->tr;
28605 +
28606 +       u64 ts = ring_buffer_event_time_stamp(rbe);
28607 +
28608 +       if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
28609 +               ts = ns2usecs(ts);
28610 +
28611 +       return ts;
28612 +}
28613 +
28614 +static u64 hist_field_cpu(struct hist_field *hist_field,
28615 +                         struct tracing_map_elt *elt,
28616 +                         struct ring_buffer_event *rbe,
28617 +                         void *event)
28618 +{
28619 +       int cpu = smp_processor_id();
28620 +
28621 +       return cpu;
28622 +}
28623 +
28624 +static struct hist_field *
28625 +check_field_for_var_ref(struct hist_field *hist_field,
28626 +                       struct hist_trigger_data *var_data,
28627 +                       unsigned int var_idx)
28628 +{
28629 +       struct hist_field *found = NULL;
28630 +
28631 +       if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
28632 +               if (hist_field->var.idx == var_idx &&
28633 +                   hist_field->var.hist_data == var_data) {
28634 +                       found = hist_field;
28635 +               }
28636 +       }
28637 +
28638 +       return found;
28639 +}
28640 +
28641 +static struct hist_field *
28642 +check_field_for_var_refs(struct hist_trigger_data *hist_data,
28643 +                        struct hist_field *hist_field,
28644 +                        struct hist_trigger_data *var_data,
28645 +                        unsigned int var_idx,
28646 +                        unsigned int level)
28647 +{
28648 +       struct hist_field *found = NULL;
28649 +       unsigned int i;
28650 +
28651 +       if (level > 3)
28652 +               return found;
28653 +
28654 +       if (!hist_field)
28655 +               return found;
28656 +
28657 +       found = check_field_for_var_ref(hist_field, var_data, var_idx);
28658 +       if (found)
28659 +               return found;
28660 +
28661 +       for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28662 +               struct hist_field *operand;
28663 +
28664 +               operand = hist_field->operands[i];
28665 +               found = check_field_for_var_refs(hist_data, operand, var_data,
28666 +                                                var_idx, level + 1);
28667 +               if (found)
28668 +                       return found;
28669 +       }
28670 +
28671 +       return found;
28672 +}
28673 +
28674 +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
28675 +                                      struct hist_trigger_data *var_data,
28676 +                                      unsigned int var_idx)
28677 +{
28678 +       struct hist_field *hist_field, *found = NULL;
28679 +       unsigned int i;
28680 +
28681 +       for_each_hist_field(i, hist_data) {
28682 +               hist_field = hist_data->fields[i];
28683 +               found = check_field_for_var_refs(hist_data, hist_field,
28684 +                                                var_data, var_idx, 0);
28685 +               if (found)
28686 +                       return found;
28687 +       }
28688 +
28689 +       for (i = 0; i < hist_data->n_synth_var_refs; i++) {
28690 +               hist_field = hist_data->synth_var_refs[i];
28691 +               found = check_field_for_var_refs(hist_data, hist_field,
28692 +                                                var_data, var_idx, 0);
28693 +               if (found)
28694 +                       return found;
28695 +       }
28696 +
28697 +       return found;
28698 +}
28699 +
28700 +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
28701 +                                          unsigned int var_idx)
28702 +{
28703 +       struct trace_array *tr = hist_data->event_file->tr;
28704 +       struct hist_field *found = NULL;
28705 +       struct hist_var_data *var_data;
28706 +
28707 +       list_for_each_entry(var_data, &tr->hist_vars, list) {
28708 +               if (var_data->hist_data == hist_data)
28709 +                       continue;
28710 +               found = find_var_ref(var_data->hist_data, hist_data, var_idx);
28711 +               if (found)
28712 +                       break;
28713 +       }
28714 +
28715 +       return found;
28716 +}
28717 +
28718 +static bool check_var_refs(struct hist_trigger_data *hist_data)
28719 +{
28720 +       struct hist_field *field;
28721 +       bool found = false;
28722 +       int i;
28723 +
28724 +       for_each_hist_field(i, hist_data) {
28725 +               field = hist_data->fields[i];
28726 +               if (field && field->flags & HIST_FIELD_FL_VAR) {
28727 +                       if (find_any_var_ref(hist_data, field->var.idx)) {
28728 +                               found = true;
28729 +                               break;
28730 +                       }
28731 +               }
28732 +       }
28733 +
28734 +       return found;
28735 +}
28736 +
28737 +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
28738 +{
28739 +       struct trace_array *tr = hist_data->event_file->tr;
28740 +       struct hist_var_data *var_data, *found = NULL;
28741 +
28742 +       list_for_each_entry(var_data, &tr->hist_vars, list) {
28743 +               if (var_data->hist_data == hist_data) {
28744 +                       found = var_data;
28745 +                       break;
28746 +               }
28747 +       }
28748 +
28749 +       return found;
28750 +}
28751 +
28752 +static bool field_has_hist_vars(struct hist_field *hist_field,
28753 +                               unsigned int level)
28754 +{
28755 +       int i;
28756 +
28757 +       if (level > 3)
28758 +               return false;
28759 +
28760 +       if (!hist_field)
28761 +               return false;
28762 +
28763 +       if (hist_field->flags & HIST_FIELD_FL_VAR ||
28764 +           hist_field->flags & HIST_FIELD_FL_VAR_REF)
28765 +               return true;
28766 +
28767 +       for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
28768 +               struct hist_field *operand;
28769 +
28770 +               operand = hist_field->operands[i];
28771 +               if (field_has_hist_vars(operand, level + 1))
28772 +                       return true;
28773 +       }
28774 +
28775 +       return false;
28776 +}
28777 +
28778 +static bool has_hist_vars(struct hist_trigger_data *hist_data)
28779 +{
28780 +       struct hist_field *hist_field;
28781 +       int i;
28782 +
28783 +       for_each_hist_field(i, hist_data) {
28784 +               hist_field = hist_data->fields[i];
28785 +               if (field_has_hist_vars(hist_field, 0))
28786 +                       return true;
28787 +       }
28788 +
28789 +       return false;
28790 +}
28791 +
28792 +static int save_hist_vars(struct hist_trigger_data *hist_data)
28793 +{
28794 +       struct trace_array *tr = hist_data->event_file->tr;
28795 +       struct hist_var_data *var_data;
28796 +
28797 +       var_data = find_hist_vars(hist_data);
28798 +       if (var_data)
28799 +               return 0;
28800 +
28801 +       if (trace_array_get(tr) < 0)
28802 +               return -ENODEV;
28803 +
28804 +       var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
28805 +       if (!var_data) {
28806 +               trace_array_put(tr);
28807 +               return -ENOMEM;
28808 +       }
28809 +
28810 +       var_data->hist_data = hist_data;
28811 +       list_add(&var_data->list, &tr->hist_vars);
28812 +
28813 +       return 0;
28814 +}
28815 +
28816 +static void remove_hist_vars(struct hist_trigger_data *hist_data)
28817 +{
28818 +       struct trace_array *tr = hist_data->event_file->tr;
28819 +       struct hist_var_data *var_data;
28820 +
28821 +       var_data = find_hist_vars(hist_data);
28822 +       if (!var_data)
28823 +               return;
28824 +
28825 +       if (WARN_ON(check_var_refs(hist_data)))
28826 +               return;
28827 +
28828 +       list_del(&var_data->list);
28829 +
28830 +       kfree(var_data);
28831 +
28832 +       trace_array_put(tr);
28833 +}
28834 +
28835 +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
28836 +                                        const char *var_name)
28837 +{
28838 +       struct hist_field *hist_field, *found = NULL;
28839 +       int i;
28840 +
28841 +       for_each_hist_field(i, hist_data) {
28842 +               hist_field = hist_data->fields[i];
28843 +               if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
28844 +                   strcmp(hist_field->var.name, var_name) == 0) {
28845 +                       found = hist_field;
28846 +                       break;
28847 +               }
28848 +       }
28849 +
28850 +       return found;
28851 +}
28852 +
28853 +static struct hist_field *find_var(struct hist_trigger_data *hist_data,
28854 +                                  struct trace_event_file *file,
28855 +                                  const char *var_name)
28856 +{
28857 +       struct hist_trigger_data *test_data;
28858 +       struct event_trigger_data *test;
28859 +       struct hist_field *hist_field;
28860 +
28861 +       hist_field = find_var_field(hist_data, var_name);
28862 +       if (hist_field)
28863 +               return hist_field;
28864 +
28865 +       list_for_each_entry_rcu(test, &file->triggers, list) {
28866 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28867 +                       test_data = test->private_data;
28868 +                       hist_field = find_var_field(test_data, var_name);
28869 +                       if (hist_field)
28870 +                               return hist_field;
28871 +               }
28872 +       }
28873 +
28874 +       return NULL;
28875 +}
28876 +
28877 +static struct trace_event_file *find_var_file(struct trace_array *tr,
28878 +                                             char *system,
28879 +                                             char *event_name,
28880 +                                             char *var_name)
28881 +{
28882 +       struct hist_trigger_data *var_hist_data;
28883 +       struct hist_var_data *var_data;
28884 +       struct trace_event_file *file, *found = NULL;
28885 +
28886 +       if (system)
28887 +               return find_event_file(tr, system, event_name);
28888 +
28889 +       list_for_each_entry(var_data, &tr->hist_vars, list) {
28890 +               var_hist_data = var_data->hist_data;
28891 +               file = var_hist_data->event_file;
28892 +               if (file == found)
28893 +                       continue;
28894 +
28895 +               if (find_var_field(var_hist_data, var_name)) {
28896 +                       if (found) {
28897 +                               hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28898 +                               return NULL;
28899 +                       }
28900 +
28901 +                       found = file;
28902 +               }
28903 +       }
28904 +
28905 +       return found;
28906 +}
28907 +
28908 +static struct hist_field *find_file_var(struct trace_event_file *file,
28909 +                                       const char *var_name)
28910 +{
28911 +       struct hist_trigger_data *test_data;
28912 +       struct event_trigger_data *test;
28913 +       struct hist_field *hist_field;
28914 +
28915 +       list_for_each_entry_rcu(test, &file->triggers, list) {
28916 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
28917 +                       test_data = test->private_data;
28918 +                       hist_field = find_var_field(test_data, var_name);
28919 +                       if (hist_field)
28920 +                               return hist_field;
28921 +               }
28922 +       }
28923 +
28924 +       return NULL;
28925 +}
28926 +
28927 +static struct hist_field *
28928 +find_match_var(struct hist_trigger_data *hist_data, char *var_name)
28929 +{
28930 +       struct trace_array *tr = hist_data->event_file->tr;
28931 +       struct hist_field *hist_field, *found = NULL;
28932 +       struct trace_event_file *file;
28933 +       unsigned int i;
28934 +
28935 +       for (i = 0; i < hist_data->n_actions; i++) {
28936 +               struct action_data *data = hist_data->actions[i];
28937 +
28938 +               if (data->fn == action_trace) {
28939 +                       char *system = data->onmatch.match_event_system;
28940 +                       char *event_name = data->onmatch.match_event;
28941 +
28942 +                       file = find_var_file(tr, system, event_name, var_name);
28943 +                       if (!file)
28944 +                               continue;
28945 +                       hist_field = find_file_var(file, var_name);
28946 +                       if (hist_field) {
28947 +                               if (found) {
28948 +                                       hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
28949 +                                       return ERR_PTR(-EINVAL);
28950 +                               }
28951 +
28952 +                               found = hist_field;
28953 +                       }
28954 +               }
28955 +       }
28956 +       return found;
28957 +}
28958 +
28959 +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
28960 +                                        char *system,
28961 +                                        char *event_name,
28962 +                                        char *var_name)
28963 +{
28964 +       struct trace_array *tr = hist_data->event_file->tr;
28965 +       struct hist_field *hist_field = NULL;
28966 +       struct trace_event_file *file;
28967 +
28968 +       if (!system || !event_name) {
28969 +               hist_field = find_match_var(hist_data, var_name);
28970 +               if (IS_ERR(hist_field))
28971 +                       return NULL;
28972 +               if (hist_field)
28973 +                       return hist_field;
28974 +       }
28975 +
28976 +       file = find_var_file(tr, system, event_name, var_name);
28977 +       if (!file)
28978 +               return NULL;
28979 +
28980 +       hist_field = find_file_var(file, var_name);
28981 +
28982 +       return hist_field;
28983 +}
28984 +
28985 +struct hist_elt_data {
28986 +       char *comm;
28987 +       u64 *var_ref_vals;
28988 +       char *field_var_str[SYNTH_FIELDS_MAX];
28989  };
28990  
28991 +static u64 hist_field_var_ref(struct hist_field *hist_field,
28992 +                             struct tracing_map_elt *elt,
28993 +                             struct ring_buffer_event *rbe,
28994 +                             void *event)
28995 +{
28996 +       struct hist_elt_data *elt_data;
28997 +       u64 var_val = 0;
28998 +
28999 +       elt_data = elt->private_data;
29000 +       var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29001 +
29002 +       return var_val;
29003 +}
29004 +
29005 +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29006 +                            u64 *var_ref_vals, bool self)
29007 +{
29008 +       struct hist_trigger_data *var_data;
29009 +       struct tracing_map_elt *var_elt;
29010 +       struct hist_field *hist_field;
29011 +       unsigned int i, var_idx;
29012 +       bool resolved = true;
29013 +       u64 var_val = 0;
29014 +
29015 +       for (i = 0; i < hist_data->n_var_refs; i++) {
29016 +               hist_field = hist_data->var_refs[i];
29017 +               var_idx = hist_field->var.idx;
29018 +               var_data = hist_field->var.hist_data;
29019 +
29020 +               if (var_data == NULL) {
29021 +                       resolved = false;
29022 +                       break;
29023 +               }
29024 +
29025 +               if ((self && var_data != hist_data) ||
29026 +                   (!self && var_data == hist_data))
29027 +                       continue;
29028 +
29029 +               var_elt = tracing_map_lookup(var_data->map, key);
29030 +               if (!var_elt) {
29031 +                       resolved = false;
29032 +                       break;
29033 +               }
29034 +
29035 +               if (!tracing_map_var_set(var_elt, var_idx)) {
29036 +                       resolved = false;
29037 +                       break;
29038 +               }
29039 +
29040 +               if (self || !hist_field->read_once)
29041 +                       var_val = tracing_map_read_var(var_elt, var_idx);
29042 +               else
29043 +                       var_val = tracing_map_read_var_once(var_elt, var_idx);
29044 +
29045 +               var_ref_vals[i] = var_val;
29046 +       }
29047 +
29048 +       return resolved;
29049 +}
29050 +
29051 +static const char *hist_field_name(struct hist_field *field,
29052 +                                  unsigned int level)
29053 +{
29054 +       const char *field_name = "";
29055 +
29056 +       if (level > 1)
29057 +               return field_name;
29058 +
29059 +       if (field->field)
29060 +               field_name = field->field->name;
29061 +       else if (field->flags & HIST_FIELD_FL_LOG2 ||
29062 +                field->flags & HIST_FIELD_FL_ALIAS)
29063 +               field_name = hist_field_name(field->operands[0], ++level);
29064 +       else if (field->flags & HIST_FIELD_FL_CPU)
29065 +               field_name = "cpu";
29066 +       else if (field->flags & HIST_FIELD_FL_EXPR ||
29067 +                field->flags & HIST_FIELD_FL_VAR_REF) {
29068 +               if (field->system) {
29069 +                       static char full_name[MAX_FILTER_STR_VAL];
29070 +
29071 +                       strcat(full_name, field->system);
29072 +                       strcat(full_name, ".");
29073 +                       strcat(full_name, field->event_name);
29074 +                       strcat(full_name, ".");
29075 +                       strcat(full_name, field->name);
29076 +                       field_name = full_name;
29077 +               } else
29078 +                       field_name = field->name;
29079 +       } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
29080 +               field_name = "common_timestamp";
29081 +
29082 +       if (field_name == NULL)
29083 +               field_name = "";
29084 +
29085 +       return field_name;
29086 +}
29087 +
29088  static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
29089  {
29090         hist_field_fn_t fn = NULL;
29091 @@ -207,16 +1771,119 @@
29092  
29093  static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
29094  {
29095 +       unsigned int i;
29096 +
29097         if (!attrs)
29098                 return;
29099  
29100 +       for (i = 0; i < attrs->n_assignments; i++)
29101 +               kfree(attrs->assignment_str[i]);
29102 +
29103 +       for (i = 0; i < attrs->n_actions; i++)
29104 +               kfree(attrs->action_str[i]);
29105 +
29106         kfree(attrs->name);
29107         kfree(attrs->sort_key_str);
29108         kfree(attrs->keys_str);
29109         kfree(attrs->vals_str);
29110 +       kfree(attrs->clock);
29111         kfree(attrs);
29112  }
29113  
29114 +static int parse_action(char *str, struct hist_trigger_attrs *attrs)
29115 +{
29116 +       int ret = -EINVAL;
29117 +
29118 +       if (attrs->n_actions >= HIST_ACTIONS_MAX)
29119 +               return ret;
29120 +
29121 +       if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
29122 +           (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
29123 +               attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
29124 +               if (!attrs->action_str[attrs->n_actions]) {
29125 +                       ret = -ENOMEM;
29126 +                       return ret;
29127 +               }
29128 +               attrs->n_actions++;
29129 +               ret = 0;
29130 +       }
29131 +
29132 +       return ret;
29133 +}
29134 +
29135 +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
29136 +{
29137 +       int ret = 0;
29138 +
29139 +       if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29140 +           (strncmp(str, "keys=", strlen("keys=")) == 0)) {
29141 +               attrs->keys_str = kstrdup(str, GFP_KERNEL);
29142 +               if (!attrs->keys_str) {
29143 +                       ret = -ENOMEM;
29144 +                       goto out;
29145 +               }
29146 +       } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29147 +                (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29148 +                (strncmp(str, "values=", strlen("values=")) == 0)) {
29149 +               attrs->vals_str = kstrdup(str, GFP_KERNEL);
29150 +               if (!attrs->vals_str) {
29151 +                       ret = -ENOMEM;
29152 +                       goto out;
29153 +               }
29154 +       } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
29155 +               attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29156 +               if (!attrs->sort_key_str) {
29157 +                       ret = -ENOMEM;
29158 +                       goto out;
29159 +               }
29160 +       } else if (strncmp(str, "name=", strlen("name=")) == 0) {
29161 +               attrs->name = kstrdup(str, GFP_KERNEL);
29162 +               if (!attrs->name) {
29163 +                       ret = -ENOMEM;
29164 +                       goto out;
29165 +               }
29166 +       } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
29167 +               strsep(&str, "=");
29168 +               if (!str) {
29169 +                       ret = -EINVAL;
29170 +                       goto out;
29171 +               }
29172 +
29173 +               str = strstrip(str);
29174 +               attrs->clock = kstrdup(str, GFP_KERNEL);
29175 +               if (!attrs->clock) {
29176 +                       ret = -ENOMEM;
29177 +                       goto out;
29178 +               }
29179 +       } else if (strncmp(str, "size=", strlen("size=")) == 0) {
29180 +               int map_bits = parse_map_size(str);
29181 +
29182 +               if (map_bits < 0) {
29183 +                       ret = map_bits;
29184 +                       goto out;
29185 +               }
29186 +               attrs->map_bits = map_bits;
29187 +       } else {
29188 +               char *assignment;
29189 +
29190 +               if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
29191 +                       hist_err("Too many variables defined: ", str);
29192 +                       ret = -EINVAL;
29193 +                       goto out;
29194 +               }
29195 +
29196 +               assignment = kstrdup(str, GFP_KERNEL);
29197 +               if (!assignment) {
29198 +                       ret = -ENOMEM;
29199 +                       goto out;
29200 +               }
29201 +
29202 +               attrs->assignment_str[attrs->n_assignments++] = assignment;
29203 +       }
29204 + out:
29205 +       return ret;
29206 +}
29207 +
29208  static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
29209  {
29210         struct hist_trigger_attrs *attrs;
29211 @@ -229,35 +1896,21 @@
29212         while (trigger_str) {
29213                 char *str = strsep(&trigger_str, ":");
29214  
29215 -               if ((strncmp(str, "key=", strlen("key=")) == 0) ||
29216 -                   (strncmp(str, "keys=", strlen("keys=")) == 0))
29217 -                       attrs->keys_str = kstrdup(str, GFP_KERNEL);
29218 -               else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
29219 -                        (strncmp(str, "vals=", strlen("vals=")) == 0) ||
29220 -                        (strncmp(str, "values=", strlen("values=")) == 0))
29221 -                       attrs->vals_str = kstrdup(str, GFP_KERNEL);
29222 -               else if (strncmp(str, "sort=", strlen("sort=")) == 0)
29223 -                       attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
29224 -               else if (strncmp(str, "name=", strlen("name=")) == 0)
29225 -                       attrs->name = kstrdup(str, GFP_KERNEL);
29226 -               else if (strcmp(str, "pause") == 0)
29227 +               if (strchr(str, '=')) {
29228 +                       ret = parse_assignment(str, attrs);
29229 +                       if (ret)
29230 +                               goto free;
29231 +               } else if (strcmp(str, "pause") == 0)
29232                         attrs->pause = true;
29233                 else if ((strcmp(str, "cont") == 0) ||
29234                          (strcmp(str, "continue") == 0))
29235                         attrs->cont = true;
29236                 else if (strcmp(str, "clear") == 0)
29237                         attrs->clear = true;
29238 -               else if (strncmp(str, "size=", strlen("size=")) == 0) {
29239 -                       int map_bits = parse_map_size(str);
29240 -
29241 -                       if (map_bits < 0) {
29242 -                               ret = map_bits;
29243 +               else {
29244 +                       ret = parse_action(str, attrs);
29245 +                       if (ret)
29246                                 goto free;
29247 -                       }
29248 -                       attrs->map_bits = map_bits;
29249 -               } else {
29250 -                       ret = -EINVAL;
29251 -                       goto free;
29252                 }
29253         }
29254  
29255 @@ -266,6 +1919,14 @@
29256                 goto free;
29257         }
29258  
29259 +       if (!attrs->clock) {
29260 +               attrs->clock = kstrdup("global", GFP_KERNEL);
29261 +               if (!attrs->clock) {
29262 +                       ret = -ENOMEM;
29263 +                       goto free;
29264 +               }
29265 +       }
29266 +
29267         return attrs;
29268   free:
29269         destroy_hist_trigger_attrs(attrs);
29270 @@ -288,65 +1949,222 @@
29271         memcpy(comm, task->comm, TASK_COMM_LEN);
29272  }
29273  
29274 -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
29275 +static void hist_elt_data_free(struct hist_elt_data *elt_data)
29276  {
29277 -       kfree((char *)elt->private_data);
29278 +       unsigned int i;
29279 +
29280 +       for (i = 0; i < SYNTH_FIELDS_MAX; i++)
29281 +               kfree(elt_data->field_var_str[i]);
29282 +
29283 +       kfree(elt_data->comm);
29284 +       kfree(elt_data);
29285  }
29286  
29287 -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
29288 +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
29289 +{
29290 +       struct hist_elt_data *elt_data = elt->private_data;
29291 +
29292 +       hist_elt_data_free(elt_data);
29293 +}
29294 +
29295 +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
29296  {
29297         struct hist_trigger_data *hist_data = elt->map->private_data;
29298 +       unsigned int size = TASK_COMM_LEN;
29299 +       struct hist_elt_data *elt_data;
29300         struct hist_field *key_field;
29301 -       unsigned int i;
29302 +       unsigned int i, n_str;
29303 +
29304 +       elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
29305 +       if (!elt_data)
29306 +               return -ENOMEM;
29307  
29308         for_each_hist_key_field(i, hist_data) {
29309                 key_field = hist_data->fields[i];
29310  
29311                 if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
29312 -                       unsigned int size = TASK_COMM_LEN + 1;
29313 -
29314 -                       elt->private_data = kzalloc(size, GFP_KERNEL);
29315 -                       if (!elt->private_data)
29316 +                       elt_data->comm = kzalloc(size, GFP_KERNEL);
29317 +                       if (!elt_data->comm) {
29318 +                               kfree(elt_data);
29319                                 return -ENOMEM;
29320 +                       }
29321                         break;
29322                 }
29323         }
29324  
29325 +       n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
29326 +
29327 +       size = STR_VAR_LEN_MAX;
29328 +
29329 +       for (i = 0; i < n_str; i++) {
29330 +               elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
29331 +               if (!elt_data->field_var_str[i]) {
29332 +                       hist_elt_data_free(elt_data);
29333 +                       return -ENOMEM;
29334 +               }
29335 +       }
29336 +
29337 +       elt->private_data = elt_data;
29338 +
29339         return 0;
29340  }
29341  
29342 -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
29343 -                                      struct tracing_map_elt *from)
29344 +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
29345  {
29346 -       char *comm_from = from->private_data;
29347 -       char *comm_to = to->private_data;
29348 +       struct hist_elt_data *elt_data = elt->private_data;
29349  
29350 -       if (comm_from)
29351 -               memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
29352 +       if (elt_data->comm)
29353 +               save_comm(elt_data->comm, current);
29354  }
29355  
29356 -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
29357 +static const struct tracing_map_ops hist_trigger_elt_data_ops = {
29358 +       .elt_alloc      = hist_trigger_elt_data_alloc,
29359 +       .elt_free       = hist_trigger_elt_data_free,
29360 +       .elt_init       = hist_trigger_elt_data_init,
29361 +};
29362 +
29363 +static const char *get_hist_field_flags(struct hist_field *hist_field)
29364  {
29365 -       char *comm = elt->private_data;
29366 +       const char *flags_str = NULL;
29367  
29368 -       if (comm)
29369 -               save_comm(comm, current);
29370 +       if (hist_field->flags & HIST_FIELD_FL_HEX)
29371 +               flags_str = "hex";
29372 +       else if (hist_field->flags & HIST_FIELD_FL_SYM)
29373 +               flags_str = "sym";
29374 +       else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
29375 +               flags_str = "sym-offset";
29376 +       else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
29377 +               flags_str = "execname";
29378 +       else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
29379 +               flags_str = "syscall";
29380 +       else if (hist_field->flags & HIST_FIELD_FL_LOG2)
29381 +               flags_str = "log2";
29382 +       else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29383 +               flags_str = "usecs";
29384 +
29385 +       return flags_str;
29386  }
29387  
29388 -static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29389 -       .elt_alloc      = hist_trigger_elt_comm_alloc,
29390 -       .elt_copy       = hist_trigger_elt_comm_copy,
29391 -       .elt_free       = hist_trigger_elt_comm_free,
29392 -       .elt_init       = hist_trigger_elt_comm_init,
29393 -};
29394 +static void expr_field_str(struct hist_field *field, char *expr)
29395 +{
29396 +       if (field->flags & HIST_FIELD_FL_VAR_REF)
29397 +               strcat(expr, "$");
29398  
29399 -static void destroy_hist_field(struct hist_field *hist_field)
29400 +       strcat(expr, hist_field_name(field, 0));
29401 +
29402 +       if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
29403 +               const char *flags_str = get_hist_field_flags(field);
29404 +
29405 +               if (flags_str) {
29406 +                       strcat(expr, ".");
29407 +                       strcat(expr, flags_str);
29408 +               }
29409 +       }
29410 +}
29411 +
29412 +static char *expr_str(struct hist_field *field, unsigned int level)
29413 +{
29414 +       char *expr;
29415 +
29416 +       if (level > 1)
29417 +               return NULL;
29418 +
29419 +       expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
29420 +       if (!expr)
29421 +               return NULL;
29422 +
29423 +       if (!field->operands[0]) {
29424 +               expr_field_str(field, expr);
29425 +               return expr;
29426 +       }
29427 +
29428 +       if (field->operator == FIELD_OP_UNARY_MINUS) {
29429 +               char *subexpr;
29430 +
29431 +               strcat(expr, "-(");
29432 +               subexpr = expr_str(field->operands[0], ++level);
29433 +               if (!subexpr) {
29434 +                       kfree(expr);
29435 +                       return NULL;
29436 +               }
29437 +               strcat(expr, subexpr);
29438 +               strcat(expr, ")");
29439 +
29440 +               kfree(subexpr);
29441 +
29442 +               return expr;
29443 +       }
29444 +
29445 +       expr_field_str(field->operands[0], expr);
29446 +
29447 +       switch (field->operator) {
29448 +       case FIELD_OP_MINUS:
29449 +               strcat(expr, "-");
29450 +               break;
29451 +       case FIELD_OP_PLUS:
29452 +               strcat(expr, "+");
29453 +               break;
29454 +       default:
29455 +               kfree(expr);
29456 +               return NULL;
29457 +       }
29458 +
29459 +       expr_field_str(field->operands[1], expr);
29460 +
29461 +       return expr;
29462 +}
29463 +
29464 +static int contains_operator(char *str)
29465 +{
29466 +       enum field_op_id field_op = FIELD_OP_NONE;
29467 +       char *op;
29468 +
29469 +       op = strpbrk(str, "+-");
29470 +       if (!op)
29471 +               return FIELD_OP_NONE;
29472 +
29473 +       switch (*op) {
29474 +       case '-':
29475 +               if (*str == '-')
29476 +                       field_op = FIELD_OP_UNARY_MINUS;
29477 +               else
29478 +                       field_op = FIELD_OP_MINUS;
29479 +               break;
29480 +       case '+':
29481 +               field_op = FIELD_OP_PLUS;
29482 +               break;
29483 +       default:
29484 +               break;
29485 +       }
29486 +
29487 +       return field_op;
29488 +}
29489 +
29490 +static void destroy_hist_field(struct hist_field *hist_field,
29491 +                              unsigned int level)
29492  {
29493 +       unsigned int i;
29494 +
29495 +       if (level > 3)
29496 +               return;
29497 +
29498 +       if (!hist_field)
29499 +               return;
29500 +
29501 +       for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
29502 +               destroy_hist_field(hist_field->operands[i], level + 1);
29503 +
29504 +       kfree(hist_field->var.name);
29505 +       kfree(hist_field->name);
29506 +       kfree(hist_field->type);
29507 +
29508         kfree(hist_field);
29509  }
29510  
29511 -static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29512 -                                           unsigned long flags)
29513 +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
29514 +                                           struct ftrace_event_field *field,
29515 +                                           unsigned long flags,
29516 +                                           char *var_name)
29517  {
29518         struct hist_field *hist_field;
29519  
29520 @@ -357,8 +2175,22 @@
29521         if (!hist_field)
29522                 return NULL;
29523  
29524 +       hist_field->hist_data = hist_data;
29525 +
29526 +       if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
29527 +               goto out; /* caller will populate */
29528 +
29529 +       if (flags & HIST_FIELD_FL_VAR_REF) {
29530 +               hist_field->fn = hist_field_var_ref;
29531 +               goto out;
29532 +       }
29533 +
29534         if (flags & HIST_FIELD_FL_HITCOUNT) {
29535                 hist_field->fn = hist_field_counter;
29536 +               hist_field->size = sizeof(u64);
29537 +               hist_field->type = kstrdup("u64", GFP_KERNEL);
29538 +               if (!hist_field->type)
29539 +                       goto free;
29540                 goto out;
29541         }
29542  
29543 @@ -368,7 +2200,31 @@
29544         }
29545  
29546         if (flags & HIST_FIELD_FL_LOG2) {
29547 +               unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
29548                 hist_field->fn = hist_field_log2;
29549 +               hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
29550 +               hist_field->size = hist_field->operands[0]->size;
29551 +               hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
29552 +               if (!hist_field->type)
29553 +                       goto free;
29554 +               goto out;
29555 +       }
29556 +
29557 +       if (flags & HIST_FIELD_FL_TIMESTAMP) {
29558 +               hist_field->fn = hist_field_timestamp;
29559 +               hist_field->size = sizeof(u64);
29560 +               hist_field->type = kstrdup("u64", GFP_KERNEL);
29561 +               if (!hist_field->type)
29562 +                       goto free;
29563 +               goto out;
29564 +       }
29565 +
29566 +       if (flags & HIST_FIELD_FL_CPU) {
29567 +               hist_field->fn = hist_field_cpu;
29568 +               hist_field->size = sizeof(int);
29569 +               hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
29570 +               if (!hist_field->type)
29571 +                       goto free;
29572                 goto out;
29573         }
29574  
29575 @@ -378,6 +2234,11 @@
29576         if (is_string_field(field)) {
29577                 flags |= HIST_FIELD_FL_STRING;
29578  
29579 +               hist_field->size = MAX_FILTER_STR_VAL;
29580 +               hist_field->type = kstrdup(field->type, GFP_KERNEL);
29581 +               if (!hist_field->type)
29582 +                       goto free;
29583 +
29584                 if (field->filter_type == FILTER_STATIC_STRING)
29585                         hist_field->fn = hist_field_string;
29586                 else if (field->filter_type == FILTER_DYN_STRING)
29587 @@ -385,10 +2246,16 @@
29588                 else
29589                         hist_field->fn = hist_field_pstring;
29590         } else {
29591 +               hist_field->size = field->size;
29592 +               hist_field->is_signed = field->is_signed;
29593 +               hist_field->type = kstrdup(field->type, GFP_KERNEL);
29594 +               if (!hist_field->type)
29595 +                       goto free;
29596 +
29597                 hist_field->fn = select_value_fn(field->size,
29598                                                  field->is_signed);
29599                 if (!hist_field->fn) {
29600 -                       destroy_hist_field(hist_field);
29601 +                       destroy_hist_field(hist_field, 0);
29602                         return NULL;
29603                 }
29604         }
29605 @@ -396,84 +2263,1636 @@
29606         hist_field->field = field;
29607         hist_field->flags = flags;
29608  
29609 +       if (var_name) {
29610 +               hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
29611 +               if (!hist_field->var.name)
29612 +                       goto free;
29613 +       }
29614 +
29615         return hist_field;
29616 + free:
29617 +       destroy_hist_field(hist_field, 0);
29618 +       return NULL;
29619  }
29620  
29621  static void destroy_hist_fields(struct hist_trigger_data *hist_data)
29622  {
29623         unsigned int i;
29624  
29625 -       for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
29626 +       for (i = 0; i < HIST_FIELDS_MAX; i++) {
29627                 if (hist_data->fields[i]) {
29628 -                       destroy_hist_field(hist_data->fields[i]);
29629 +                       destroy_hist_field(hist_data->fields[i], 0);
29630                         hist_data->fields[i] = NULL;
29631                 }
29632         }
29633  }
29634  
29635 -static int create_hitcount_val(struct hist_trigger_data *hist_data)
29636 +static int init_var_ref(struct hist_field *ref_field,
29637 +                       struct hist_field *var_field,
29638 +                       char *system, char *event_name)
29639  {
29640 -       hist_data->fields[HITCOUNT_IDX] =
29641 -               create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
29642 -       if (!hist_data->fields[HITCOUNT_IDX])
29643 -               return -ENOMEM;
29644 +       int err = 0;
29645  
29646 -       hist_data->n_vals++;
29647 +       ref_field->var.idx = var_field->var.idx;
29648 +       ref_field->var.hist_data = var_field->hist_data;
29649 +       ref_field->size = var_field->size;
29650 +       ref_field->is_signed = var_field->is_signed;
29651 +       ref_field->flags |= var_field->flags &
29652 +               (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29653  
29654 -       if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
29655 +       if (system) {
29656 +               ref_field->system = kstrdup(system, GFP_KERNEL);
29657 +               if (!ref_field->system)
29658 +                       return -ENOMEM;
29659 +       }
29660 +
29661 +       if (event_name) {
29662 +               ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
29663 +               if (!ref_field->event_name) {
29664 +                       err = -ENOMEM;
29665 +                       goto free;
29666 +               }
29667 +       }
29668 +
29669 +       if (var_field->var.name) {
29670 +               ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
29671 +               if (!ref_field->name) {
29672 +                       err = -ENOMEM;
29673 +                       goto free;
29674 +               }
29675 +       } else if (var_field->name) {
29676 +               ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
29677 +               if (!ref_field->name) {
29678 +                       err = -ENOMEM;
29679 +                       goto free;
29680 +               }
29681 +       }
29682 +
29683 +       ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
29684 +       if (!ref_field->type) {
29685 +               err = -ENOMEM;
29686 +               goto free;
29687 +       }
29688 + out:
29689 +       return err;
29690 + free:
29691 +       kfree(ref_field->system);
29692 +       kfree(ref_field->event_name);
29693 +       kfree(ref_field->name);
29694 +
29695 +       goto out;
29696 +}
29697 +
29698 +static struct hist_field *create_var_ref(struct hist_field *var_field,
29699 +                                        char *system, char *event_name)
29700 +{
29701 +       unsigned long flags = HIST_FIELD_FL_VAR_REF;
29702 +       struct hist_field *ref_field;
29703 +
29704 +       ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
29705 +       if (ref_field) {
29706 +               if (init_var_ref(ref_field, var_field, system, event_name)) {
29707 +                       destroy_hist_field(ref_field, 0);
29708 +                       return NULL;
29709 +               }
29710 +       }
29711 +
29712 +       return ref_field;
29713 +}
29714 +
29715 +static bool is_var_ref(char *var_name)
29716 +{
29717 +       if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
29718 +               return false;
29719 +
29720 +       return true;
29721 +}
29722 +
29723 +static char *field_name_from_var(struct hist_trigger_data *hist_data,
29724 +                                char *var_name)
29725 +{
29726 +       char *name, *field;
29727 +       unsigned int i;
29728 +
29729 +       for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
29730 +               name = hist_data->attrs->var_defs.name[i];
29731 +
29732 +               if (strcmp(var_name, name) == 0) {
29733 +                       field = hist_data->attrs->var_defs.expr[i];
29734 +                       if (contains_operator(field) || is_var_ref(field))
29735 +                               continue;
29736 +                       return field;
29737 +               }
29738 +       }
29739 +
29740 +       return NULL;
29741 +}
29742 +
29743 +static char *local_field_var_ref(struct hist_trigger_data *hist_data,
29744 +                                char *system, char *event_name,
29745 +                                char *var_name)
29746 +{
29747 +       struct trace_event_call *call;
29748 +
29749 +       if (system && event_name) {
29750 +               call = hist_data->event_file->event_call;
29751 +
29752 +               if (strcmp(system, call->class->system) != 0)
29753 +                       return NULL;
29754 +
29755 +               if (strcmp(event_name, trace_event_name(call)) != 0)
29756 +                       return NULL;
29757 +       }
29758 +
29759 +       if (!!system != !!event_name)
29760 +               return NULL;
29761 +
29762 +       if (!is_var_ref(var_name))
29763 +               return NULL;
29764 +
29765 +       var_name++;
29766 +
29767 +       return field_name_from_var(hist_data, var_name);
29768 +}
29769 +
29770 +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
29771 +                                       char *system, char *event_name,
29772 +                                       char *var_name)
29773 +{
29774 +       struct hist_field *var_field = NULL, *ref_field = NULL;
29775 +
29776 +       if (!is_var_ref(var_name))
29777 +               return NULL;
29778 +
29779 +       var_name++;
29780 +
29781 +       var_field = find_event_var(hist_data, system, event_name, var_name);
29782 +       if (var_field)
29783 +               ref_field = create_var_ref(var_field, system, event_name);
29784 +
29785 +       if (!ref_field)
29786 +               hist_err_event("Couldn't find variable: $",
29787 +                              system, event_name, var_name);
29788 +
29789 +       return ref_field;
29790 +}
29791 +
29792 +static struct ftrace_event_field *
29793 +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
29794 +           char *field_str, unsigned long *flags)
29795 +{
29796 +       struct ftrace_event_field *field = NULL;
29797 +       char *field_name, *modifier, *str;
29798 +
29799 +       modifier = str = kstrdup(field_str, GFP_KERNEL);
29800 +       if (!modifier)
29801 +               return ERR_PTR(-ENOMEM);
29802 +
29803 +       field_name = strsep(&modifier, ".");
29804 +       if (modifier) {
29805 +               if (strcmp(modifier, "hex") == 0)
29806 +                       *flags |= HIST_FIELD_FL_HEX;
29807 +               else if (strcmp(modifier, "sym") == 0)
29808 +                       *flags |= HIST_FIELD_FL_SYM;
29809 +               else if (strcmp(modifier, "sym-offset") == 0)
29810 +                       *flags |= HIST_FIELD_FL_SYM_OFFSET;
29811 +               else if ((strcmp(modifier, "execname") == 0) &&
29812 +                        (strcmp(field_name, "common_pid") == 0))
29813 +                       *flags |= HIST_FIELD_FL_EXECNAME;
29814 +               else if (strcmp(modifier, "syscall") == 0)
29815 +                       *flags |= HIST_FIELD_FL_SYSCALL;
29816 +               else if (strcmp(modifier, "log2") == 0)
29817 +                       *flags |= HIST_FIELD_FL_LOG2;
29818 +               else if (strcmp(modifier, "usecs") == 0)
29819 +                       *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
29820 +               else {
29821 +                       hist_err("Invalid field modifier: ", modifier);
29822 +                       field = ERR_PTR(-EINVAL);
29823 +                       goto out;
29824 +               }
29825 +       }
29826 +
29827 +       if (strcmp(field_name, "common_timestamp") == 0) {
29828 +               *flags |= HIST_FIELD_FL_TIMESTAMP;
29829 +               hist_data->enable_timestamps = true;
29830 +               if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
29831 +                       hist_data->attrs->ts_in_usecs = true;
29832 +       } else if (strcmp(field_name, "cpu") == 0)
29833 +               *flags |= HIST_FIELD_FL_CPU;
29834 +       else {
29835 +               field = trace_find_event_field(file->event_call, field_name);
29836 +               if (!field || !field->size) {
29837 +                       hist_err("Couldn't find field: ", field_name);
29838 +                       field = ERR_PTR(-EINVAL);
29839 +                       goto out;
29840 +               }
29841 +       }
29842 + out:
29843 +       kfree(str);
29844 +
29845 +       return field;
29846 +}
29847 +
29848 +static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
29849 +                                      struct hist_field *var_ref,
29850 +                                      char *var_name)
29851 +{
29852 +       struct hist_field *alias = NULL;
29853 +       unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
29854 +
29855 +       alias = create_hist_field(hist_data, NULL, flags, var_name);
29856 +       if (!alias)
29857 +               return NULL;
29858 +
29859 +       alias->fn = var_ref->fn;
29860 +       alias->operands[0] = var_ref;
29861 +
29862 +       if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
29863 +               destroy_hist_field(alias, 0);
29864 +               return NULL;
29865 +       }
29866 +
29867 +       return alias;
29868 +}
29869 +
29870 +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
29871 +                                    struct trace_event_file *file, char *str,
29872 +                                    unsigned long *flags, char *var_name)
29873 +{
29874 +       char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
29875 +       struct ftrace_event_field *field = NULL;
29876 +       struct hist_field *hist_field = NULL;
29877 +       int ret = 0;
29878 +
29879 +       s = strchr(str, '.');
29880 +       if (s) {
29881 +               s = strchr(++s, '.');
29882 +               if (s) {
29883 +                       ref_system = strsep(&str, ".");
29884 +                       if (!str) {
29885 +                               ret = -EINVAL;
29886 +                               goto out;
29887 +                       }
29888 +                       ref_event = strsep(&str, ".");
29889 +                       if (!str) {
29890 +                               ret = -EINVAL;
29891 +                               goto out;
29892 +                       }
29893 +                       ref_var = str;
29894 +               }
29895 +       }
29896 +
29897 +       s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
29898 +       if (!s) {
29899 +               hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
29900 +               if (hist_field) {
29901 +                       hist_data->var_refs[hist_data->n_var_refs] = hist_field;
29902 +                       hist_field->var_ref_idx = hist_data->n_var_refs++;
29903 +                       if (var_name) {
29904 +                               hist_field = create_alias(hist_data, hist_field, var_name);
29905 +                               if (!hist_field) {
29906 +                                       ret = -ENOMEM;
29907 +                                       goto out;
29908 +                               }
29909 +                       }
29910 +                       return hist_field;
29911 +               }
29912 +       } else
29913 +               str = s;
29914 +
29915 +       field = parse_field(hist_data, file, str, flags);
29916 +       if (IS_ERR(field)) {
29917 +               ret = PTR_ERR(field);
29918 +               goto out;
29919 +       }
29920 +
29921 +       hist_field = create_hist_field(hist_data, field, *flags, var_name);
29922 +       if (!hist_field) {
29923 +               ret = -ENOMEM;
29924 +               goto out;
29925 +       }
29926 +
29927 +       return hist_field;
29928 + out:
29929 +       return ERR_PTR(ret);
29930 +}
29931 +
29932 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
29933 +                                    struct trace_event_file *file,
29934 +                                    char *str, unsigned long flags,
29935 +                                    char *var_name, unsigned int level);
29936 +
29937 +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
29938 +                                     struct trace_event_file *file,
29939 +                                     char *str, unsigned long flags,
29940 +                                     char *var_name, unsigned int level)
29941 +{
29942 +       struct hist_field *operand1, *expr = NULL;
29943 +       unsigned long operand_flags;
29944 +       int ret = 0;
29945 +       char *s;
29946 +
29947 +       /* we support only -(xxx) i.e. explicit parens required */
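+       /* e.g. '-($lat)' parses, a bare '-$lat' does not ('$lat' is a hypothetical variable) */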
29948 +
29949 +       if (level > 3) {
29950 +               hist_err("Too many subexpressions (3 max): ", str);
29951 +               ret = -EINVAL;
29952 +               goto free;
29953 +       }
29954 +
29955 +       str++; /* skip leading '-' */
29956 +
29957 +       s = strchr(str, '(');
29958 +       if (s)
29959 +               str++;
29960 +       else {
29961 +               ret = -EINVAL;
29962 +               goto free;
29963 +       }
29964 +
29965 +       s = strrchr(str, ')');
29966 +       if (s)
29967 +               *s = '\0';
29968 +       else {
29969 +               ret = -EINVAL; /* no closing ')' */
29970 +               goto free;
29971 +       }
29972 +
29973 +       flags |= HIST_FIELD_FL_EXPR;
29974 +       expr = create_hist_field(hist_data, NULL, flags, var_name);
29975 +       if (!expr) {
29976 +               ret = -ENOMEM;
29977 +               goto free;
29978 +       }
29979 +
29980 +       operand_flags = 0;
29981 +       operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
29982 +       if (IS_ERR(operand1)) {
29983 +               ret = PTR_ERR(operand1);
29984 +               goto free;
29985 +       }
29986 +
29987 +       expr->flags |= operand1->flags &
29988 +               (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
29989 +       expr->fn = hist_field_unary_minus;
29990 +       expr->operands[0] = operand1;
29991 +       expr->operator = FIELD_OP_UNARY_MINUS;
29992 +       expr->name = expr_str(expr, 0);
29993 +       expr->type = kstrdup(operand1->type, GFP_KERNEL);
29994 +       if (!expr->type) {
29995 +               ret = -ENOMEM;
29996 +               goto free;
29997 +       }
29998 +
29999 +       return expr;
30000 + free:
30001 +       destroy_hist_field(expr, 0);
30002 +       return ERR_PTR(ret);
30003 +}
30004 +
30005 +static int check_expr_operands(struct hist_field *operand1,
30006 +                              struct hist_field *operand2)
30007 +{
30008 +       unsigned long operand1_flags = operand1->flags;
30009 +       unsigned long operand2_flags = operand2->flags;
30010 +
30011 +       if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
30012 +           (operand1_flags & HIST_FIELD_FL_ALIAS)) {
30013 +               struct hist_field *var;
30014 +
30015 +               var = find_var_field(operand1->var.hist_data, operand1->name);
30016 +               if (!var)
30017 +                       return -EINVAL;
30018 +               operand1_flags = var->flags;
30019 +       }
30020 +
30021 +       if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
30022 +           (operand2_flags & HIST_FIELD_FL_ALIAS)) {
30023 +               struct hist_field *var;
30024 +
30025 +               var = find_var_field(operand2->var.hist_data, operand2->name);
30026 +               if (!var)
30027 +                       return -EINVAL;
30028 +               operand2_flags = var->flags;
30029 +       }
30030 +
30031 +       if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
30032 +           (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
30033 +               hist_err("Timestamp units in expression don't match", NULL);
30034                 return -EINVAL;
30035 +       }
30036  
30037         return 0;
30038  }
30039  
30040 -static int create_val_field(struct hist_trigger_data *hist_data,
30041 -                           unsigned int val_idx,
30042 -                           struct trace_event_file *file,
30043 -                           char *field_str)
30044 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30045 +                                    struct trace_event_file *file,
30046 +                                    char *str, unsigned long flags,
30047 +                                    char *var_name, unsigned int level)
30048  {
30049 -       struct ftrace_event_field *field = NULL;
30050 -       unsigned long flags = 0;
30051 -       char *field_name;
30052 +       struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
30053 +       unsigned long operand_flags;
30054 +       int field_op, ret = -EINVAL;
30055 +       char *sep, *operand1_str;
30056 +
30057 +       if (level > 3) {
30058 +               hist_err("Too many subexpressions (3 max): ", str);
30059 +               return ERR_PTR(-EINVAL);
30060 +       }
30061 +
30062 +       field_op = contains_operator(str);
30063 +
30064 +       if (field_op == FIELD_OP_NONE)
30065 +               return parse_atom(hist_data, file, str, &flags, var_name);
30066 +
30067 +       if (field_op == FIELD_OP_UNARY_MINUS)
30068 +               return parse_unary(hist_data, file, str, flags, var_name, ++level);
30069 +
30070 +       switch (field_op) {
30071 +       case FIELD_OP_MINUS:
30072 +               sep = "-";
30073 +               break;
30074 +       case FIELD_OP_PLUS:
30075 +               sep = "+";
30076 +               break;
30077 +       default:
30078 +               goto free;
30079 +       }
30080 +
30081 +       operand1_str = strsep(&str, sep);
30082 +       if (!operand1_str || !str)
30083 +               goto free;
30084 +
30085 +       operand_flags = 0;
30086 +       operand1 = parse_atom(hist_data, file, operand1_str,
30087 +                             &operand_flags, NULL);
30088 +       if (IS_ERR(operand1)) {
30089 +               ret = PTR_ERR(operand1);
30090 +               operand1 = NULL;
30091 +               goto free;
30092 +       }
30093 +
30094 +       /* rest of string could be another expression e.g. b+c in a+b+c */
30095 +       operand_flags = 0;
30096 +       operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30097 +       if (IS_ERR(operand2)) {
30098 +               ret = PTR_ERR(operand2);
30099 +               operand2 = NULL;
30100 +               goto free;
30101 +       }
30102 +
30103 +       ret = check_expr_operands(operand1, operand2);
30104 +       if (ret)
30105 +               goto free;
30106 +
30107 +       flags |= HIST_FIELD_FL_EXPR;
30108 +
30109 +       flags |= operand1->flags &
30110 +               (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30111 +
30112 +       expr = create_hist_field(hist_data, NULL, flags, var_name);
30113 +       if (!expr) {
30114 +               ret = -ENOMEM;
30115 +               goto free;
30116 +       }
30117 +
30118 +       operand1->read_once = true;
30119 +       operand2->read_once = true;
30120 +
30121 +       expr->operands[0] = operand1;
30122 +       expr->operands[1] = operand2;
30123 +       expr->operator = field_op;
30124 +       expr->name = expr_str(expr, 0);
30125 +       expr->type = kstrdup(operand1->type, GFP_KERNEL);
30126 +       if (!expr->type) {
30127 +               ret = -ENOMEM;
30128 +               goto free;
30129 +       }
30130 +
30131 +       switch (field_op) {
30132 +       case FIELD_OP_MINUS:
30133 +               expr->fn = hist_field_minus;
30134 +               break;
30135 +       case FIELD_OP_PLUS:
30136 +               expr->fn = hist_field_plus;
30137 +               break;
30138 +       default:
30139 +               ret = -EINVAL;
30140 +               goto free;
30141 +       }
30142 +
30143 +       return expr;
30144 + free:
30145 +       destroy_hist_field(operand1, 0);
30146 +       destroy_hist_field(operand2, 0);
30147 +       destroy_hist_field(expr, 0);
30148 +
30149 +       return ERR_PTR(ret);
30150 +}
30151 +
30152 +static char *find_trigger_filter(struct hist_trigger_data *hist_data,
30153 +                                struct trace_event_file *file)
30154 +{
30155 +       struct event_trigger_data *test;
30156 +
30157 +       list_for_each_entry_rcu(test, &file->triggers, list) {
30158 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30159 +                       if (test->private_data == hist_data)
30160 +                               return test->filter_str;
30161 +               }
30162 +       }
30163 +
30164 +       return NULL;
30165 +}
30166 +
30167 +static struct event_command trigger_hist_cmd;
30168 +static int event_hist_trigger_func(struct event_command *cmd_ops,
30169 +                                  struct trace_event_file *file,
30170 +                                  char *glob, char *cmd, char *param);
30171 +
30172 +static bool compatible_keys(struct hist_trigger_data *target_hist_data,
30173 +                           struct hist_trigger_data *hist_data,
30174 +                           unsigned int n_keys)
30175 +{
30176 +       struct hist_field *target_hist_field, *hist_field;
30177 +       unsigned int n, i, j;
30178 +
30179 +       if (hist_data->n_fields - hist_data->n_vals != n_keys)
30180 +               return false;
30181 +
30182 +       i = hist_data->n_vals;
30183 +       j = target_hist_data->n_vals;
30184 +
30185 +       for (n = 0; n < n_keys; n++) {
30186 +               hist_field = hist_data->fields[i + n];
30187 +               target_hist_field = target_hist_data->fields[j + n];
30188 +
30189 +               if (strcmp(hist_field->type, target_hist_field->type) != 0)
30190 +                       return false;
30191 +               if (hist_field->size != target_hist_field->size)
30192 +                       return false;
30193 +               if (hist_field->is_signed != target_hist_field->is_signed)
30194 +                       return false;
30195 +       }
30196 +
30197 +       return true;
30198 +}
30199 +
30200 +static struct hist_trigger_data *
30201 +find_compatible_hist(struct hist_trigger_data *target_hist_data,
30202 +                    struct trace_event_file *file)
30203 +{
30204 +       struct hist_trigger_data *hist_data;
30205 +       struct event_trigger_data *test;
30206 +       unsigned int n_keys;
30207 +
30208 +       n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
30209 +
30210 +       list_for_each_entry_rcu(test, &file->triggers, list) {
30211 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
30212 +                       hist_data = test->private_data;
30213 +
30214 +                       if (compatible_keys(target_hist_data, hist_data, n_keys))
30215 +                               return hist_data;
30216 +               }
30217 +       }
30218 +
30219 +       return NULL;
30220 +}
30221 +
30222 +static struct trace_event_file *event_file(struct trace_array *tr,
30223 +                                          char *system, char *event_name)
30224 +{
30225 +       struct trace_event_file *file;
30226 +
30227 +       file = find_event_file(tr, system, event_name);
30228 +       if (!file)
30229 +               return ERR_PTR(-EINVAL);
30230 +
30231 +       return file;
30232 +}
30233 +
30234 +static struct hist_field *
30235 +find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
30236 +                        char *system, char *event_name, char *field_name)
30237 +{
30238 +       struct hist_field *event_var;
30239 +       char *synthetic_name;
30240 +
30241 +       synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30242 +       if (!synthetic_name)
30243 +               return ERR_PTR(-ENOMEM);
30244 +
30245 +       strcpy(synthetic_name, "synthetic_");
30246 +       strcat(synthetic_name, field_name);
30247 +
30248 +       event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
30249 +
30250 +       kfree(synthetic_name);
30251 +
30252 +       return event_var;
30253 +}
30254 +
30255 +/**
30256 + * create_field_var_hist - Automatically create a histogram and var for a field
30257 + * @target_hist_data: The target hist trigger
30258 + * @subsys_name: Optional subsystem name
30259 + * @event_name: Optional event name
30260 + * @field_name: The name of the field (and the resulting variable)
30261 + *
30262 + * Hist trigger actions fetch data from variables, not directly from
30263 + * events.  However, for convenience, users are allowed to directly
30264 + * specify an event field in an action, which will be automatically
30265 + * converted into a variable on their behalf.
30266 +
30267 + * If a user specifies a field on an event that isn't the event the
30268 + * histogram currently being defined (the target event histogram), the
30269 + * only way that can be accomplished is if a new hist trigger is
30270 + * created and the field variable defined on that.
30271 + *
30272 + * This function creates a new histogram compatible with the target
30273 + * event (meaning a histogram with the same key as the target
30274 + * histogram), and creates a variable for the specified field, but
30275 + * with 'synthetic_' prepended to the variable name in order to avoid
30276 + * collision with normal field variables.
30277 + *
30278 + * Return: The variable created for the field.
30279 + */
30280 +static struct hist_field *
30281 +create_field_var_hist(struct hist_trigger_data *target_hist_data,
30282 +                     char *subsys_name, char *event_name, char *field_name)
30283 +{
30284 +       struct trace_array *tr = target_hist_data->event_file->tr;
30285 +       struct hist_field *event_var = ERR_PTR(-EINVAL);
30286 +       struct hist_trigger_data *hist_data;
30287 +       unsigned int i, n, first = true;
30288 +       struct field_var_hist *var_hist;
30289 +       struct trace_event_file *file;
30290 +       struct hist_field *key_field;
30291 +       char *saved_filter;
30292 +       char *cmd;
30293 +       int ret;
30294 +
30295 +       if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
30296 +               hist_err_event("onmatch: Too many field variables defined: ",
30297 +                              subsys_name, event_name, field_name);
30298 +               return ERR_PTR(-EINVAL);
30299 +       }
30300 +
30301 +       file = event_file(tr, subsys_name, event_name);
30302 +
30303 +       if (IS_ERR(file)) {
30304 +               hist_err_event("onmatch: Event file not found: ",
30305 +                              subsys_name, event_name, field_name);
30306 +               ret = PTR_ERR(file);
30307 +               return ERR_PTR(ret);
30308 +       }
30309 +
30310 +       /*
30311 +        * Look for a histogram compatible with target.  We'll use the
30312 +        * found histogram specification to create a new matching
30313 +        * histogram with our variable on it.  target_hist_data is not
30314 +        * yet a registered histogram so we can't use that.
30315 +        */
30316 +       hist_data = find_compatible_hist(target_hist_data, file);
30317 +       if (!hist_data) {
30318 +               hist_err_event("onmatch: Matching event histogram not found: ",
30319 +                              subsys_name, event_name, field_name);
30320 +               return ERR_PTR(-EINVAL);
30321 +       }
30322 +
30323 +       /* See if a synthetic field variable has already been created */
30324 +       event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30325 +                                            event_name, field_name);
30326 +       if (!IS_ERR_OR_NULL(event_var))
30327 +               return event_var;
30328 +
30329 +       var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
30330 +       if (!var_hist)
30331 +               return ERR_PTR(-ENOMEM);
30332 +
30333 +       cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30334 +       if (!cmd) {
30335 +               kfree(var_hist);
30336 +               return ERR_PTR(-ENOMEM);
30337 +       }
30338 +
30339 +       /* Use the same keys as the compatible histogram */
30340 +       strcat(cmd, "keys=");
30341 +
30342 +       for_each_hist_key_field(i, hist_data) {
30343 +               key_field = hist_data->fields[i];
30344 +               if (!first)
30345 +                       strcat(cmd, ",");
30346 +               strcat(cmd, key_field->field->name);
30347 +               first = false;
30348 +       }
30349 +
30350 +       /* Create the synthetic field variable specification */
30351 +       strcat(cmd, ":synthetic_");
30352 +       strcat(cmd, field_name);
30353 +       strcat(cmd, "=");
30354 +       strcat(cmd, field_name);
30355 +
30356 +       /* Use the same filter as the compatible histogram */
30357 +       saved_filter = find_trigger_filter(hist_data, file);
30358 +       if (saved_filter) {
30359 +               strcat(cmd, " if ");
30360 +               strcat(cmd, saved_filter);
30361 +       }
30362 +
30363 +       var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
30364 +       if (!var_hist->cmd) {
30365 +               kfree(cmd);
30366 +               kfree(var_hist);
30367 +               return ERR_PTR(-ENOMEM);
30368 +       }
30369 +
30370 +       /* Save the compatible histogram information */
30371 +       var_hist->hist_data = hist_data;
30372 +
30373 +       /* Create the new histogram with our variable */
30374 +       ret = event_hist_trigger_func(&trigger_hist_cmd, file,
30375 +                                     "", "hist", cmd);
30376 +       if (ret) {
30377 +               kfree(cmd);
30378 +               kfree(var_hist->cmd);
30379 +               kfree(var_hist);
30380 +               hist_err_event("onmatch: Couldn't create histogram for field: ",
30381 +                              subsys_name, event_name, field_name);
30382 +               return ERR_PTR(ret);
30383 +       }
30384 +
30385 +       kfree(cmd);
30386 +
30387 +       /* If we can't find the variable, something went wrong */
30388 +       event_var = find_synthetic_field_var(target_hist_data, subsys_name,
30389 +                                            event_name, field_name);
30390 +       if (IS_ERR_OR_NULL(event_var)) {
30391 +               kfree(var_hist->cmd);
30392 +               kfree(var_hist);
30393 +               hist_err_event("onmatch: Couldn't find synthetic variable: ",
30394 +                              subsys_name, event_name, field_name);
30395 +               return ERR_PTR(-EINVAL);
30396 +       }
30397 +
30398 +       n = target_hist_data->n_field_var_hists;
30399 +       target_hist_data->field_var_hists[n] = var_hist;
30400 +       target_hist_data->n_field_var_hists++;
30401 +
30402 +       return event_var;
30403 +}
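+
+/*
+ * Illustrative sketch with hypothetical event/field names: if an
+ * onmatch() action on a sched_switch histogram references the 'prio'
+ * field of sched.sched_waking, and sched_waking already carries a
+ * compatible 'hist:keys=pid:... if comm=="cyclictest"' trigger, the
+ * command string assembled above would look roughly like:
+ *
+ *   keys=pid:synthetic_prio=prio if comm=="cyclictest"
+ *
+ * and is then registered on sched_waking via event_hist_trigger_func().
+ */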
30404 +
30405 +static struct hist_field *
30406 +find_target_event_var(struct hist_trigger_data *hist_data,
30407 +                     char *subsys_name, char *event_name, char *var_name)
30408 +{
30409 +       struct trace_event_file *file = hist_data->event_file;
30410 +       struct hist_field *hist_field = NULL;
30411 +
30412 +       if (subsys_name) {
30413 +               struct trace_event_call *call;
30414 +
30415 +               if (!event_name)
30416 +                       return NULL;
30417 +
30418 +               call = file->event_call;
30419 +
30420 +               if (strcmp(subsys_name, call->class->system) != 0)
30421 +                       return NULL;
30422 +
30423 +               if (strcmp(event_name, trace_event_name(call)) != 0)
30424 +                       return NULL;
30425 +       }
30426 +
30427 +       hist_field = find_var_field(hist_data, var_name);
30428 +
30429 +       return hist_field;
30430 +}
30431 +
30432 +static inline void __update_field_vars(struct tracing_map_elt *elt,
30433 +                                      struct ring_buffer_event *rbe,
30434 +                                      void *rec,
30435 +                                      struct field_var **field_vars,
30436 +                                      unsigned int n_field_vars,
30437 +                                      unsigned int field_var_str_start)
30438 +{
30439 +       struct hist_elt_data *elt_data = elt->private_data;
30440 +       unsigned int i, j, var_idx;
30441 +       u64 var_val;
30442 +
30443 +       for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
30444 +               struct field_var *field_var = field_vars[i];
30445 +               struct hist_field *var = field_var->var;
30446 +               struct hist_field *val = field_var->val;
30447 +
30448 +               var_val = val->fn(val, elt, rbe, rec);
30449 +               var_idx = var->var.idx;
30450 +
30451 +               if (val->flags & HIST_FIELD_FL_STRING) {
30452 +                       char *str = elt_data->field_var_str[j++];
30453 +                       char *val_str = (char *)(uintptr_t)var_val;
30454 +
30455 +                       strscpy(str, val_str, STR_VAR_LEN_MAX);
30456 +                       var_val = (u64)(uintptr_t)str;
30457 +               }
30458 +               tracing_map_set_var(elt, var_idx, var_val);
30459 +       }
30460 +}
30461 +
30462 +static void update_field_vars(struct hist_trigger_data *hist_data,
30463 +                             struct tracing_map_elt *elt,
30464 +                             struct ring_buffer_event *rbe,
30465 +                             void *rec)
30466 +{
30467 +       __update_field_vars(elt, rbe, rec, hist_data->field_vars,
30468 +                           hist_data->n_field_vars, 0);
30469 +}
30470 +
30471 +static void update_max_vars(struct hist_trigger_data *hist_data,
30472 +                           struct tracing_map_elt *elt,
30473 +                           struct ring_buffer_event *rbe,
30474 +                           void *rec)
30475 +{
30476 +       __update_field_vars(elt, rbe, rec, hist_data->max_vars,
30477 +                           hist_data->n_max_vars, hist_data->n_field_var_str);
30478 +}
30479 +
30480 +static struct hist_field *create_var(struct hist_trigger_data *hist_data,
30481 +                                    struct trace_event_file *file,
30482 +                                    char *name, int size, const char *type)
30483 +{
30484 +       struct hist_field *var;
30485 +       int idx;
30486 +
30487 +       if (find_var(hist_data, file, name) && !hist_data->remove) {
30488 +               var = ERR_PTR(-EINVAL);
30489 +               goto out;
30490 +       }
30491 +
30492 +       var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30493 +       if (!var) {
30494 +               var = ERR_PTR(-ENOMEM);
30495 +               goto out;
30496 +       }
30497 +
30498 +       idx = tracing_map_add_var(hist_data->map);
30499 +       if (idx < 0) {
30500 +               kfree(var);
30501 +               var = ERR_PTR(-EINVAL);
30502 +               goto out;
30503 +       }
30504 +
30505 +       var->flags = HIST_FIELD_FL_VAR;
30506 +       var->var.idx = idx;
30507 +       var->var.hist_data = var->hist_data = hist_data;
30508 +       var->size = size;
30509 +       var->var.name = kstrdup(name, GFP_KERNEL);
30510 +       var->type = kstrdup(type, GFP_KERNEL);
30511 +       if (!var->var.name || !var->type) {
30512 +               kfree(var->var.name);
30513 +               kfree(var->type);
30514 +               kfree(var);
30515 +               var = ERR_PTR(-ENOMEM);
30516 +       }
30517 + out:
30518 +       return var;
30519 +}
30520 +
30521 +static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
30522 +                                         struct trace_event_file *file,
30523 +                                         char *field_name)
30524 +{
30525 +       struct hist_field *val = NULL, *var = NULL;
30526 +       unsigned long flags = HIST_FIELD_FL_VAR;
30527 +       struct field_var *field_var;
30528         int ret = 0;
30529  
30530 -       if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
30531 +       if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
30532 +               hist_err("Too many field variables defined: ", field_name);
30533 +               ret = -EINVAL;
30534 +               goto err;
30535 +       }
30536 +
30537 +       val = parse_atom(hist_data, file, field_name, &flags, NULL);
30538 +       if (IS_ERR(val)) {
30539 +               hist_err("Couldn't parse field variable: ", field_name);
30540 +               ret = PTR_ERR(val);
30541 +               goto err;
30542 +       }
30543 +
30544 +       var = create_var(hist_data, file, field_name, val->size, val->type);
30545 +       if (IS_ERR(var)) {
30546 +               hist_err("Couldn't create or find variable: ", field_name);
30547 +               kfree(val);
30548 +               ret = PTR_ERR(var);
30549 +               goto err;
30550 +       }
30551 +
30552 +       field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
30553 +       if (!field_var) {
30554 +               kfree(val);
30555 +               kfree(var);
30556 +               ret = -ENOMEM;
30557 +               goto err;
30558 +       }
30559 +
30560 +       field_var->var = var;
30561 +       field_var->val = val;
30562 + out:
30563 +       return field_var;
30564 + err:
30565 +       field_var = ERR_PTR(ret);
30566 +       goto out;
30567 +}
30568 +
30569 +/**
30570 + * create_target_field_var - Automatically create a variable for a field
30571 + * @target_hist_data: The target hist trigger
30572 + * @subsys_name: Optional subsystem name
30573 + * @event_name: Optional event name
30574 + * @var_name: The name of the field (and the resulting variable)
30575 + *
30576 + * Hist trigger actions fetch data from variables, not directly from
30577 + * events.  However, for convenience, users are allowed to directly
30578 + * specify an event field in an action, which will be automatically
30579 + * converted into a variable on their behalf.
30580 + *
30581 + * This function creates a field variable with the name var_name on
30582 + * the hist trigger currently being defined on the target event.  If
30583 + * subsys_name and event_name are specified, this function simply
30584 + * verifies that they do in fact match the target event subsystem and
30585 + * event name.
30586 + *
30587 + * Return: The variable created for the field.
30588 + */
30589 +static struct field_var *
30590 +create_target_field_var(struct hist_trigger_data *target_hist_data,
30591 +                       char *subsys_name, char *event_name, char *var_name)
30592 +{
30593 +       struct trace_event_file *file = target_hist_data->event_file;
30594 +
30595 +       if (subsys_name) {
30596 +               struct trace_event_call *call;
30597 +
30598 +               if (!event_name)
30599 +                       return NULL;
30600 +
30601 +               call = file->event_call;
30602 +
30603 +               if (strcmp(subsys_name, call->class->system) != 0)
30604 +                       return NULL;
30605 +
30606 +               if (strcmp(event_name, trace_event_name(call)) != 0)
30607 +                       return NULL;
30608 +       }
30609 +
30610 +       return create_field_var(target_hist_data, file, var_name);
30611 +}
30612 +
30613 +static void onmax_print(struct seq_file *m,
30614 +                       struct hist_trigger_data *hist_data,
30615 +                       struct tracing_map_elt *elt,
30616 +                       struct action_data *data)
30617 +{
30618 +       unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
30619 +
30620 +       seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
30621 +
30622 +       for (i = 0; i < hist_data->n_max_vars; i++) {
30623 +               struct hist_field *save_val = hist_data->max_vars[i]->val;
30624 +               struct hist_field *save_var = hist_data->max_vars[i]->var;
30625 +               u64 val;
30626 +
30627 +               save_var_idx = save_var->var.idx;
30628 +
30629 +               val = tracing_map_read_var(elt, save_var_idx);
30630 +
30631 +               if (save_val->flags & HIST_FIELD_FL_STRING) {
30632 +                       seq_printf(m, "  %s: %-32s", save_var->var.name,
30633 +                                  (char *)(uintptr_t)(val));
30634 +               } else
30635 +                       seq_printf(m, "  %s: %10llu", save_var->var.name, val);
30636 +       }
30637 +}
30638 +
30639 +static void onmax_save(struct hist_trigger_data *hist_data,
30640 +                      struct tracing_map_elt *elt, void *rec,
30641 +                      struct ring_buffer_event *rbe,
30642 +                      struct action_data *data, u64 *var_ref_vals)
30643 +{
30644 +       unsigned int max_idx = data->onmax.max_var->var.idx;
30645 +       unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
30646 +
30647 +       u64 var_val, max_val;
30648 +
30649 +       var_val = var_ref_vals[max_var_ref_idx];
30650 +       max_val = tracing_map_read_var(elt, max_idx);
30651 +
30652 +       if (var_val <= max_val)
30653 +               return;
30654 +
30655 +       tracing_map_set_var(elt, max_idx, var_val);
30656 +
30657 +       update_max_vars(hist_data, elt, rbe, rec);
30658 +}
30659 +
30660 +static void onmax_destroy(struct action_data *data)
30661 +{
30662 +       unsigned int i;
30663 +
30664 +       destroy_hist_field(data->onmax.max_var, 0);
30665 +       destroy_hist_field(data->onmax.var, 0);
30666 +
30667 +       kfree(data->onmax.var_str);
30668 +       kfree(data->onmax.fn_name);
30669 +
30670 +       for (i = 0; i < data->n_params; i++)
30671 +               kfree(data->params[i]);
30672 +
30673 +       kfree(data);
30674 +}
30675 +
30676 +static int onmax_create(struct hist_trigger_data *hist_data,
30677 +                       struct action_data *data)
30678 +{
30679 +       struct trace_event_file *file = hist_data->event_file;
30680 +       struct hist_field *var_field, *ref_field, *max_var;
30681 +       unsigned int var_ref_idx = hist_data->n_var_refs;
30682 +       struct field_var *field_var;
30683 +       char *onmax_var_str, *param;
30684 +       unsigned long flags;
30685 +       unsigned int i;
30686 +       int ret = 0;
30687 +
30688 +       onmax_var_str = data->onmax.var_str;
30689 +       if (onmax_var_str[0] != '$') {
30690 +               hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
30691                 return -EINVAL;
30692 +       }
30693 +       onmax_var_str++;
30694  
30695 -       field_name = strsep(&field_str, ".");
30696 -       if (field_str) {
30697 -               if (strcmp(field_str, "hex") == 0)
30698 -                       flags |= HIST_FIELD_FL_HEX;
30699 -               else {
30700 +       var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
30701 +       if (!var_field) {
30702 +               hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
30703 +               return -EINVAL;
30704 +       }
30705 +
30706 +       flags = HIST_FIELD_FL_VAR_REF;
30707 +       ref_field = create_hist_field(hist_data, NULL, flags, NULL);
30708 +       if (!ref_field)
30709 +               return -ENOMEM;
30710 +
30711 +       if (init_var_ref(ref_field, var_field, NULL, NULL)) {
30712 +               destroy_hist_field(ref_field, 0);
30713 +               ret = -ENOMEM;
30714 +               goto out;
30715 +       }
30716 +       hist_data->var_refs[hist_data->n_var_refs] = ref_field;
30717 +       ref_field->var_ref_idx = hist_data->n_var_refs++;
30718 +       data->onmax.var = ref_field;
30719 +
30720 +       data->fn = onmax_save;
30721 +       data->onmax.max_var_ref_idx = var_ref_idx;
30722 +       max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
30723 +       if (IS_ERR(max_var)) {
30724 +               hist_err("onmax: Couldn't create onmax variable: ", "max");
30725 +               ret = PTR_ERR(max_var);
30726 +               goto out;
30727 +       }
30728 +       data->onmax.max_var = max_var;
30729 +
30730 +       for (i = 0; i < data->n_params; i++) {
30731 +               param = kstrdup(data->params[i], GFP_KERNEL);
30732 +               if (!param) {
30733 +                       ret = -ENOMEM;
30734 +                       goto out;
30735 +               }
30736 +
30737 +               field_var = create_target_field_var(hist_data, NULL, NULL, param);
30738 +               if (IS_ERR(field_var)) {
30739 +                       hist_err("onmax: Couldn't create field variable: ", param);
30740 +                       ret = PTR_ERR(field_var);
30741 +                       kfree(param);
30742 +                       goto out;
30743 +               }
30744 +
30745 +               hist_data->max_vars[hist_data->n_max_vars++] = field_var;
30746 +               if (field_var->val->flags & HIST_FIELD_FL_STRING)
30747 +                       hist_data->n_max_var_str++;
30748 +
30749 +               kfree(param);
30750 +       }
30751 + out:
30752 +       return ret;
30753 +}
30754 +
30755 +static int parse_action_params(char *params, struct action_data *data)
30756 +{
30757 +       char *param, *saved_param;
30758 +       int ret = 0;
30759 +
30760 +       while (params) {
30761 +               if (data->n_params >= SYNTH_FIELDS_MAX)
30762 +                       goto out;
30763 +
30764 +               param = strsep(&params, ",");
30765 +               if (!param) {
30766 +                       ret = -EINVAL;
30767 +                       goto out;
30768 +               }
30769 +
30770 +               param = strstrip(param);
30771 +               if (strlen(param) < 2) {
30772 +                       hist_err("Invalid action param: ", param);
30773                         ret = -EINVAL;
30774                         goto out;
30775                 }
30776 +
30777 +               saved_param = kstrdup(param, GFP_KERNEL);
30778 +               if (!saved_param) {
30779 +                       ret = -ENOMEM;
30780 +                       goto out;
30781 +               }
30782 +
30783 +               data->params[data->n_params++] = saved_param;
30784         }
30785 + out:
30786 +       return ret;
30787 +}
30788  
30789 -       field = trace_find_event_field(file->event_call, field_name);
30790 -       if (!field || !field->size) {
30791 +static struct action_data *onmax_parse(char *str)
30792 +{
30793 +       char *onmax_fn_name, *onmax_var_str;
30794 +       struct action_data *data;
30795 +       int ret = -EINVAL;
30796 +
30797 +       data = kzalloc(sizeof(*data), GFP_KERNEL);
30798 +       if (!data)
30799 +               return ERR_PTR(-ENOMEM);
30800 +
30801 +       onmax_var_str = strsep(&str, ")");
30802 +       if (!onmax_var_str || !str) {
30803                 ret = -EINVAL;
30804 -               goto out;
30805 +               goto free;
30806         }
30807  
30808 -       hist_data->fields[val_idx] = create_hist_field(field, flags);
30809 -       if (!hist_data->fields[val_idx]) {
30810 +       data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
30811 +       if (!data->onmax.var_str) {
30812 +               ret = -ENOMEM;
30813 +               goto free;
30814 +       }
30815 +
30816 +       strsep(&str, ".");
30817 +       if (!str)
30818 +               goto free;
30819 +
30820 +       onmax_fn_name = strsep(&str, "(");
30821 +       if (!onmax_fn_name || !str)
30822 +               goto free;
30823 +
30824 +       if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
30825 +               char *params = strsep(&str, ")");
30826 +
30827 +               if (!params) {
30828 +                       ret = -EINVAL;
30829 +                       goto free;
30830 +               }
30831 +
30832 +               ret = parse_action_params(params, data);
30833 +               if (ret)
30834 +                       goto free;
30835 +       } else
30836 +               goto free;
30837 +
30838 +       data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
30839 +       if (!data->onmax.fn_name) {
30840 +               ret = -ENOMEM;
30841 +               goto free;
30842 +       }
30843 + out:
30844 +       return data;
30845 + free:
30846 +       onmax_destroy(data);
30847 +       data = ERR_PTR(ret);
30848 +       goto out;
30849 +}
30850 +
30851 +static void onmatch_destroy(struct action_data *data)
30852 +{
30853 +       unsigned int i;
30854 +
30855 +       mutex_lock(&synth_event_mutex);
30856 +
30857 +       kfree(data->onmatch.match_event);
30858 +       kfree(data->onmatch.match_event_system);
30859 +       kfree(data->onmatch.synth_event_name);
30860 +
30861 +       for (i = 0; i < data->n_params; i++)
30862 +               kfree(data->params[i]);
30863 +
30864 +       if (data->onmatch.synth_event)
30865 +               data->onmatch.synth_event->ref--;
30866 +
30867 +       kfree(data);
30868 +
30869 +       mutex_unlock(&synth_event_mutex);
30870 +}
30871 +
30872 +static void destroy_field_var(struct field_var *field_var)
30873 +{
30874 +       if (!field_var)
30875 +               return;
30876 +
30877 +       destroy_hist_field(field_var->var, 0);
30878 +       destroy_hist_field(field_var->val, 0);
30879 +
30880 +       kfree(field_var);
30881 +}
30882 +
30883 +static void destroy_field_vars(struct hist_trigger_data *hist_data)
30884 +{
30885 +       unsigned int i;
30886 +
30887 +       for (i = 0; i < hist_data->n_field_vars; i++)
30888 +               destroy_field_var(hist_data->field_vars[i]);
30889 +}
30890 +
30891 +static void save_field_var(struct hist_trigger_data *hist_data,
30892 +                          struct field_var *field_var)
30893 +{
30894 +       hist_data->field_vars[hist_data->n_field_vars++] = field_var;
30895 +
30896 +       if (field_var->val->flags & HIST_FIELD_FL_STRING)
30897 +               hist_data->n_field_var_str++;
30898 +}
30899 +
30900 +
30901 +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
30902 +{
30903 +       unsigned int i;
30904 +
30905 +       for (i = 0; i < hist_data->n_synth_var_refs; i++)
30906 +               destroy_hist_field(hist_data->synth_var_refs[i], 0);
30907 +}
30908 +
30909 +static void save_synth_var_ref(struct hist_trigger_data *hist_data,
30910 +                        struct hist_field *var_ref)
30911 +{
30912 +       hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
30913 +
30914 +       hist_data->var_refs[hist_data->n_var_refs] = var_ref;
30915 +       var_ref->var_ref_idx = hist_data->n_var_refs++;
30916 +}
30917 +
30918 +static int check_synth_field(struct synth_event *event,
30919 +                            struct hist_field *hist_field,
30920 +                            unsigned int field_pos)
30921 +{
30922 +       struct synth_field *field;
30923 +
30924 +       if (field_pos >= event->n_fields)
30925 +               return -EINVAL;
30926 +
30927 +       field = event->fields[field_pos];
30928 +
30929 +       if (strcmp(field->type, hist_field->type) != 0)
30930 +               return -EINVAL;
30931 +
30932 +       return 0;
30933 +}
30934 +
30935 +static struct hist_field *
30936 +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
30937 +                char *system, char *event, char *var)
30938 +{
30939 +       struct hist_field *hist_field;
30940 +
30941 +       var++; /* skip '$' */
30942 +
30943 +       hist_field = find_target_event_var(hist_data, system, event, var);
30944 +       if (!hist_field) {
30945 +               if (!system) {
30946 +                       system = data->onmatch.match_event_system;
30947 +                       event = data->onmatch.match_event;
30948 +               }
30949 +
30950 +               hist_field = find_event_var(hist_data, system, event, var);
30951 +       }
30952 +
30953 +       if (!hist_field)
30954 +               hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
30955 +
30956 +       return hist_field;
30957 +}
30958 +
30959 +static struct hist_field *
30960 +onmatch_create_field_var(struct hist_trigger_data *hist_data,
30961 +                        struct action_data *data, char *system,
30962 +                        char *event, char *var)
30963 +{
30964 +       struct hist_field *hist_field = NULL;
30965 +       struct field_var *field_var;
30966 +
30967 +       /*
30968 +        * First try to create a field var on the target event (the one
30969 +        * currently being defined).  This will create a variable for
30970 +        * unqualified fields on the target event, or if qualified,
30971 +        * target fields that have qualified names matching the target.
30972 +        */
30973 +       field_var = create_target_field_var(hist_data, system, event, var);
30974 +
30975 +       if (field_var && !IS_ERR(field_var)) {
30976 +               save_field_var(hist_data, field_var);
30977 +               hist_field = field_var->var;
30978 +       } else {
30979 +               field_var = NULL;
30980 +               /*
30981 +                * If no explicit system.event is specified, default to
30982 +                * looking for fields on the onmatch(system.event.xxx)
30983 +                * event.
30984 +                */
30985 +               if (!system) {
30986 +                       system = data->onmatch.match_event_system;
30987 +                       event = data->onmatch.match_event;
30988 +               }
30989 +
30990 +               /*
30991 +                * At this point, we're looking at a field on another
30992 +                * event.  Because we can't modify a hist trigger on
30993 +                * another event to add a variable for a field, we need
30994 +                * to create a new trigger on that event and create the
30995 +                * variable at the same time.
30996 +                */
30997 +               hist_field = create_field_var_hist(hist_data, system, event, var);
30998 +               if (IS_ERR(hist_field))
30999 +                       goto free;
31000 +       }
31001 + out:
31002 +       return hist_field;
31003 + free:
31004 +       destroy_field_var(field_var);
31005 +       hist_field = NULL;
31006 +       goto out;
31007 +}
31008 +
31009 +static int onmatch_create(struct hist_trigger_data *hist_data,
31010 +                         struct trace_event_file *file,
31011 +                         struct action_data *data)
31012 +{
31013 +       char *event_name, *param, *system = NULL;
31014 +       struct hist_field *hist_field, *var_ref;
31015 +       unsigned int i, var_ref_idx;
31016 +       unsigned int field_pos = 0;
31017 +       struct synth_event *event;
31018 +       int ret = 0;
31019 +
31020 +       mutex_lock(&synth_event_mutex);
31021 +       event = find_synth_event(data->onmatch.synth_event_name);
31022 +       if (!event) {
31023 +               hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
31024 +               mutex_unlock(&synth_event_mutex);
31025 +               return -EINVAL;
31026 +       }
31027 +       event->ref++;
31028 +       mutex_unlock(&synth_event_mutex);
31029 +
31030 +       var_ref_idx = hist_data->n_var_refs;
31031 +
31032 +       for (i = 0; i < data->n_params; i++) {
31033 +               char *p;
31034 +
31035 +               p = param = kstrdup(data->params[i], GFP_KERNEL);
31036 +               if (!param) {
31037 +                       ret = -ENOMEM;
31038 +                       goto err;
31039 +               }
31040 +
31041 +               system = strsep(&param, ".");
31042 +               if (!param) {
31043 +                       param = (char *)system;
31044 +                       system = event_name = NULL;
31045 +               } else {
31046 +                       event_name = strsep(&param, ".");
31047 +                       if (!param) {
31048 +                               kfree(p);
31049 +                               ret = -EINVAL;
31050 +                               goto err;
31051 +                       }
31052 +               }
31053 +
31054 +               if (param[0] == '$')
31055 +                       hist_field = onmatch_find_var(hist_data, data, system,
31056 +                                                     event_name, param);
31057 +               else
31058 +                       hist_field = onmatch_create_field_var(hist_data, data,
31059 +                                                             system,
31060 +                                                             event_name,
31061 +                                                             param);
31062 +
31063 +               if (!hist_field) {
31064 +                       kfree(p);
31065 +                       ret = -EINVAL;
31066 +                       goto err;
31067 +               }
31068 +
31069 +               if (check_synth_field(event, hist_field, field_pos) == 0) {
31070 +                       var_ref = create_var_ref(hist_field, system, event_name);
31071 +                       if (!var_ref) {
31072 +                               kfree(p);
31073 +                               ret = -ENOMEM;
31074 +                               goto err;
31075 +                       }
31076 +
31077 +                       save_synth_var_ref(hist_data, var_ref);
31078 +                       field_pos++;
31079 +                       kfree(p);
31080 +                       continue;
31081 +               }
31082 +
31083 +               hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
31084 +                              system, event_name, param);
31085 +               kfree(p);
31086 +               ret = -EINVAL;
31087 +               goto err;
31088 +       }
31089 +
31090 +       if (field_pos != event->n_fields) {
31091 +               hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
31092 +               ret = -EINVAL;
31093 +               goto err;
31094 +       }
31095 +
31096 +       data->fn = action_trace;
31097 +       data->onmatch.synth_event = event;
31098 +       data->onmatch.var_ref_idx = var_ref_idx;
31099 + out:
31100 +       return ret;
31101 + err:
31102 +       mutex_lock(&synth_event_mutex);
31103 +       event->ref--;
31104 +       mutex_unlock(&synth_event_mutex);
31105 +
31106 +       goto out;
31107 +}
31108 +
31109 +static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
31110 +{
31111 +       char *match_event, *match_event_system;
31112 +       char *synth_event_name, *params;
31113 +       struct action_data *data;
31114 +       int ret = -EINVAL;
31115 +
31116 +       data = kzalloc(sizeof(*data), GFP_KERNEL);
31117 +       if (!data)
31118 +               return ERR_PTR(-ENOMEM);
31119 +
31120 +       match_event = strsep(&str, ")");
31121 +       if (!match_event || !str) {
31122 +               hist_err("onmatch: Missing closing paren: ", match_event);
31123 +               goto free;
31124 +       }
31125 +
31126 +       match_event_system = strsep(&match_event, ".");
31127 +       if (!match_event) {
31128 +               hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
31129 +               goto free;
31130 +       }
31131 +
31132 +       if (IS_ERR(event_file(tr, match_event_system, match_event))) {
31133 +               hist_err_event("onmatch: Invalid subsystem or event name: ",
31134 +                              match_event_system, match_event, NULL);
31135 +               goto free;
31136 +       }
31137 +
31138 +       data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
31139 +       if (!data->onmatch.match_event) {
31140 +               ret = -ENOMEM;
31141 +               goto free;
31142 +       }
31143 +
31144 +       data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
31145 +       if (!data->onmatch.match_event_system) {
31146 +               ret = -ENOMEM;
31147 +               goto free;
31148 +       }
31149 +
31150 +       strsep(&str, ".");
31151 +       if (!str) {
31152 +               hist_err("onmatch: Missing . after onmatch(): ", str);
31153 +               goto free;
31154 +       }
31155 +
31156 +       synth_event_name = strsep(&str, "(");
31157 +       if (!synth_event_name || !str) {
31158 +               hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
31159 +               goto free;
31160 +       }
31161 +
31162 +       data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
31163 +       if (!data->onmatch.synth_event_name) {
31164                 ret = -ENOMEM;
31165 +               goto free;
31166 +       }
31167 +
31168 +       params = strsep(&str, ")");
31169 +       if (!params || !str || (str && strlen(str))) {
31170 +               hist_err("onmatch: Missing closing paramlist paren: ", params);
31171 +               goto free;
31172 +       }
31173 +
31174 +       ret = parse_action_params(params, data);
31175 +       if (ret)
31176 +               goto free;
31177 + out:
31178 +       return data;
31179 + free:
31180 +       onmatch_destroy(data);
31181 +       data = ERR_PTR(ret);
31182 +       goto out;
31183 +}
31184 +
31185 +static int create_hitcount_val(struct hist_trigger_data *hist_data)
31186 +{
31187 +       hist_data->fields[HITCOUNT_IDX] =
31188 +               create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
31189 +       if (!hist_data->fields[HITCOUNT_IDX])
31190 +               return -ENOMEM;
31191 +
31192 +       hist_data->n_vals++;
31193 +       hist_data->n_fields++;
31194 +
31195 +       if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31196 +               return -EINVAL;
31197 +
31198 +       return 0;
31199 +}
31200 +
31201 +static int __create_val_field(struct hist_trigger_data *hist_data,
31202 +                             unsigned int val_idx,
31203 +                             struct trace_event_file *file,
31204 +                             char *var_name, char *field_str,
31205 +                             unsigned long flags)
31206 +{
31207 +       struct hist_field *hist_field;
31208 +       int ret = 0;
31209 +
31210 +       hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
31211 +       if (IS_ERR(hist_field)) {
31212 +               ret = PTR_ERR(hist_field);
31213                 goto out;
31214         }
31215  
31216 +       hist_data->fields[val_idx] = hist_field;
31217 +
31218         ++hist_data->n_vals;
31219 +       ++hist_data->n_fields;
31220  
31221 -       if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
31222 +       if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31223                 ret = -EINVAL;
31224   out:
31225         return ret;
31226  }
31227  
31228 +static int create_val_field(struct hist_trigger_data *hist_data,
31229 +                           unsigned int val_idx,
31230 +                           struct trace_event_file *file,
31231 +                           char *field_str)
31232 +{
31233 +       if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
31234 +               return -EINVAL;
31235 +
31236 +       return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
31237 +}
31238 +
31239 +static int create_var_field(struct hist_trigger_data *hist_data,
31240 +                           unsigned int val_idx,
31241 +                           struct trace_event_file *file,
31242 +                           char *var_name, char *expr_str)
31243 +{
31244 +       unsigned long flags = 0;
31245 +
31246 +       if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
31247 +               return -EINVAL;
31248 +
31249 +       if (find_var(hist_data, file, var_name) && !hist_data->remove) {
31250 +               hist_err("Variable already defined: ", var_name);
31251 +               return -EINVAL;
31252 +       }
31253 +
31254 +       flags |= HIST_FIELD_FL_VAR;
31255 +       hist_data->n_vars++;
31256 +       if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
31257 +               return -EINVAL;
31258 +
31259 +       return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
31260 +}
31261 +
31262  static int create_val_fields(struct hist_trigger_data *hist_data,
31263                              struct trace_event_file *file)
31264  {
31265         char *fields_str, *field_str;
31266 -       unsigned int i, j;
31267 +       unsigned int i, j = 1;
31268         int ret;
31269  
31270         ret = create_hitcount_val(hist_data);
31271 @@ -493,12 +3912,15 @@
31272                 field_str = strsep(&fields_str, ",");
31273                 if (!field_str)
31274                         break;
31275 +
31276                 if (strcmp(field_str, "hitcount") == 0)
31277                         continue;
31278 +
31279                 ret = create_val_field(hist_data, j++, file, field_str);
31280                 if (ret)
31281                         goto out;
31282         }
31283 +
31284         if (fields_str && (strcmp(fields_str, "hitcount") != 0))
31285                 ret = -EINVAL;
31286   out:
31287 @@ -511,12 +3933,13 @@
31288                             struct trace_event_file *file,
31289                             char *field_str)
31290  {
31291 -       struct ftrace_event_field *field = NULL;
31292 +       struct hist_field *hist_field = NULL;
31293 +
31294         unsigned long flags = 0;
31295         unsigned int key_size;
31296         int ret = 0;
31297  
31298 -       if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
31299 +       if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
31300                 return -EINVAL;
31301  
31302         flags |= HIST_FIELD_FL_KEY;
31303 @@ -524,57 +3947,40 @@
31304         if (strcmp(field_str, "stacktrace") == 0) {
31305                 flags |= HIST_FIELD_FL_STACKTRACE;
31306                 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
31307 +               hist_field = create_hist_field(hist_data, NULL, flags, NULL);
31308         } else {
31309 -               char *field_name = strsep(&field_str, ".");
31310 -
31311 -               if (field_str) {
31312 -                       if (strcmp(field_str, "hex") == 0)
31313 -                               flags |= HIST_FIELD_FL_HEX;
31314 -                       else if (strcmp(field_str, "sym") == 0)
31315 -                               flags |= HIST_FIELD_FL_SYM;
31316 -                       else if (strcmp(field_str, "sym-offset") == 0)
31317 -                               flags |= HIST_FIELD_FL_SYM_OFFSET;
31318 -                       else if ((strcmp(field_str, "execname") == 0) &&
31319 -                                (strcmp(field_name, "common_pid") == 0))
31320 -                               flags |= HIST_FIELD_FL_EXECNAME;
31321 -                       else if (strcmp(field_str, "syscall") == 0)
31322 -                               flags |= HIST_FIELD_FL_SYSCALL;
31323 -                       else if (strcmp(field_str, "log2") == 0)
31324 -                               flags |= HIST_FIELD_FL_LOG2;
31325 -                       else {
31326 -                               ret = -EINVAL;
31327 -                               goto out;
31328 -                       }
31329 +               hist_field = parse_expr(hist_data, file, field_str, flags,
31330 +                                       NULL, 0);
31331 +               if (IS_ERR(hist_field)) {
31332 +                       ret = PTR_ERR(hist_field);
31333 +                       goto out;
31334                 }
31335  
31336 -               field = trace_find_event_field(file->event_call, field_name);
31337 -               if (!field || !field->size) {
31338 +               if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
31339 +                       hist_err("Using variable references as keys not supported: ", field_str);
31340 +                       destroy_hist_field(hist_field, 0);
31341                         ret = -EINVAL;
31342                         goto out;
31343                 }
31344  
31345 -               if (is_string_field(field))
31346 -                       key_size = MAX_FILTER_STR_VAL;
31347 -               else
31348 -                       key_size = field->size;
31349 +               key_size = hist_field->size;
31350         }
31351  
31352 -       hist_data->fields[key_idx] = create_hist_field(field, flags);
31353 -       if (!hist_data->fields[key_idx]) {
31354 -               ret = -ENOMEM;
31355 -               goto out;
31356 -       }
31357 +       hist_data->fields[key_idx] = hist_field;
31358  
31359         key_size = ALIGN(key_size, sizeof(u64));
31360         hist_data->fields[key_idx]->size = key_size;
31361         hist_data->fields[key_idx]->offset = key_offset;
31362 +
31363         hist_data->key_size += key_size;
31364 +
31365         if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
31366                 ret = -EINVAL;
31367                 goto out;
31368         }
31369  
31370         hist_data->n_keys++;
31371 +       hist_data->n_fields++;
31372  
31373         if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
31374                 return -EINVAL;
31375 @@ -618,21 +4024,113 @@
31376         return ret;
31377  }
31378  
31379 +static int create_var_fields(struct hist_trigger_data *hist_data,
31380 +                            struct trace_event_file *file)
31381 +{
31382 +       unsigned int i, j = hist_data->n_vals;
31383 +       int ret = 0;
31384 +
31385 +       unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
31386 +
31387 +       for (i = 0; i < n_vars; i++) {
31388 +               char *var_name = hist_data->attrs->var_defs.name[i];
31389 +               char *expr = hist_data->attrs->var_defs.expr[i];
31390 +
31391 +               ret = create_var_field(hist_data, j++, file, var_name, expr);
31392 +               if (ret)
31393 +                       goto out;
31394 +       }
31395 + out:
31396 +       return ret;
31397 +}
31398 +
31399 +static void free_var_defs(struct hist_trigger_data *hist_data)
31400 +{
31401 +       unsigned int i;
31402 +
31403 +       for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
31404 +               kfree(hist_data->attrs->var_defs.name[i]);
31405 +               kfree(hist_data->attrs->var_defs.expr[i]);
31406 +       }
31407 +
31408 +       hist_data->attrs->var_defs.n_vars = 0;
31409 +}
31410 +
31411 +static int parse_var_defs(struct hist_trigger_data *hist_data)
31412 +{
31413 +       char *s, *str, *var_name, *field_str;
31414 +       unsigned int i, j, n_vars = 0;
31415 +       int ret = 0;
31416 +
31417 +       for (i = 0; i < hist_data->attrs->n_assignments; i++) {
31418 +               str = hist_data->attrs->assignment_str[i];
31419 +               for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
31420 +                       field_str = strsep(&str, ",");
31421 +                       if (!field_str)
31422 +                               break;
31423 +
31424 +                       var_name = strsep(&field_str, "=");
31425 +                       if (!var_name || !field_str) {
31426 +                               hist_err("Malformed assignment: ", var_name);
31427 +                               ret = -EINVAL;
31428 +                               goto free;
31429 +                       }
31430 +
31431 +                       if (n_vars == TRACING_MAP_VARS_MAX) {
31432 +                               hist_err("Too many variables defined: ", var_name);
31433 +                               ret = -EINVAL;
31434 +                               goto free;
31435 +                       }
31436 +
31437 +                       s = kstrdup(var_name, GFP_KERNEL);
31438 +                       if (!s) {
31439 +                               ret = -ENOMEM;
31440 +                               goto free;
31441 +                       }
31442 +                       hist_data->attrs->var_defs.name[n_vars] = s;
31443 +
31444 +                       s = kstrdup(field_str, GFP_KERNEL);
31445 +                       if (!s) {
31446 +                               kfree(hist_data->attrs->var_defs.name[n_vars]);
31447 +                               ret = -ENOMEM;
31448 +                               goto free;
31449 +                       }
31450 +                       hist_data->attrs->var_defs.expr[n_vars++] = s;
31451 +
31452 +                       hist_data->attrs->var_defs.n_vars = n_vars;
31453 +               }
31454 +       }
31455 +
31456 +       return ret;
31457 + free:
31458 +       free_var_defs(hist_data);
31459 +
31460 +       return ret;
31461 +}
31462 +
31463  static int create_hist_fields(struct hist_trigger_data *hist_data,
31464                               struct trace_event_file *file)
31465  {
31466         int ret;
31467  
31468 +       ret = parse_var_defs(hist_data);
31469 +       if (ret)
31470 +               goto out;
31471 +
31472         ret = create_val_fields(hist_data, file);
31473         if (ret)
31474                 goto out;
31475  
31476 -       ret = create_key_fields(hist_data, file);
31477 +       ret = create_var_fields(hist_data, file);
31478         if (ret)
31479                 goto out;
31480  
31481 -       hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
31482 +       ret = create_key_fields(hist_data, file);
31483 +       if (ret)
31484 +               goto out;
31485   out:
31486 +       free_var_defs(hist_data);
31487 +
31488         return ret;
31489  }
31490  
31491 @@ -653,10 +4151,9 @@
31492  static int create_sort_keys(struct hist_trigger_data *hist_data)
31493  {
31494         char *fields_str = hist_data->attrs->sort_key_str;
31495 -       struct ftrace_event_field *field = NULL;
31496         struct tracing_map_sort_key *sort_key;
31497         int descending, ret = 0;
31498 -       unsigned int i, j;
31499 +       unsigned int i, j, k;
31500  
31501         hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
31502  
31503 @@ -670,7 +4167,9 @@
31504         }
31505  
31506         for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
31507 +               struct hist_field *hist_field;
31508                 char *field_str, *field_name;
31509 +               const char *test_name;
31510  
31511                 sort_key = &hist_data->sort_keys[i];
31512  
31513 @@ -702,10 +4201,19 @@
31514                         continue;
31515                 }
31516  
31517 -               for (j = 1; j < hist_data->n_fields; j++) {
31518 -                       field = hist_data->fields[j]->field;
31519 -                       if (field && (strcmp(field_name, field->name) == 0)) {
31520 -                               sort_key->field_idx = j;
31521 +               for (j = 1, k = 1; j < hist_data->n_fields; j++) {
31522 +                       unsigned int idx;
31523 +
31524 +                       hist_field = hist_data->fields[j];
31525 +                       if (hist_field->flags & HIST_FIELD_FL_VAR)
31526 +                               continue;
31527 +
31528 +                       idx = k++;
31529 +
31530 +                       test_name = hist_field_name(hist_field, 0);
31531 +
31532 +                       if (strcmp(field_name, test_name) == 0) {
31533 +                               sort_key->field_idx = idx;
31534                                 descending = is_descending(field_str);
31535                                 if (descending < 0) {
31536                                         ret = descending;
31537 @@ -720,16 +4228,230 @@
31538                         break;
31539                 }
31540         }
31541 +
31542         hist_data->n_sort_keys = i;
31543   out:
31544         return ret;
31545  }
31546  
31547 +static void destroy_actions(struct hist_trigger_data *hist_data)
31548 +{
31549 +       unsigned int i;
31550 +
31551 +       for (i = 0; i < hist_data->n_actions; i++) {
31552 +               struct action_data *data = hist_data->actions[i];
31553 +
31554 +               if (data->fn == action_trace)
31555 +                       onmatch_destroy(data);
31556 +               else if (data->fn == onmax_save)
31557 +                       onmax_destroy(data);
31558 +               else
31559 +                       kfree(data);
31560 +       }
31561 +}
31562 +
31563 +static int parse_actions(struct hist_trigger_data *hist_data)
31564 +{
31565 +       struct trace_array *tr = hist_data->event_file->tr;
31566 +       struct action_data *data;
31567 +       unsigned int i;
31568 +       int ret = 0;
31569 +       char *str;
31570 +
31571 +       for (i = 0; i < hist_data->attrs->n_actions; i++) {
31572 +               str = hist_data->attrs->action_str[i];
31573 +
31574 +               if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
31575 +                       char *action_str = str + strlen("onmatch(");
31576 +
31577 +                       data = onmatch_parse(tr, action_str);
31578 +                       if (IS_ERR(data)) {
31579 +                               ret = PTR_ERR(data);
31580 +                               break;
31581 +                       }
31582 +                       data->fn = action_trace;
31583 +               } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
31584 +                       char *action_str = str + strlen("onmax(");
31585 +
31586 +                       data = onmax_parse(action_str);
31587 +                       if (IS_ERR(data)) {
31588 +                               ret = PTR_ERR(data);
31589 +                               break;
31590 +                       }
31591 +                       data->fn = onmax_save;
31592 +               } else {
31593 +                       ret = -EINVAL;
31594 +                       break;
31595 +               }
31596 +
31597 +               hist_data->actions[hist_data->n_actions++] = data;
31598 +       }
31599 +
31600 +       return ret;
31601 +}
31602 +
31603 +static int create_actions(struct hist_trigger_data *hist_data,
31604 +                         struct trace_event_file *file)
31605 +{
31606 +       struct action_data *data;
31607 +       unsigned int i;
31608 +       int ret = 0;
31609 +
31610 +       for (i = 0; i < hist_data->attrs->n_actions; i++) {
31611 +               data = hist_data->actions[i];
31612 +
31613 +               if (data->fn == action_trace) {
31614 +                       ret = onmatch_create(hist_data, file, data);
31615 +                       if (ret)
31616 +                               return ret;
31617 +               } else if (data->fn == onmax_save) {
31618 +                       ret = onmax_create(hist_data, data);
31619 +                       if (ret)
31620 +                               return ret;
31621 +               }
31622 +       }
31623 +
31624 +       return ret;
31625 +}
31626 +
31627 +static void print_actions(struct seq_file *m,
31628 +                         struct hist_trigger_data *hist_data,
31629 +                         struct tracing_map_elt *elt)
31630 +{
31631 +       unsigned int i;
31632 +
31633 +       for (i = 0; i < hist_data->n_actions; i++) {
31634 +               struct action_data *data = hist_data->actions[i];
31635 +
31636 +               if (data->fn == onmax_save)
31637 +                       onmax_print(m, hist_data, elt, data);
31638 +       }
31639 +}
31640 +
31641 +static void print_onmax_spec(struct seq_file *m,
31642 +                            struct hist_trigger_data *hist_data,
31643 +                            struct action_data *data)
31644 +{
31645 +       unsigned int i;
31646 +
31647 +       seq_puts(m, ":onmax(");
31648 +       seq_printf(m, "%s", data->onmax.var_str);
31649 +       seq_printf(m, ").%s(", data->onmax.fn_name);
31650 +
31651 +       for (i = 0; i < hist_data->n_max_vars; i++) {
31652 +               seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
31653 +               if (i < hist_data->n_max_vars - 1)
31654 +                       seq_puts(m, ",");
31655 +       }
31656 +       seq_puts(m, ")");
31657 +}
31658 +
31659 +static void print_onmatch_spec(struct seq_file *m,
31660 +                              struct hist_trigger_data *hist_data,
31661 +                              struct action_data *data)
31662 +{
31663 +       unsigned int i;
31664 +
31665 +       seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
31666 +                  data->onmatch.match_event);
31667 +
31668 +       seq_printf(m, "%s(", data->onmatch.synth_event->name);
31669 +
31670 +       for (i = 0; i < data->n_params; i++) {
31671 +               if (i)
31672 +                       seq_puts(m, ",");
31673 +               seq_printf(m, "%s", data->params[i]);
31674 +       }
31675 +
31676 +       seq_puts(m, ")");
31677 +}
31678 +
31679 +static bool actions_match(struct hist_trigger_data *hist_data,
31680 +                         struct hist_trigger_data *hist_data_test)
31681 +{
31682 +       unsigned int i, j;
31683 +
31684 +       if (hist_data->n_actions != hist_data_test->n_actions)
31685 +               return false;
31686 +
31687 +       for (i = 0; i < hist_data->n_actions; i++) {
31688 +               struct action_data *data = hist_data->actions[i];
31689 +               struct action_data *data_test = hist_data_test->actions[i];
31690 +
31691 +               if (data->fn != data_test->fn)
31692 +                       return false;
31693 +
31694 +               if (data->n_params != data_test->n_params)
31695 +                       return false;
31696 +
31697 +               for (j = 0; j < data->n_params; j++) {
31698 +                       if (strcmp(data->params[j], data_test->params[j]) != 0)
31699 +                               return false;
31700 +               }
31701 +
31702 +               if (data->fn == action_trace) {
31703 +                       if (strcmp(data->onmatch.synth_event_name,
31704 +                                  data_test->onmatch.synth_event_name) != 0)
31705 +                               return false;
31706 +                       if (strcmp(data->onmatch.match_event_system,
31707 +                                  data_test->onmatch.match_event_system) != 0)
31708 +                               return false;
31709 +                       if (strcmp(data->onmatch.match_event,
31710 +                                  data_test->onmatch.match_event) != 0)
31711 +                               return false;
31712 +               } else if (data->fn == onmax_save) {
31713 +                       if (strcmp(data->onmax.var_str,
31714 +                                  data_test->onmax.var_str) != 0)
31715 +                               return false;
31716 +                       if (strcmp(data->onmax.fn_name,
31717 +                                  data_test->onmax.fn_name) != 0)
31718 +                               return false;
31719 +               }
31720 +       }
31721 +
31722 +       return true;
31723 +}
31724 +
31725 +
31726 +static void print_actions_spec(struct seq_file *m,
31727 +                              struct hist_trigger_data *hist_data)
31728 +{
31729 +       unsigned int i;
31730 +
31731 +       for (i = 0; i < hist_data->n_actions; i++) {
31732 +               struct action_data *data = hist_data->actions[i];
31733 +
31734 +               if (data->fn == action_trace)
31735 +                       print_onmatch_spec(m, hist_data, data);
31736 +               else if (data->fn == onmax_save)
31737 +                       print_onmax_spec(m, hist_data, data);
31738 +       }
31739 +}
31740 +
31741 +static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
31742 +{
31743 +       unsigned int i;
31744 +
31745 +       for (i = 0; i < hist_data->n_field_var_hists; i++) {
31746 +               kfree(hist_data->field_var_hists[i]->cmd);
31747 +               kfree(hist_data->field_var_hists[i]);
31748 +       }
31749 +}
31750 +
31751  static void destroy_hist_data(struct hist_trigger_data *hist_data)
31752  {
31753 +       if (!hist_data)
31754 +               return;
31755 +
31756         destroy_hist_trigger_attrs(hist_data->attrs);
31757         destroy_hist_fields(hist_data);
31758         tracing_map_destroy(hist_data->map);
31759 +
31760 +       destroy_actions(hist_data);
31761 +       destroy_field_vars(hist_data);
31762 +       destroy_field_var_hists(hist_data);
31763 +       destroy_synth_var_refs(hist_data);
31764 +
31765         kfree(hist_data);
31766  }
31767  
31768 @@ -738,7 +4460,7 @@
31769         struct tracing_map *map = hist_data->map;
31770         struct ftrace_event_field *field;
31771         struct hist_field *hist_field;
31772 -       int i, idx;
31773 +       int i, idx = 0;
31774  
31775         for_each_hist_field(i, hist_data) {
31776                 hist_field = hist_data->fields[i];
31777 @@ -749,6 +4471,9 @@
31778  
31779                         if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
31780                                 cmp_fn = tracing_map_cmp_none;
31781 +                       else if (!field)
31782 +                               cmp_fn = tracing_map_cmp_num(hist_field->size,
31783 +                                                            hist_field->is_signed);
31784                         else if (is_string_field(field))
31785                                 cmp_fn = tracing_map_cmp_string;
31786                         else
31787 @@ -757,36 +4482,29 @@
31788                         idx = tracing_map_add_key_field(map,
31789                                                         hist_field->offset,
31790                                                         cmp_fn);
31791 -
31792 -               } else
31793 +               } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
31794                         idx = tracing_map_add_sum_field(map);
31795  
31796                 if (idx < 0)
31797                         return idx;
31798 -       }
31799 -
31800 -       return 0;
31801 -}
31802 -
31803 -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
31804 -{
31805 -       struct hist_field *key_field;
31806 -       unsigned int i;
31807 -
31808 -       for_each_hist_key_field(i, hist_data) {
31809 -               key_field = hist_data->fields[i];
31810  
31811 -               if (key_field->flags & HIST_FIELD_FL_EXECNAME)
31812 -                       return true;
31813 +               if (hist_field->flags & HIST_FIELD_FL_VAR) {
31814 +                       idx = tracing_map_add_var(map);
31815 +                       if (idx < 0)
31816 +                               return idx;
31817 +                       hist_field->var.idx = idx;
31818 +                       hist_field->var.hist_data = hist_data;
31819 +               }
31820         }
31821  
31822 -       return false;
31823 +       return 0;
31824  }
31825  
31826  static struct hist_trigger_data *
31827  create_hist_data(unsigned int map_bits,
31828                  struct hist_trigger_attrs *attrs,
31829 -                struct trace_event_file *file)
31830 +                struct trace_event_file *file,
31831 +                bool remove)
31832  {
31833         const struct tracing_map_ops *map_ops = NULL;
31834         struct hist_trigger_data *hist_data;
31835 @@ -797,6 +4515,12 @@
31836                 return ERR_PTR(-ENOMEM);
31837  
31838         hist_data->attrs = attrs;
31839 +       hist_data->remove = remove;
31840 +       hist_data->event_file = file;
31841 +
31842 +       ret = parse_actions(hist_data);
31843 +       if (ret)
31844 +               goto free;
31845  
31846         ret = create_hist_fields(hist_data, file);
31847         if (ret)
31848 @@ -806,8 +4530,7 @@
31849         if (ret)
31850                 goto free;
31851  
31852 -       if (need_tracing_map_ops(hist_data))
31853 -               map_ops = &hist_trigger_elt_comm_ops;
31854 +       map_ops = &hist_trigger_elt_data_ops;
31855  
31856         hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
31857                                             map_ops, hist_data);
31858 @@ -820,12 +4543,6 @@
31859         ret = create_tracing_map_fields(hist_data);
31860         if (ret)
31861                 goto free;
31862 -
31863 -       ret = tracing_map_init(hist_data->map);
31864 -       if (ret)
31865 -               goto free;
31866 -
31867 -       hist_data->event_file = file;
31868   out:
31869         return hist_data;
31870   free:
31871 @@ -839,18 +4556,39 @@
31872  }
31873  
31874  static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
31875 -                                   struct tracing_map_elt *elt,
31876 -                                   void *rec)
31877 +                                   struct tracing_map_elt *elt, void *rec,
31878 +                                   struct ring_buffer_event *rbe,
31879 +                                   u64 *var_ref_vals)
31880  {
31881 +       struct hist_elt_data *elt_data;
31882         struct hist_field *hist_field;
31883 -       unsigned int i;
31884 +       unsigned int i, var_idx;
31885         u64 hist_val;
31886  
31887 +       elt_data = elt->private_data;
31888 +       elt_data->var_ref_vals = var_ref_vals;
31889 +
31890         for_each_hist_val_field(i, hist_data) {
31891                 hist_field = hist_data->fields[i];
31892 -               hist_val = hist_field->fn(hist_field, rec);
31893 +               hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31894 +               if (hist_field->flags & HIST_FIELD_FL_VAR) {
31895 +                       var_idx = hist_field->var.idx;
31896 +                       tracing_map_set_var(elt, var_idx, hist_val);
31897 +                       continue;
31898 +               }
31899                 tracing_map_update_sum(elt, i, hist_val);
31900         }
31901 +
31902 +       for_each_hist_key_field(i, hist_data) {
31903 +               hist_field = hist_data->fields[i];
31904 +               if (hist_field->flags & HIST_FIELD_FL_VAR) {
31905 +                       hist_val = hist_field->fn(hist_field, elt, rbe, rec);
31906 +                       var_idx = hist_field->var.idx;
31907 +                       tracing_map_set_var(elt, var_idx, hist_val);
31908 +               }
31909 +       }
31910 +
31911 +       update_field_vars(hist_data, elt, rbe, rec);
31912  }
31913  
31914  static inline void add_to_key(char *compound_key, void *key,
31915 @@ -877,15 +4615,31 @@
31916         memcpy(compound_key + key_field->offset, key, size);
31917  }
31918  
31919 -static void event_hist_trigger(struct event_trigger_data *data, void *rec)
31920 +static void
31921 +hist_trigger_actions(struct hist_trigger_data *hist_data,
31922 +                    struct tracing_map_elt *elt, void *rec,
31923 +                    struct ring_buffer_event *rbe, u64 *var_ref_vals)
31924 +{
31925 +       struct action_data *data;
31926 +       unsigned int i;
31927 +
31928 +       for (i = 0; i < hist_data->n_actions; i++) {
31929 +               data = hist_data->actions[i];
31930 +               data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
31931 +       }
31932 +}
31933 +
31934 +static void event_hist_trigger(struct event_trigger_data *data, void *rec,
31935 +                              struct ring_buffer_event *rbe)
31936  {
31937         struct hist_trigger_data *hist_data = data->private_data;
31938         bool use_compound_key = (hist_data->n_keys > 1);
31939         unsigned long entries[HIST_STACKTRACE_DEPTH];
31940 +       u64 var_ref_vals[TRACING_MAP_VARS_MAX];
31941         char compound_key[HIST_KEY_SIZE_MAX];
31942 +       struct tracing_map_elt *elt = NULL;
31943         struct stack_trace stacktrace;
31944         struct hist_field *key_field;
31945 -       struct tracing_map_elt *elt;
31946         u64 field_contents;
31947         void *key = NULL;
31948         unsigned int i;
31949 @@ -906,7 +4660,7 @@
31950  
31951                         key = entries;
31952                 } else {
31953 -                       field_contents = key_field->fn(key_field, rec);
31954 +                       field_contents = key_field->fn(key_field, elt, rbe, rec);
31955                         if (key_field->flags & HIST_FIELD_FL_STRING) {
31956                                 key = (void *)(unsigned long)field_contents;
31957                                 use_compound_key = true;
31958 @@ -921,9 +4675,18 @@
31959         if (use_compound_key)
31960                 key = compound_key;
31961  
31962 +       if (hist_data->n_var_refs &&
31963 +           !resolve_var_refs(hist_data, key, var_ref_vals, false))
31964 +               return;
31965 +
31966         elt = tracing_map_insert(hist_data->map, key);
31967 -       if (elt)
31968 -               hist_trigger_elt_update(hist_data, elt, rec);
31969 +       if (!elt)
31970 +               return;
31971 +
31972 +       hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
31973 +
31974 +       if (resolve_var_refs(hist_data, key, var_ref_vals, true))
31975 +               hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
31976  }
31977  
31978  static void hist_trigger_stacktrace_print(struct seq_file *m,
31979 @@ -952,6 +4715,7 @@
31980         struct hist_field *key_field;
31981         char str[KSYM_SYMBOL_LEN];
31982         bool multiline = false;
31983 +       const char *field_name;
31984         unsigned int i;
31985         u64 uval;
31986  
31987 @@ -963,26 +4727,33 @@
31988                 if (i > hist_data->n_vals)
31989                         seq_puts(m, ", ");
31990  
31991 +               field_name = hist_field_name(key_field, 0);
31992 +
31993                 if (key_field->flags & HIST_FIELD_FL_HEX) {
31994                         uval = *(u64 *)(key + key_field->offset);
31995 -                       seq_printf(m, "%s: %llx",
31996 -                                  key_field->field->name, uval);
31997 +                       seq_printf(m, "%s: %llx", field_name, uval);
31998                 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
31999                         uval = *(u64 *)(key + key_field->offset);
32000                         sprint_symbol_no_offset(str, uval);
32001 -                       seq_printf(m, "%s: [%llx] %-45s",
32002 -                                  key_field->field->name, uval, str);
32003 +                       seq_printf(m, "%s: [%llx] %-45s", field_name,
32004 +                                  uval, str);
32005                 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
32006                         uval = *(u64 *)(key + key_field->offset);
32007                         sprint_symbol(str, uval);
32008 -                       seq_printf(m, "%s: [%llx] %-55s",
32009 -                                  key_field->field->name, uval, str);
32010 +                       seq_printf(m, "%s: [%llx] %-55s", field_name,
32011 +                                  uval, str);
32012                 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
32013 -                       char *comm = elt->private_data;
32014 +                       struct hist_elt_data *elt_data = elt->private_data;
32015 +                       char *comm;
32016 +
32017 +                       if (WARN_ON_ONCE(!elt_data))
32018 +                               return;
32019 +
32020 +                       comm = elt_data->comm;
32021  
32022                         uval = *(u64 *)(key + key_field->offset);
32023 -                       seq_printf(m, "%s: %-16s[%10llu]",
32024 -                                  key_field->field->name, comm, uval);
32025 +                       seq_printf(m, "%s: %-16s[%10llu]", field_name,
32026 +                                  comm, uval);
32027                 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
32028                         const char *syscall_name;
32029  
32030 @@ -991,8 +4762,8 @@
32031                         if (!syscall_name)
32032                                 syscall_name = "unknown_syscall";
32033  
32034 -                       seq_printf(m, "%s: %-30s[%3llu]",
32035 -                                  key_field->field->name, syscall_name, uval);
32036 +                       seq_printf(m, "%s: %-30s[%3llu]", field_name,
32037 +                                  syscall_name, uval);
32038                 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
32039                         seq_puts(m, "stacktrace:\n");
32040                         hist_trigger_stacktrace_print(m,
32041 @@ -1000,15 +4771,14 @@
32042                                                       HIST_STACKTRACE_DEPTH);
32043                         multiline = true;
32044                 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
32045 -                       seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
32046 +                       seq_printf(m, "%s: ~ 2^%-2llu", field_name,
32047                                    *(u64 *)(key + key_field->offset));
32048                 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
32049 -                       seq_printf(m, "%s: %-50s", key_field->field->name,
32050 +                       seq_printf(m, "%s: %-50s", field_name,
32051                                    (char *)(key + key_field->offset));
32052                 } else {
32053                         uval = *(u64 *)(key + key_field->offset);
32054 -                       seq_printf(m, "%s: %10llu", key_field->field->name,
32055 -                                  uval);
32056 +                       seq_printf(m, "%s: %10llu", field_name, uval);
32057                 }
32058         }
32059  
32060 @@ -1021,17 +4791,23 @@
32061                    tracing_map_read_sum(elt, HITCOUNT_IDX));
32062  
32063         for (i = 1; i < hist_data->n_vals; i++) {
32064 +               field_name = hist_field_name(hist_data->fields[i], 0);
32065 +
32066 +               if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
32067 +                   hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
32068 +                       continue;
32069 +
32070                 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
32071 -                       seq_printf(m, "  %s: %10llx",
32072 -                                  hist_data->fields[i]->field->name,
32073 +                       seq_printf(m, "  %s: %10llx", field_name,
32074                                    tracing_map_read_sum(elt, i));
32075                 } else {
32076 -                       seq_printf(m, "  %s: %10llu",
32077 -                                  hist_data->fields[i]->field->name,
32078 +                       seq_printf(m, "  %s: %10llu", field_name,
32079                                    tracing_map_read_sum(elt, i));
32080                 }
32081         }
32082  
32083 +       print_actions(m, hist_data, elt);
32084 +
32085         seq_puts(m, "\n");
32086  }
32087  
32088 @@ -1102,6 +4878,11 @@
32089                         hist_trigger_show(m, data, n++);
32090         }
32091  
32092 +       if (have_hist_err()) {
32093 +               seq_printf(m, "\nERROR: %s\n", hist_err_str);
32094 +               seq_printf(m, "  Last command: %s\n", last_hist_cmd);
32095 +       }
32096 +
32097   out_unlock:
32098         mutex_unlock(&event_mutex);
32099  
32100 @@ -1120,34 +4901,31 @@
32101         .release = single_release,
32102  };
32103  
32104 -static const char *get_hist_field_flags(struct hist_field *hist_field)
32105 +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32106  {
32107 -       const char *flags_str = NULL;
32108 +       const char *field_name = hist_field_name(hist_field, 0);
32109  
32110 -       if (hist_field->flags & HIST_FIELD_FL_HEX)
32111 -               flags_str = "hex";
32112 -       else if (hist_field->flags & HIST_FIELD_FL_SYM)
32113 -               flags_str = "sym";
32114 -       else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
32115 -               flags_str = "sym-offset";
32116 -       else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
32117 -               flags_str = "execname";
32118 -       else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
32119 -               flags_str = "syscall";
32120 -       else if (hist_field->flags & HIST_FIELD_FL_LOG2)
32121 -               flags_str = "log2";
32122 +       if (hist_field->var.name)
32123 +               seq_printf(m, "%s=", hist_field->var.name);
32124  
32125 -       return flags_str;
32126 -}
32127 +       if (hist_field->flags & HIST_FIELD_FL_CPU)
32128 +               seq_puts(m, "cpu");
32129 +       else if (field_name) {
32130 +               if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
32131 +                   hist_field->flags & HIST_FIELD_FL_ALIAS)
32132 +                       seq_putc(m, '$');
32133 +               seq_printf(m, "%s", field_name);
32134 +       } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
32135 +               seq_puts(m, "common_timestamp");
32136  
32137 -static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
32138 -{
32139 -       seq_printf(m, "%s", hist_field->field->name);
32140         if (hist_field->flags) {
32141 -               const char *flags_str = get_hist_field_flags(hist_field);
32142 +               if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
32143 +                   !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
32144 +                       const char *flags = get_hist_field_flags(hist_field);
32145  
32146 -               if (flags_str)
32147 -                       seq_printf(m, ".%s", flags_str);
32148 +                       if (flags)
32149 +                               seq_printf(m, ".%s", flags);
32150 +               }
32151         }
32152  }
32153  
32154 @@ -1156,7 +4934,8 @@
32155                                     struct event_trigger_data *data)
32156  {
32157         struct hist_trigger_data *hist_data = data->private_data;
32158 -       struct hist_field *key_field;
32159 +       struct hist_field *field;
32160 +       bool have_var = false;
32161         unsigned int i;
32162  
32163         seq_puts(m, "hist:");
32164 @@ -1167,25 +4946,47 @@
32165         seq_puts(m, "keys=");
32166  
32167         for_each_hist_key_field(i, hist_data) {
32168 -               key_field = hist_data->fields[i];
32169 +               field = hist_data->fields[i];
32170  
32171                 if (i > hist_data->n_vals)
32172                         seq_puts(m, ",");
32173  
32174 -               if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
32175 +               if (field->flags & HIST_FIELD_FL_STACKTRACE)
32176                         seq_puts(m, "stacktrace");
32177                 else
32178 -                       hist_field_print(m, key_field);
32179 +                       hist_field_print(m, field);
32180         }
32181  
32182         seq_puts(m, ":vals=");
32183  
32184         for_each_hist_val_field(i, hist_data) {
32185 +               field = hist_data->fields[i];
32186 +               if (field->flags & HIST_FIELD_FL_VAR) {
32187 +                       have_var = true;
32188 +                       continue;
32189 +               }
32190 +
32191                 if (i == HITCOUNT_IDX)
32192                         seq_puts(m, "hitcount");
32193                 else {
32194                         seq_puts(m, ",");
32195 -                       hist_field_print(m, hist_data->fields[i]);
32196 +                       hist_field_print(m, field);
32197 +               }
32198 +       }
32199 +
32200 +       if (have_var) {
32201 +               unsigned int n = 0;
32202 +
32203 +               seq_puts(m, ":");
32204 +
32205 +               for_each_hist_val_field(i, hist_data) {
32206 +                       field = hist_data->fields[i];
32207 +
32208 +                       if (field->flags & HIST_FIELD_FL_VAR) {
32209 +                               if (n++)
32210 +                                       seq_puts(m, ",");
32211 +                               hist_field_print(m, field);
32212 +                       }
32213                 }
32214         }
32215  
32216 @@ -1193,28 +4994,36 @@
32217  
32218         for (i = 0; i < hist_data->n_sort_keys; i++) {
32219                 struct tracing_map_sort_key *sort_key;
32220 +               unsigned int idx, first_key_idx;
32221 +
32222 +               /* skip VAR vals */
32223 +               first_key_idx = hist_data->n_vals - hist_data->n_vars;
32224  
32225                 sort_key = &hist_data->sort_keys[i];
32226 +               idx = sort_key->field_idx;
32227 +
32228 +               if (WARN_ON(idx >= HIST_FIELDS_MAX))
32229 +                       return -EINVAL;
32230  
32231                 if (i > 0)
32232                         seq_puts(m, ",");
32233  
32234 -               if (sort_key->field_idx == HITCOUNT_IDX)
32235 +               if (idx == HITCOUNT_IDX)
32236                         seq_puts(m, "hitcount");
32237                 else {
32238 -                       unsigned int idx = sort_key->field_idx;
32239 -
32240 -                       if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
32241 -                               return -EINVAL;
32242 -
32243 +                       if (idx >= first_key_idx)
32244 +                               idx += hist_data->n_vars;
32245                         hist_field_print(m, hist_data->fields[idx]);
32246                 }
32247  
32248                 if (sort_key->descending)
32249                         seq_puts(m, ".descending");
32250         }
32251 -
32252         seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
32253 +       if (hist_data->enable_timestamps)
32254 +               seq_printf(m, ":clock=%s", hist_data->attrs->clock);
32255 +
32256 +       print_actions_spec(m, hist_data);
32257  
32258         if (data->filter_str)
32259                 seq_printf(m, " if %s", data->filter_str);
32260 @@ -1242,6 +5051,21 @@
32261         return 0;
32262  }
32263  
32264 +static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
32265 +{
32266 +       struct trace_event_file *file;
32267 +       unsigned int i;
32268 +       char *cmd;
32269 +       int ret;
32270 +
32271 +       for (i = 0; i < hist_data->n_field_var_hists; i++) {
32272 +               file = hist_data->field_var_hists[i]->hist_data->event_file;
32273 +               cmd = hist_data->field_var_hists[i]->cmd;
32274 +               ret = event_hist_trigger_func(&trigger_hist_cmd, file,
32275 +                                             "!hist", "hist", cmd);
32276 +       }
32277 +}
32278 +
32279  static void event_hist_trigger_free(struct event_trigger_ops *ops,
32280                                     struct event_trigger_data *data)
32281  {
32282 @@ -1254,7 +5078,13 @@
32283         if (!data->ref) {
32284                 if (data->name)
32285                         del_named_trigger(data);
32286 +
32287                 trigger_data_free(data);
32288 +
32289 +               remove_hist_vars(hist_data);
32290 +
32291 +               unregister_field_var_hists(hist_data);
32292 +
32293                 destroy_hist_data(hist_data);
32294         }
32295  }
32296 @@ -1381,6 +5211,15 @@
32297                         return false;
32298                 if (key_field->offset != key_field_test->offset)
32299                         return false;
32300 +               if (key_field->size != key_field_test->size)
32301 +                       return false;
32302 +               if (key_field->is_signed != key_field_test->is_signed)
32303 +                       return false;
32304 +               if (!!key_field->var.name != !!key_field_test->var.name)
32305 +                       return false;
32306 +               if (key_field->var.name &&
32307 +                   strcmp(key_field->var.name, key_field_test->var.name) != 0)
32308 +                       return false;
32309         }
32310  
32311         for (i = 0; i < hist_data->n_sort_keys; i++) {
32312 @@ -1396,6 +5235,9 @@
32313             (strcmp(data->filter_str, data_test->filter_str) != 0))
32314                 return false;
32315  
32316 +       if (!actions_match(hist_data, hist_data_test))
32317 +               return false;
32318 +
32319         return true;
32320  }
32321  
32322 @@ -1412,6 +5254,7 @@
32323                 if (named_data) {
32324                         if (!hist_trigger_match(data, named_data, named_data,
32325                                                 true)) {
32326 +                               hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
32327                                 ret = -EINVAL;
32328                                 goto out;
32329                         }
32330 @@ -1431,13 +5274,16 @@
32331                                 test->paused = false;
32332                         else if (hist_data->attrs->clear)
32333                                 hist_clear(test);
32334 -                       else
32335 +                       else {
32336 +                               hist_err("Hist trigger already exists", NULL);
32337                                 ret = -EEXIST;
32338 +                       }
32339                         goto out;
32340                 }
32341         }
32342   new:
32343         if (hist_data->attrs->cont || hist_data->attrs->clear) {
32344 +               hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
32345                 ret = -ENOENT;
32346                 goto out;
32347         }
32348 @@ -1446,7 +5292,6 @@
32349                 data->paused = true;
32350  
32351         if (named_data) {
32352 -               destroy_hist_data(data->private_data);
32353                 data->private_data = named_data->private_data;
32354                 set_named_trigger_data(data, named_data);
32355                 data->ops = &event_hist_trigger_named_ops;
32356 @@ -1458,8 +5303,32 @@
32357                         goto out;
32358         }
32359  
32360 -       list_add_rcu(&data->list, &file->triggers);
32361 +       if (hist_data->enable_timestamps) {
32362 +               char *clock = hist_data->attrs->clock;
32363 +
32364 +               ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
32365 +               if (ret) {
32366 +                       hist_err("Couldn't set trace_clock: ", clock);
32367 +                       goto out;
32368 +               }
32369 +
32370 +               tracing_set_time_stamp_abs(file->tr, true);
32371 +       }
32372 +
32373 +       if (named_data)
32374 +               destroy_hist_data(hist_data);
32375 +
32376         ret++;
32377 + out:
32378 +       return ret;
32379 +}
32380 +
32381 +static int hist_trigger_enable(struct event_trigger_data *data,
32382 +                              struct trace_event_file *file)
32383 +{
32384 +       int ret = 0;
32385 +
32386 +       list_add_tail_rcu(&data->list, &file->triggers);
32387  
32388         update_cond_flag(file);
32389  
32390 @@ -1468,10 +5337,55 @@
32391                 update_cond_flag(file);
32392                 ret--;
32393         }
32394 - out:
32395 +
32396         return ret;
32397  }
32398  
32399 +static bool have_hist_trigger_match(struct event_trigger_data *data,
32400 +                                   struct trace_event_file *file)
32401 +{
32402 +       struct hist_trigger_data *hist_data = data->private_data;
32403 +       struct event_trigger_data *test, *named_data = NULL;
32404 +       bool match = false;
32405 +
32406 +       if (hist_data->attrs->name)
32407 +               named_data = find_named_trigger(hist_data->attrs->name);
32408 +
32409 +       list_for_each_entry_rcu(test, &file->triggers, list) {
32410 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32411 +                       if (hist_trigger_match(data, test, named_data, false)) {
32412 +                               match = true;
32413 +                               break;
32414 +                       }
32415 +               }
32416 +       }
32417 +
32418 +       return match;
32419 +}
32420 +
32421 +static bool hist_trigger_check_refs(struct event_trigger_data *data,
32422 +                                   struct trace_event_file *file)
32423 +{
32424 +       struct hist_trigger_data *hist_data = data->private_data;
32425 +       struct event_trigger_data *test, *named_data = NULL;
32426 +
32427 +       if (hist_data->attrs->name)
32428 +               named_data = find_named_trigger(hist_data->attrs->name);
32429 +
32430 +       list_for_each_entry_rcu(test, &file->triggers, list) {
32431 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32432 +                       if (!hist_trigger_match(data, test, named_data, false))
32433 +                               continue;
32434 +                       hist_data = test->private_data;
32435 +                       if (check_var_refs(hist_data))
32436 +                               return true;
32437 +                       break;
32438 +               }
32439 +       }
32440 +
32441 +       return false;
32442 +}
32443 +
32444  static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
32445                                     struct event_trigger_data *data,
32446                                     struct trace_event_file *file)
32447 @@ -1497,17 +5411,55 @@
32448  
32449         if (unregistered && test->ops->free)
32450                 test->ops->free(test->ops, test);
32451 +
32452 +       if (hist_data->enable_timestamps) {
32453 +               if (!hist_data->remove || unregistered)
32454 +                       tracing_set_time_stamp_abs(file->tr, false);
32455 +       }
32456 +}
32457 +
32458 +static bool hist_file_check_refs(struct trace_event_file *file)
32459 +{
32460 +       struct hist_trigger_data *hist_data;
32461 +       struct event_trigger_data *test;
32462 +
32463 +       list_for_each_entry_rcu(test, &file->triggers, list) {
32464 +               if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32465 +                       hist_data = test->private_data;
32466 +                       if (check_var_refs(hist_data))
32467 +                               return true;
32468 +               }
32469 +       }
32470 +
32471 +       return false;
32472  }
32473  
32474  static void hist_unreg_all(struct trace_event_file *file)
32475  {
32476         struct event_trigger_data *test, *n;
32477 +       struct hist_trigger_data *hist_data;
32478 +       struct synth_event *se;
32479 +       const char *se_name;
32480 +
32481 +       if (hist_file_check_refs(file))
32482 +               return;
32483  
32484         list_for_each_entry_safe(test, n, &file->triggers, list) {
32485                 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
32486 +                       hist_data = test->private_data;
32487                         list_del_rcu(&test->list);
32488                         trace_event_trigger_enable_disable(file, 0);
32489 +
32490 +                       mutex_lock(&synth_event_mutex);
32491 +                       se_name = trace_event_name(file->event_call);
32492 +                       se = find_synth_event(se_name);
32493 +                       if (se)
32494 +                               se->ref--;
32495 +                       mutex_unlock(&synth_event_mutex);
32496 +
32497                         update_cond_flag(file);
32498 +                       if (hist_data->enable_timestamps)
32499 +                               tracing_set_time_stamp_abs(file->tr, false);
32500                         if (test->ops->free)
32501                                 test->ops->free(test->ops, test);
32502                 }
32503 @@ -1523,16 +5475,54 @@
32504         struct hist_trigger_attrs *attrs;
32505         struct event_trigger_ops *trigger_ops;
32506         struct hist_trigger_data *hist_data;
32507 -       char *trigger;
32508 +       struct synth_event *se;
32509 +       const char *se_name;
32510 +       bool remove = false;
32511 +       char *trigger, *p;
32512         int ret = 0;
32513  
32514 +       if (glob && strlen(glob)) {
32515 +               last_cmd_set(param);
32516 +               hist_err_clear();
32517 +       }
32518 +
32519         if (!param)
32520                 return -EINVAL;
32521  
32522 -       /* separate the trigger from the filter (k:v [if filter]) */
32523 -       trigger = strsep(&param, " \t");
32524 -       if (!trigger)
32525 -               return -EINVAL;
32526 +       if (glob[0] == '!')
32527 +               remove = true;
32528 +
32529 +       /*
32530 +        * separate the trigger from the filter (k:v [if filter])
32531 +        * allowing for whitespace in the trigger
32532 +        */
32533 +       p = trigger = param;
32534 +       do {
32535 +               p = strstr(p, "if");
32536 +               if (!p)
32537 +                       break;
32538 +               if (p == param)
32539 +                       return -EINVAL;
32540 +               if (*(p - 1) != ' ' && *(p - 1) != '\t') {
32541 +                       p++;
32542 +                       continue;
32543 +               }
32544 +               if (p >= param + strlen(param) - strlen("if") - 1)
32545 +                       return -EINVAL;
32546 +               if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
32547 +                       p++;
32548 +                       continue;
32549 +               }
32550 +               break;
32551 +       } while (p);
32552 +
32553 +       if (!p)
32554 +               param = NULL;
32555 +       else {
32556 +               *(p - 1) = '\0';
32557 +               param = strstrip(p);
32558 +               trigger = strstrip(trigger);
32559 +       }
32560  
32561         attrs = parse_hist_trigger_attrs(trigger);
32562         if (IS_ERR(attrs))
32563 @@ -1541,7 +5531,7 @@
32564         if (attrs->map_bits)
32565                 hist_trigger_bits = attrs->map_bits;
32566  
32567 -       hist_data = create_hist_data(hist_trigger_bits, attrs, file);
32568 +       hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
32569         if (IS_ERR(hist_data)) {
32570                 destroy_hist_trigger_attrs(attrs);
32571                 return PTR_ERR(hist_data);
32572 @@ -1549,10 +5539,11 @@
32573  
32574         trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
32575  
32576 -       ret = -ENOMEM;
32577         trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
32578 -       if (!trigger_data)
32579 +       if (!trigger_data) {
32580 +               ret = -ENOMEM;
32581                 goto out_free;
32582 +       }
32583  
32584         trigger_data->count = -1;
32585         trigger_data->ops = trigger_ops;
32586 @@ -1570,8 +5561,24 @@
32587                         goto out_free;
32588         }
32589  
32590 -       if (glob[0] == '!') {
32591 +       if (remove) {
32592 +               if (!have_hist_trigger_match(trigger_data, file))
32593 +                       goto out_free;
32594 +
32595 +               if (hist_trigger_check_refs(trigger_data, file)) {
32596 +                       ret = -EBUSY;
32597 +                       goto out_free;
32598 +               }
32599 +
32600                 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32601 +
32602 +               mutex_lock(&synth_event_mutex);
32603 +               se_name = trace_event_name(file->event_call);
32604 +               se = find_synth_event(se_name);
32605 +               if (se)
32606 +                       se->ref--;
32607 +               mutex_unlock(&synth_event_mutex);
32608 +
32609                 ret = 0;
32610                 goto out_free;
32611         }
32612 @@ -1588,14 +5595,47 @@
32613                 goto out_free;
32614         } else if (ret < 0)
32615                 goto out_free;
32616 +
32617 +       if (get_named_trigger_data(trigger_data))
32618 +               goto enable;
32619 +
32620 +       if (has_hist_vars(hist_data))
32621 +               save_hist_vars(hist_data);
32622 +
32623 +       ret = create_actions(hist_data, file);
32624 +       if (ret)
32625 +               goto out_unreg;
32626 +
32627 +       ret = tracing_map_init(hist_data->map);
32628 +       if (ret)
32629 +               goto out_unreg;
32630 +enable:
32631 +       ret = hist_trigger_enable(trigger_data, file);
32632 +       if (ret)
32633 +               goto out_unreg;
32634 +
32635 +       mutex_lock(&synth_event_mutex);
32636 +       se_name = trace_event_name(file->event_call);
32637 +       se = find_synth_event(se_name);
32638 +       if (se)
32639 +               se->ref++;
32640 +       mutex_unlock(&synth_event_mutex);
32641 +
32642         /* Just return zero, not the number of registered triggers */
32643         ret = 0;
32644   out:
32645 +       if (ret == 0)
32646 +               hist_err_clear();
32647 +
32648         return ret;
32649 + out_unreg:
32650 +       cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
32651   out_free:
32652         if (cmd_ops->set_filter)
32653                 cmd_ops->set_filter(NULL, trigger_data, NULL);
32654  
32655 +       remove_hist_vars(hist_data);
32656 +
32657         kfree(trigger_data);
32658  
32659         destroy_hist_data(hist_data);
32660 @@ -1625,7 +5665,8 @@
32661  }
32662  
32663  static void
32664 -hist_enable_trigger(struct event_trigger_data *data, void *rec)
32665 +hist_enable_trigger(struct event_trigger_data *data, void *rec,
32666 +                   struct ring_buffer_event *event)
32667  {
32668         struct enable_trigger_data *enable_data = data->private_data;
32669         struct event_trigger_data *test;
32670 @@ -1641,7 +5682,8 @@
32671  }
32672  
32673  static void
32674 -hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
32675 +hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
32676 +                         struct ring_buffer_event *event)
32677  {
32678         if (!data->count)
32679                 return;
32680 @@ -1649,7 +5691,7 @@
32681         if (data->count != -1)
32682                 (data->count)--;
32683  
32684 -       hist_enable_trigger(data, rec);
32685 +       hist_enable_trigger(data, rec, event);
32686  }
32687  
32688  static struct event_trigger_ops hist_enable_trigger_ops = {
32689 @@ -1754,3 +5796,31 @@
32690  
32691         return ret;
32692  }
32693 +
32694 +static __init int trace_events_hist_init(void)
32695 +{
32696 +       struct dentry *entry = NULL;
32697 +       struct dentry *d_tracer;
32698 +       int err = 0;
32699 +
32700 +       d_tracer = tracing_init_dentry();
32701 +       if (IS_ERR(d_tracer)) {
32702 +               err = PTR_ERR(d_tracer);
32703 +               goto err;
32704 +       }
32705 +
32706 +       entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
32707 +                                   NULL, &synth_events_fops);
32708 +       if (!entry) {
32709 +               err = -ENODEV;
32710 +               goto err;
32711 +       }
32712 +
32713 +       return err;
32714 + err:
32715 +       pr_warn("Could not create tracefs 'synthetic_events' entry\n");
32716 +
32717 +       return err;
32718 +}
32719 +
32720 +fs_initcall(trace_events_hist_init);
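The hunks above teach the hist trigger parser (event_hist_trigger_func() in this file) to find the "if" that separates a trigger spec from its filter while tolerating whitespace inside the spec itself ("k:v [if filter]"). As a rough, standalone illustration of that separator logic only — split_trigger_filter() and the sample command below are stand-in names and values, not part of the patch, and the kernel variant additionally returns -EINVAL on malformed input and strstrip()s both halves:

#include <stdio.h>
#include <string.h>

/*
 * Minimal userspace sketch of the separator loop added above: accept an
 * "if" only when it is not the first word and has whitespace on both
 * sides, so the trigger spec may itself contain spaces.  Returns the
 * filter clause (still prefixed by "if"), or NULL when there is none.
 */
static char *split_trigger_filter(char *param)
{
	char *p = param;

	do {
		p = strstr(p, "if");
		if (!p)
			break;
		if (p == param)
			return NULL;		/* command may not start with "if" */
		if (*(p - 1) != ' ' && *(p - 1) != '\t') {
			p++;			/* "if" inside a longer word */
			continue;
		}
		if (p >= param + strlen(param) - strlen("if") - 1)
			return NULL;		/* nothing follows the "if" */
		if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
			p++;			/* e.g. "ifield", keep scanning */
			continue;
		}
		break;
	} while (p);

	if (!p)
		return NULL;

	*(p - 1) = '\0';			/* terminate the trigger part */
	return p;				/* filter clause starts at "if" */
}

int main(void)
{
	char cmd[] = "keys=common_pid:vals=hitcount if comm==\"bash\"";
	char *filter = split_trigger_filter(cmd);

	printf("trigger: %s\n", cmd);
	printf("filter : %s\n", filter ? filter : "(none)");
	return 0;
}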
32721 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_events_trigger.c linux-4.14/kernel/trace/trace_events_trigger.c
32722 --- linux-4.14.orig/kernel/trace/trace_events_trigger.c 2018-09-05 11:03:22.000000000 +0200
32723 +++ linux-4.14/kernel/trace/trace_events_trigger.c      2018-09-05 11:05:07.000000000 +0200
32724 @@ -63,7 +63,8 @@
32725   * any trigger that should be deferred, ETT_NONE if nothing to defer.
32726   */
32727  enum event_trigger_type
32728 -event_triggers_call(struct trace_event_file *file, void *rec)
32729 +event_triggers_call(struct trace_event_file *file, void *rec,
32730 +                   struct ring_buffer_event *event)
32731  {
32732         struct event_trigger_data *data;
32733         enum event_trigger_type tt = ETT_NONE;
32734 @@ -76,7 +77,7 @@
32735                 if (data->paused)
32736                         continue;
32737                 if (!rec) {
32738 -                       data->ops->func(data, rec);
32739 +                       data->ops->func(data, rec, event);
32740                         continue;
32741                 }
32742                 filter = rcu_dereference_sched(data->filter);
32743 @@ -86,7 +87,7 @@
32744                         tt |= data->cmd_ops->trigger_type;
32745                         continue;
32746                 }
32747 -               data->ops->func(data, rec);
32748 +               data->ops->func(data, rec, event);
32749         }
32750         return tt;
32751  }
32752 @@ -108,7 +109,7 @@
32753  void
32754  event_triggers_post_call(struct trace_event_file *file,
32755                          enum event_trigger_type tt,
32756 -                        void *rec)
32757 +                        void *rec, struct ring_buffer_event *event)
32758  {
32759         struct event_trigger_data *data;
32760  
32761 @@ -116,7 +117,7 @@
32762                 if (data->paused)
32763                         continue;
32764                 if (data->cmd_ops->trigger_type & tt)
32765 -                       data->ops->func(data, rec);
32766 +                       data->ops->func(data, rec, event);
32767         }
32768  }
32769  EXPORT_SYMBOL_GPL(event_triggers_post_call);
32770 @@ -914,8 +915,15 @@
32771         data->named_data = named_data;
32772  }
32773  
32774 +struct event_trigger_data *
32775 +get_named_trigger_data(struct event_trigger_data *data)
32776 +{
32777 +       return data->named_data;
32778 +}
32779 +
32780  static void
32781 -traceon_trigger(struct event_trigger_data *data, void *rec)
32782 +traceon_trigger(struct event_trigger_data *data, void *rec,
32783 +               struct ring_buffer_event *event)
32784  {
32785         if (tracing_is_on())
32786                 return;
32787 @@ -924,7 +932,8 @@
32788  }
32789  
32790  static void
32791 -traceon_count_trigger(struct event_trigger_data *data, void *rec)
32792 +traceon_count_trigger(struct event_trigger_data *data, void *rec,
32793 +                     struct ring_buffer_event *event)
32794  {
32795         if (tracing_is_on())
32796                 return;
32797 @@ -939,7 +948,8 @@
32798  }
32799  
32800  static void
32801 -traceoff_trigger(struct event_trigger_data *data, void *rec)
32802 +traceoff_trigger(struct event_trigger_data *data, void *rec,
32803 +                struct ring_buffer_event *event)
32804  {
32805         if (!tracing_is_on())
32806                 return;
32807 @@ -948,7 +958,8 @@
32808  }
32809  
32810  static void
32811 -traceoff_count_trigger(struct event_trigger_data *data, void *rec)
32812 +traceoff_count_trigger(struct event_trigger_data *data, void *rec,
32813 +                      struct ring_buffer_event *event)
32814  {
32815         if (!tracing_is_on())
32816                 return;
32817 @@ -1045,7 +1056,8 @@
32818  
32819  #ifdef CONFIG_TRACER_SNAPSHOT
32820  static void
32821 -snapshot_trigger(struct event_trigger_data *data, void *rec)
32822 +snapshot_trigger(struct event_trigger_data *data, void *rec,
32823 +                struct ring_buffer_event *event)
32824  {
32825         struct trace_event_file *file = data->private_data;
32826  
32827 @@ -1056,7 +1068,8 @@
32828  }
32829  
32830  static void
32831 -snapshot_count_trigger(struct event_trigger_data *data, void *rec)
32832 +snapshot_count_trigger(struct event_trigger_data *data, void *rec,
32833 +                      struct ring_buffer_event *event)
32834  {
32835         if (!data->count)
32836                 return;
32837 @@ -1064,7 +1077,7 @@
32838         if (data->count != -1)
32839                 (data->count)--;
32840  
32841 -       snapshot_trigger(data, rec);
32842 +       snapshot_trigger(data, rec, event);
32843  }
32844  
32845  static int
32846 @@ -1143,13 +1156,15 @@
32847  #define STACK_SKIP 3
32848  
32849  static void
32850 -stacktrace_trigger(struct event_trigger_data *data, void *rec)
32851 +stacktrace_trigger(struct event_trigger_data *data, void *rec,
32852 +                  struct ring_buffer_event *event)
32853  {
32854         trace_dump_stack(STACK_SKIP);
32855  }
32856  
32857  static void
32858 -stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
32859 +stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
32860 +                        struct ring_buffer_event *event)
32861  {
32862         if (!data->count)
32863                 return;
32864 @@ -1157,7 +1172,7 @@
32865         if (data->count != -1)
32866                 (data->count)--;
32867  
32868 -       stacktrace_trigger(data, rec);
32869 +       stacktrace_trigger(data, rec, event);
32870  }
32871  
32872  static int
32873 @@ -1219,7 +1234,8 @@
32874  }
32875  
32876  static void
32877 -event_enable_trigger(struct event_trigger_data *data, void *rec)
32878 +event_enable_trigger(struct event_trigger_data *data, void *rec,
32879 +                    struct ring_buffer_event *event)
32880  {
32881         struct enable_trigger_data *enable_data = data->private_data;
32882  
32883 @@ -1230,7 +1246,8 @@
32884  }
32885  
32886  static void
32887 -event_enable_count_trigger(struct event_trigger_data *data, void *rec)
32888 +event_enable_count_trigger(struct event_trigger_data *data, void *rec,
32889 +                          struct ring_buffer_event *event)
32890  {
32891         struct enable_trigger_data *enable_data = data->private_data;
32892  
32893 @@ -1244,7 +1261,7 @@
32894         if (data->count != -1)
32895                 (data->count)--;
32896  
32897 -       event_enable_trigger(data, rec);
32898 +       event_enable_trigger(data, rec, event);
32899  }
32900  
32901  int event_enable_trigger_print(struct seq_file *m,
32902 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace.h linux-4.14/kernel/trace/trace.h
32903 --- linux-4.14.orig/kernel/trace/trace.h        2018-09-05 11:03:22.000000000 +0200
32904 +++ linux-4.14/kernel/trace/trace.h     2018-09-05 11:05:07.000000000 +0200
32905 @@ -127,6 +127,7 @@
32906   *  NEED_RESCHED       - reschedule is requested
32907   *  HARDIRQ            - inside an interrupt handler
32908   *  SOFTIRQ            - inside a softirq handler
32909 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
32910   */
32911  enum trace_flag_type {
32912         TRACE_FLAG_IRQS_OFF             = 0x01,
32913 @@ -136,6 +137,7 @@
32914         TRACE_FLAG_SOFTIRQ              = 0x10,
32915         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
32916         TRACE_FLAG_NMI                  = 0x40,
32917 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
32918  };
32919  
32920  #define TRACE_BUF_SIZE         1024
32921 @@ -273,6 +275,8 @@
32922         /* function tracing enabled */
32923         int                     function_enabled;
32924  #endif
32925 +       int                     time_stamp_abs_ref;
32926 +       struct list_head        hist_vars;
32927  };
32928  
32929  enum {
32930 @@ -286,6 +290,11 @@
32931  extern int trace_array_get(struct trace_array *tr);
32932  extern void trace_array_put(struct trace_array *tr);
32933  
32934 +extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
32935 +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
32936 +
32937 +extern bool trace_clock_in_ns(struct trace_array *tr);
32938 +
32939  /*
32940   * The global tracer (top) should be the first trace array added,
32941   * but we check the flag anyway.
32942 @@ -1293,7 +1302,7 @@
32943         unsigned long eflags = file->flags;
32944  
32945         if (eflags & EVENT_FILE_FL_TRIGGER_COND)
32946 -               *tt = event_triggers_call(file, entry);
32947 +               *tt = event_triggers_call(file, entry, event);
32948  
32949         if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
32950             (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
32951 @@ -1330,7 +1339,7 @@
32952                 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
32953  
32954         if (tt)
32955 -               event_triggers_post_call(file, tt, entry);
32956 +               event_triggers_post_call(file, tt, entry, event);
32957  }
32958  
32959  /**
32960 @@ -1363,7 +1372,7 @@
32961                                                 irq_flags, pc, regs);
32962  
32963         if (tt)
32964 -               event_triggers_post_call(file, tt, entry);
32965 +               event_triggers_post_call(file, tt, entry, event);
32966  }
32967  
32968  #define FILTER_PRED_INVALID    ((unsigned short)-1)
32969 @@ -1545,6 +1554,8 @@
32970  extern void unpause_named_trigger(struct event_trigger_data *data);
32971  extern void set_named_trigger_data(struct event_trigger_data *data,
32972                                    struct event_trigger_data *named_data);
32973 +extern struct event_trigger_data *
32974 +get_named_trigger_data(struct event_trigger_data *data);
32975  extern int register_event_command(struct event_command *cmd);
32976  extern int unregister_event_command(struct event_command *cmd);
32977  extern int register_trigger_hist_enable_disable_cmds(void);
32978 @@ -1588,7 +1599,8 @@
32979   */
32980  struct event_trigger_ops {
32981         void                    (*func)(struct event_trigger_data *data,
32982 -                                       void *rec);
32983 +                                       void *rec,
32984 +                                       struct ring_buffer_event *rbe);
32985         int                     (*init)(struct event_trigger_ops *ops,
32986                                         struct event_trigger_data *data);
32987         void                    (*free)(struct event_trigger_ops *ops,
32988 @@ -1755,6 +1767,13 @@
32989  int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
32990  int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
32991  
32992 +#define MAX_EVENT_NAME_LEN     64
32993 +
32994 +extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
32995 +extern ssize_t trace_parse_run_command(struct file *file,
32996 +               const char __user *buffer, size_t count, loff_t *ppos,
32997 +               int (*createfn)(int, char**));
32998 +
32999  /*
33000   * Normal trace_printk() and friends allocates special buffers
33001   * to do the manipulation, as well as saves the print formats
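The trace.h hunks above widen the event_trigger_ops ->func() callback to take the ring_buffer_event alongside the record, which is what lets per-event data (such as the common_timestamp handling added earlier) reach the hist code. A minimal compileable sketch of a callback written against the new three-argument shape; the struct definitions below are stand-ins, not the real kernel types:

#include <stdio.h>

/* Stand-in definitions; the real structures live in the kernel headers. */
struct ring_buffer_event { unsigned long long time_delta; };
struct event_trigger_data { int count; void *private_data; };

/* A trigger callback in the new (data, rec, rbe) shape used throughout the patch. */
static void example_trigger(struct event_trigger_data *data, void *rec,
			    struct ring_buffer_event *rbe)
{
	printf("record at %p, ring buffer event at %p (count=%d)\n",
	       rec, (void *)rbe, data->count);
}

/* Callback table analogous to event_trigger_ops.func in the hunk above. */
struct example_trigger_ops {
	void (*func)(struct event_trigger_data *data, void *rec,
		     struct ring_buffer_event *rbe);
};

int main(void)
{
	struct example_trigger_ops ops = { .func = example_trigger };
	struct event_trigger_data data = { .count = -1 };
	struct ring_buffer_event rbe = { .time_delta = 0 };
	int record = 0;

	ops.func(&data, &record, &rbe);
	return 0;
}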
33002 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_hwlat.c linux-4.14/kernel/trace/trace_hwlat.c
33003 --- linux-4.14.orig/kernel/trace/trace_hwlat.c  2017-11-12 19:46:13.000000000 +0100
33004 +++ linux-4.14/kernel/trace/trace_hwlat.c       2018-09-05 11:05:07.000000000 +0200
33005 @@ -279,7 +279,7 @@
33006          * of this thread, then stop migrating for the duration

33007          * of the current test.
33008          */
33009 -       if (!cpumask_equal(current_mask, &current->cpus_allowed))
33010 +       if (!cpumask_equal(current_mask, current->cpus_ptr))
33011                 goto disable;
33012  
33013         get_online_cpus();
33014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_kprobe.c linux-4.14/kernel/trace/trace_kprobe.c
33015 --- linux-4.14.orig/kernel/trace/trace_kprobe.c 2018-09-05 11:03:22.000000000 +0200
33016 +++ linux-4.14/kernel/trace/trace_kprobe.c      2018-09-05 11:05:07.000000000 +0200
33017 @@ -918,8 +918,8 @@
33018  static ssize_t probes_write(struct file *file, const char __user *buffer,
33019                             size_t count, loff_t *ppos)
33020  {
33021 -       return traceprobe_probes_write(file, buffer, count, ppos,
33022 -                       create_trace_kprobe);
33023 +       return trace_parse_run_command(file, buffer, count, ppos,
33024 +                                      create_trace_kprobe);
33025  }
33026  
33027  static const struct file_operations kprobe_events_ops = {
33028 @@ -1444,9 +1444,9 @@
33029  
33030         pr_info("Testing kprobe tracing: ");
33031  
33032 -       ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33033 -                                 "$stack $stack0 +0($stack)",
33034 -                                 create_trace_kprobe);
33035 +       ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33036 +                               "$stack $stack0 +0($stack)",
33037 +                               create_trace_kprobe);
33038         if (WARN_ON_ONCE(ret)) {
33039                 pr_warn("error on probing function entry.\n");
33040                 warn++;
33041 @@ -1466,8 +1466,8 @@
33042                 }
33043         }
33044  
33045 -       ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33046 -                                 "$retval", create_trace_kprobe);
33047 +       ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33048 +                               "$retval", create_trace_kprobe);
33049         if (WARN_ON_ONCE(ret)) {
33050                 pr_warn("error on probing function return.\n");
33051                 warn++;
33052 @@ -1537,13 +1537,13 @@
33053                         disable_trace_kprobe(tk, file);
33054         }
33055  
33056 -       ret = traceprobe_command("-:testprobe", create_trace_kprobe);
33057 +       ret = trace_run_command("-:testprobe", create_trace_kprobe);
33058         if (WARN_ON_ONCE(ret)) {
33059                 pr_warn("error on deleting a probe.\n");
33060                 warn++;
33061         }
33062  
33063 -       ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
33064 +       ret = trace_run_command("-:testprobe2", create_trace_kprobe);
33065         if (WARN_ON_ONCE(ret)) {
33066                 pr_warn("error on deleting a probe.\n");
33067                 warn++;
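The selftest above now creates and deletes probes through trace_run_command(), whose prototype appears in the trace.h hunk earlier; its body is the argv-splitting helper that the trace_probe.c hunk below removes (traceprobe_command()). A loose userspace model of that contract — run_command() and create_probe() here are illustrative stand-ins, not the kernel functions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* createfn in the style expected by trace_run_command(): argc/argv in, 0 on success. */
static int create_probe(int argc, char **argv)
{
	int i;

	for (i = 0; i < argc; i++)
		printf("arg[%d] = %s\n", i, argv[i]);
	return 0;
}

/*
 * Stand-in for trace_run_command(): split the command on whitespace and
 * hand argc/argv to the callback, mirroring the argv_split()-based helper
 * this patch consolidates into trace.c.
 */
static int run_command(const char *buf, int (*createfn)(int, char **))
{
	char *copy = strdup(buf), *tok, *save = NULL;
	char *argv[32];
	int argc = 0, ret = 0;

	if (!copy)
		return -1;

	for (tok = strtok_r(copy, " \t", &save);
	     tok && argc < 32;
	     tok = strtok_r(NULL, " \t", &save))
		argv[argc++] = tok;

	if (argc)
		ret = createfn(argc, argv);

	free(copy);
	return ret;
}

int main(void)
{
	return run_command("p:testprobe kprobe_trace_selftest_target $stack $stack0", create_probe);
}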
33068 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_output.c linux-4.14/kernel/trace/trace_output.c
33069 --- linux-4.14.orig/kernel/trace/trace_output.c 2018-09-05 11:03:22.000000000 +0200
33070 +++ linux-4.14/kernel/trace/trace_output.c      2018-09-05 11:05:07.000000000 +0200
33071 @@ -447,6 +447,7 @@
33072  {
33073         char hardsoft_irq;
33074         char need_resched;
33075 +       char need_resched_lazy;
33076         char irqs_off;
33077         int hardirq;
33078         int softirq;
33079 @@ -477,6 +478,9 @@
33080                 break;
33081         }
33082  
33083 +       need_resched_lazy =
33084 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
33085 +
33086         hardsoft_irq =
33087                 (nmi && hardirq)     ? 'Z' :
33088                 nmi                  ? 'z' :
33089 @@ -485,14 +489,25 @@
33090                 softirq              ? 's' :
33091                                        '.' ;
33092  
33093 -       trace_seq_printf(s, "%c%c%c",
33094 -                        irqs_off, need_resched, hardsoft_irq);
33095 +       trace_seq_printf(s, "%c%c%c%c",
33096 +                        irqs_off, need_resched, need_resched_lazy,
33097 +                        hardsoft_irq);
33098  
33099         if (entry->preempt_count)
33100                 trace_seq_printf(s, "%x", entry->preempt_count);
33101         else
33102                 trace_seq_putc(s, '.');
33103  
33104 +       if (entry->preempt_lazy_count)
33105 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
33106 +       else
33107 +               trace_seq_putc(s, '.');
33108 +
33109 +       if (entry->migrate_disable)
33110 +               trace_seq_printf(s, "%x", entry->migrate_disable);
33111 +       else
33112 +               trace_seq_putc(s, '.');
33113 +
33114         return !trace_seq_has_overflowed(s);
33115  }
33116  
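With the hunk above, the latency-format flag field grows from three characters to four (irqs-off, need-resched, need-resched-lazy, hardirq/softirq) and is followed by three counters — preempt_count, preempt_lazy_count and migrate_disable — each printed in hex or as '.' when zero. The small sketch below reproduces just that formatting; the helper name and the sample flag characters are illustrative only:

#include <stdio.h>

/*
 * Illustration of the widened latency field: a need_resched_lazy column
 * ('L' or '.') after need_resched, then three count columns.
 */
static void print_lat_fmt(char irqs_off, char need_resched,
			  char need_resched_lazy, char hardsoft_irq,
			  unsigned int preempt_count,
			  unsigned int preempt_lazy_count,
			  unsigned int migrate_disable)
{
	printf("%c%c%c%c", irqs_off, need_resched, need_resched_lazy, hardsoft_irq);

	if (preempt_count)
		printf("%x", preempt_count);
	else
		putchar('.');

	if (preempt_lazy_count)
		printf("%x", preempt_lazy_count);
	else
		putchar('.');

	if (migrate_disable)
		printf("%x", migrate_disable);
	else
		putchar('.');

	putchar('\n');
}

int main(void)
{
	/* e.g. irqs off, lazy reschedule requested, preempt_count 1, migrate_disable 1 */
	print_lat_fmt('d', '.', 'L', '.', 1, 0, 1);	/* prints "d.L.1.1" */
	return 0;
}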
33117 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.c linux-4.14/kernel/trace/trace_probe.c
33118 --- linux-4.14.orig/kernel/trace/trace_probe.c  2018-09-05 11:03:22.000000000 +0200
33119 +++ linux-4.14/kernel/trace/trace_probe.c       2018-09-05 11:05:07.000000000 +0200
33120 @@ -621,92 +621,6 @@
33121         kfree(arg->comm);
33122  }
33123  
33124 -int traceprobe_command(const char *buf, int (*createfn)(int, char **))
33125 -{
33126 -       char **argv;
33127 -       int argc, ret;
33128 -
33129 -       argc = 0;
33130 -       ret = 0;
33131 -       argv = argv_split(GFP_KERNEL, buf, &argc);
33132 -       if (!argv)
33133 -               return -ENOMEM;
33134 -
33135 -       if (argc)
33136 -               ret = createfn(argc, argv);
33137 -
33138 -       argv_free(argv);
33139 -
33140 -       return ret;
33141 -}
33142 -
33143 -#define WRITE_BUFSIZE  4096
33144 -
33145 -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
33146 -                               size_t count, loff_t *ppos,
33147 -                               int (*createfn)(int, char **))
33148 -{
33149 -       char *kbuf, *buf, *tmp;
33150 -       int ret = 0;
33151 -       size_t done = 0;
33152 -       size_t size;
33153 -
33154 -       kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
33155 -       if (!kbuf)
33156 -               return -ENOMEM;
33157 -
33158 -       while (done < count) {
33159 -               size = count - done;
33160 -
33161 -               if (size >= WRITE_BUFSIZE)
33162 -                       size = WRITE_BUFSIZE - 1;
33163 -
33164 -               if (copy_from_user(kbuf, buffer + done, size)) {
33165 -                       ret = -EFAULT;
33166 -                       goto out;
33167 -               }
33168 -               kbuf[size] = '\0';
33169 -               buf = kbuf;
33170 -               do {
33171 -                       tmp = strchr(buf, '\n');
33172 -                       if (tmp) {
33173 -                               *tmp = '\0';
33174 -                               size = tmp - buf + 1;
33175 -                       } else {
33176 -                               size = strlen(buf);
33177 -                               if (done + size < count) {
33178 -                                       if (buf != kbuf)
33179 -                                               break;
33180 -                                       /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
33181 -                                       pr_warn("Line length is too long: Should be less than %d\n",
33182 -                                               WRITE_BUFSIZE - 2);
33183 -                                       ret = -EINVAL;
33184 -                                       goto out;
33185 -                               }
33186 -                       }
33187 -                       done += size;
33188 -
33189 -                       /* Remove comments */
33190 -                       tmp = strchr(buf, '#');
33191 -
33192 -                       if (tmp)
33193 -                               *tmp = '\0';
33194 -
33195 -                       ret = traceprobe_command(buf, createfn);
33196 -                       if (ret)
33197 -                               goto out;
33198 -                       buf += size;
33199 -
33200 -               } while (done < count);
33201 -       }
33202 -       ret = done;
33203 -
33204 -out:
33205 -       kfree(kbuf);
33206 -
33207 -       return ret;
33208 -}
33209 -
33210  static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
33211                            bool is_return)
33212  {
33213 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_probe.h linux-4.14/kernel/trace/trace_probe.h
33214 --- linux-4.14.orig/kernel/trace/trace_probe.h  2018-09-05 11:03:22.000000000 +0200
33215 +++ linux-4.14/kernel/trace/trace_probe.h       2018-09-05 11:05:07.000000000 +0200
33216 @@ -42,7 +42,6 @@
33217  
33218  #define MAX_TRACE_ARGS         128
33219  #define MAX_ARGSTR_LEN         63
33220 -#define MAX_EVENT_NAME_LEN     64
33221  #define MAX_STRING_SIZE                PATH_MAX
33222  
33223  /* Reserved field names */
33224 @@ -356,12 +355,6 @@
33225  
33226  extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
33227  
33228 -extern ssize_t traceprobe_probes_write(struct file *file,
33229 -               const char __user *buffer, size_t count, loff_t *ppos,
33230 -               int (*createfn)(int, char**));
33231 -
33232 -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
33233 -
33234  /* Sum up total data length for dynamic arrays (strings) */
33235  static nokprobe_inline int
33236  __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
33237 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/trace_uprobe.c linux-4.14/kernel/trace/trace_uprobe.c
33238 --- linux-4.14.orig/kernel/trace/trace_uprobe.c 2018-09-05 11:03:22.000000000 +0200
33239 +++ linux-4.14/kernel/trace/trace_uprobe.c      2018-09-05 11:05:07.000000000 +0200
33240 @@ -647,7 +647,7 @@
33241  static ssize_t probes_write(struct file *file, const char __user *buffer,
33242                             size_t count, loff_t *ppos)
33243  {
33244 -       return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
33245 +       return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
33246  }
33247  
33248  static const struct file_operations uprobe_events_ops = {
33249 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.c linux-4.14/kernel/trace/tracing_map.c
33250 --- linux-4.14.orig/kernel/trace/tracing_map.c  2017-11-12 19:46:13.000000000 +0100
33251 +++ linux-4.14/kernel/trace/tracing_map.c       2018-09-05 11:05:07.000000000 +0200
33252 @@ -66,6 +66,73 @@
33253         return (u64)atomic64_read(&elt->fields[i].sum);
33254  }
33255  
33256 +/**
33257 + * tracing_map_set_var - Assign a tracing_map_elt's variable field
33258 + * @elt: The tracing_map_elt
33259 + * @i: The index of the given variable associated with the tracing_map_elt
33260 + * @n: The value to assign
33261 + *
33262 + * Assign n to variable i associated with the specified tracing_map_elt
33263 + * instance.  The index i is the index returned by the call to
33264 + * tracing_map_add_var() when the tracing map was set up.
33265 + */
33266 +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
33267 +{
33268 +       atomic64_set(&elt->vars[i], n);
33269 +       elt->var_set[i] = true;
33270 +}
33271 +
33272 +/**
33273 + * tracing_map_var_set - Return whether or not a variable has been set
33274 + * @elt: The tracing_map_elt
33275 + * @i: The index of the given variable associated with the tracing_map_elt
33276 + *
33277 + * Return true if the variable has been set, false otherwise.  The
33278 + * index i is the index returned by the call to tracing_map_add_var()
33279 + * when the tracing map was set up.
33280 + */
33281 +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
33282 +{
33283 +       return elt->var_set[i];
33284 +}
33285 +
33286 +/**
33287 + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
33288 + * @elt: The tracing_map_elt
33289 + * @i: The index of the given variable associated with the tracing_map_elt
33290 + *
33291 + * Retrieve the value of the variable i associated with the specified
33292 + * tracing_map_elt instance.  The index i is the index returned by the
33293 + * call to tracing_map_add_var() when the tracing map was set
33294 + * up.
33295 + *
33296 + * Return: The variable value associated with field i for elt.
33297 + */
33298 +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
33299 +{
33300 +       return (u64)atomic64_read(&elt->vars[i]);
33301 +}
33302 +
33303 +/**
33304 + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
33305 + * @elt: The tracing_map_elt
33306 + * @i: The index of the given variable associated with the tracing_map_elt
33307 + *
33308 + * Retrieve the value of the variable i associated with the specified
33309 + * tracing_map_elt instance, and reset the variable to the 'not set'
33310 + * state.  The index i is the index returned by the call to
33311 + * tracing_map_add_var() when the tracing map was set up.  The reset
33312 + * essentially makes the variable a read-once variable if it's only
33313 + * accessed using this function.
33314 + *
33315 + * Return: The variable value associated with field i for elt.
33316 + */
33317 +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
33318 +{
33319 +       elt->var_set[i] = false;
33320 +       return (u64)atomic64_read(&elt->vars[i]);
33321 +}
33322 +
33323  int tracing_map_cmp_string(void *val_a, void *val_b)
33324  {
33325         char *a = val_a;
33326 @@ -171,6 +238,28 @@
33327  }
33328  
33329  /**
33330 + * tracing_map_add_var - Add a field describing a tracing_map var
33331 + * @map: The tracing_map
33332 + *
33333 + * Add a var to the map and return the index identifying it in the map
33334 + * and associated tracing_map_elts.  This is the index used for
33335 + * instance to update a var for a particular tracing_map_elt using
33336 + * tracing_map_update_var() or reading it via tracing_map_read_var().
33337 + *
33338 + * Return: The index identifying the var in the map and associated
33339 + * tracing_map_elts, or -EINVAL on error.
33340 + */
33341 +int tracing_map_add_var(struct tracing_map *map)
33342 +{
33343 +       int ret = -EINVAL;
33344 +
33345 +       if (map->n_vars < TRACING_MAP_VARS_MAX)
33346 +               ret = map->n_vars++;
33347 +
33348 +       return ret;
33349 +}
33350 +
33351 +/**
33352   * tracing_map_add_key_field - Add a field describing a tracing_map key
33353   * @map: The tracing_map
33354   * @offset: The offset within the key
33355 @@ -280,6 +369,11 @@
33356                 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
33357                         atomic64_set(&elt->fields[i].sum, 0);
33358  
33359 +       for (i = 0; i < elt->map->n_vars; i++) {
33360 +               atomic64_set(&elt->vars[i], 0);
33361 +               elt->var_set[i] = false;
33362 +       }
33363 +
33364         if (elt->map->ops && elt->map->ops->elt_clear)
33365                 elt->map->ops->elt_clear(elt);
33366  }
33367 @@ -306,6 +400,8 @@
33368         if (elt->map->ops && elt->map->ops->elt_free)
33369                 elt->map->ops->elt_free(elt);
33370         kfree(elt->fields);
33371 +       kfree(elt->vars);
33372 +       kfree(elt->var_set);
33373         kfree(elt->key);
33374         kfree(elt);
33375  }
33376 @@ -333,6 +429,18 @@
33377                 goto free;
33378         }
33379  
33380 +       elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
33381 +       if (!elt->vars) {
33382 +               err = -ENOMEM;
33383 +               goto free;
33384 +       }
33385 +
33386 +       elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
33387 +       if (!elt->var_set) {
33388 +               err = -ENOMEM;
33389 +               goto free;
33390 +       }
33391 +
33392         tracing_map_elt_init_fields(elt);
33393  
33394         if (map->ops && map->ops->elt_alloc) {
33395 @@ -414,7 +522,9 @@
33396  __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
33397  {
33398         u32 idx, key_hash, test_key;
33399 +       int dup_try = 0;
33400         struct tracing_map_entry *entry;
33401 +       struct tracing_map_elt *val;
33402  
33403         key_hash = jhash(key, map->key_size, 0);
33404         if (key_hash == 0)
33405 @@ -426,10 +536,33 @@
33406                 entry = TRACING_MAP_ENTRY(map->map, idx);
33407                 test_key = entry->key;
33408  
33409 -               if (test_key && test_key == key_hash && entry->val &&
33410 -                   keys_match(key, entry->val->key, map->key_size)) {
33411 -                       atomic64_inc(&map->hits);
33412 -                       return entry->val;
33413 +               if (test_key && test_key == key_hash) {
33414 +                       val = READ_ONCE(entry->val);
33415 +                       if (val &&
33416 +                           keys_match(key, val->key, map->key_size)) {
33417 +                               if (!lookup_only)
33418 +                                       atomic64_inc(&map->hits);
33419 +                               return val;
33420 +                       } else if (unlikely(!val)) {
33421 +                               /*
33422 +                                * The key is present, but val (the pointer
33423 +                                * to the elt struct) is still NULL, which
33424 +                                * means some other thread is in the middle
33425 +                                * of inserting an element.
33426 +                                *
33427 +                                * On top of that, its key_hash is the same
33428 +                                * as the one being inserted right now, so
33429 +                                * it's possible that the element has the
33430 +                                * same key as well.
33431 +                                */
33432 +
33433 +                               dup_try++;
33434 +                               if (dup_try > map->map_size) {
33435 +                                       atomic64_inc(&map->drops);
33436 +                                       break;
33437 +                               }
33438 +                               continue;
33439 +                       }
33440                 }
33441  
33442                 if (!test_key) {
33443 @@ -451,6 +584,13 @@
33444                                 atomic64_inc(&map->hits);
33445  
33446                                 return entry->val;
33447 +                       } else {
33448 +                               /*
33449 +                                * cmpxchg() failed. Loop around once
33450 +                                * more to check what key was inserted.
33451 +                                */
33452 +                               dup_try++;
33453 +                               continue;
33454                         }
33455                 }
33456  
33457 @@ -815,67 +955,15 @@
33458         return sort_entry;
33459  }
33460  
33461 -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
33462 -{
33463 -       struct tracing_map_elt *dup_elt;
33464 -       unsigned int i;
33465 -
33466 -       dup_elt = tracing_map_elt_alloc(elt->map);
33467 -       if (IS_ERR(dup_elt))
33468 -               return NULL;
33469 -
33470 -       if (elt->map->ops && elt->map->ops->elt_copy)
33471 -               elt->map->ops->elt_copy(dup_elt, elt);
33472 -
33473 -       dup_elt->private_data = elt->private_data;
33474 -       memcpy(dup_elt->key, elt->key, elt->map->key_size);
33475 -
33476 -       for (i = 0; i < elt->map->n_fields; i++) {
33477 -               atomic64_set(&dup_elt->fields[i].sum,
33478 -                            atomic64_read(&elt->fields[i].sum));
33479 -               dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
33480 -       }
33481 -
33482 -       return dup_elt;
33483 -}
33484 -
33485 -static int merge_dup(struct tracing_map_sort_entry **sort_entries,
33486 -                    unsigned int target, unsigned int dup)
33487 -{
33488 -       struct tracing_map_elt *target_elt, *elt;
33489 -       bool first_dup = (target - dup) == 1;
33490 -       int i;
33491 -
33492 -       if (first_dup) {
33493 -               elt = sort_entries[target]->elt;
33494 -               target_elt = copy_elt(elt);
33495 -               if (!target_elt)
33496 -                       return -ENOMEM;
33497 -               sort_entries[target]->elt = target_elt;
33498 -               sort_entries[target]->elt_copied = true;
33499 -       } else
33500 -               target_elt = sort_entries[target]->elt;
33501 -
33502 -       elt = sort_entries[dup]->elt;
33503 -
33504 -       for (i = 0; i < elt->map->n_fields; i++)
33505 -               atomic64_add(atomic64_read(&elt->fields[i].sum),
33506 -                            &target_elt->fields[i].sum);
33507 -
33508 -       sort_entries[dup]->dup = true;
33509 -
33510 -       return 0;
33511 -}
33512 -
33513 -static int merge_dups(struct tracing_map_sort_entry **sort_entries,
33514 +static void detect_dups(struct tracing_map_sort_entry **sort_entries,
33515                       int n_entries, unsigned int key_size)
33516  {
33517         unsigned int dups = 0, total_dups = 0;
33518 -       int err, i, j;
33519 +       int i;
33520         void *key;
33521  
33522         if (n_entries < 2)
33523 -               return total_dups;
33524 +               return;
33525  
33526         sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
33527              (int (*)(const void *, const void *))cmp_entries_dup, NULL);
33528 @@ -884,30 +972,14 @@
33529         for (i = 1; i < n_entries; i++) {
33530                 if (!memcmp(sort_entries[i]->key, key, key_size)) {
33531                         dups++; total_dups++;
33532 -                       err = merge_dup(sort_entries, i - dups, i);
33533 -                       if (err)
33534 -                               return err;
33535                         continue;
33536                 }
33537                 key = sort_entries[i]->key;
33538                 dups = 0;
33539         }
33540  
33541 -       if (!total_dups)
33542 -               return total_dups;
33543 -
33544 -       for (i = 0, j = 0; i < n_entries; i++) {
33545 -               if (!sort_entries[i]->dup) {
33546 -                       sort_entries[j] = sort_entries[i];
33547 -                       if (j++ != i)
33548 -                               sort_entries[i] = NULL;
33549 -               } else {
33550 -                       destroy_sort_entry(sort_entries[i]);
33551 -                       sort_entries[i] = NULL;
33552 -               }
33553 -       }
33554 -
33555 -       return total_dups;
33556 +       WARN_ONCE(total_dups > 0,
33557 +                 "Duplicates detected: %d\n", total_dups);
33558  }
33559  
33560  static bool is_key(struct tracing_map *map, unsigned int field_idx)
33561 @@ -1033,10 +1105,7 @@
33562                 return 1;
33563         }
33564  
33565 -       ret = merge_dups(entries, n_entries, map->key_size);
33566 -       if (ret < 0)
33567 -               goto free;
33568 -       n_entries -= ret;
33569 +       detect_dups(entries, n_entries, map->key_size);
33570  
33571         if (is_key(map, sort_keys[0].field_idx))
33572                 cmp_entries_fn = cmp_entries_key;
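The reworked __tracing_map_insert() above claims a slot by cmpxchg()ing the key hash in first and publishing entry->val afterwards; a reader that sees a matching hash with a still-NULL val knows another CPU is mid-insert and simply retries (bounded by dup_try), and a lost cmpxchg() likewise loops to re-check the slot. A minimal userspace sketch of that claim-then-publish pattern using C11 atomics -- lookup_or_insert() and the linear probing are invented simplifications, not the kernel code:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAP_SIZE 8

struct entry {
        _Atomic uint32_t key;           /* 0 == empty slot */
        _Atomic(void *)  val;           /* NULL until the claimer publishes it */
};

static struct entry map[MAP_SIZE];

/* Return the published value for key_hash, inserting new_val if the key
 * is absent; return NULL if a half-finished insert blocks us for too
 * long (the kernel code counts that case as a drop). */
static void *lookup_or_insert(uint32_t key_hash, void *new_val)
{
        uint32_t idx;
        int dup_try = 0, probes = 0;

        if (key_hash == 0)
                key_hash = 1;           /* 0 is reserved for "empty" */
        idx = key_hash % MAP_SIZE;

        while (probes < MAP_SIZE) {
                struct entry *e = &map[idx];
                uint32_t seen = atomic_load(&e->key);

                if (seen == key_hash) {
                        void *val = atomic_load(&e->val);

                        if (val)
                                return val;     /* fully inserted elsewhere */
                        if (++dup_try > MAP_SIZE)
                                return NULL;    /* give up: treat as a drop */
                        continue;               /* claimer not finished, retry */
                }

                if (seen == 0) {
                        uint32_t expected = 0;

                        if (atomic_compare_exchange_strong(&e->key, &expected,
                                                           key_hash)) {
                                atomic_store(&e->val, new_val); /* publish last */
                                return new_val;
                        }
                        dup_try++;
                        continue;               /* lost the race, re-check key */
                }

                idx = (idx + 1) % MAP_SIZE;     /* other key: probe next slot */
                probes++;
        }
        return NULL;                            /* table effectively full */
}

int main(void)
{
        int v = 42;

        printf("inserted at %p\n", lookup_or_insert(0xdeadbeefu, &v));
        printf("looked up   %p\n", lookup_or_insert(0xdeadbeefu, &v));
        return 0;
}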
33573 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/trace/tracing_map.h linux-4.14/kernel/trace/tracing_map.h
33574 --- linux-4.14.orig/kernel/trace/tracing_map.h  2017-11-12 19:46:13.000000000 +0100
33575 +++ linux-4.14/kernel/trace/tracing_map.h       2018-09-05 11:05:07.000000000 +0200
33576 @@ -6,10 +6,11 @@
33577  #define TRACING_MAP_BITS_MAX           17
33578  #define TRACING_MAP_BITS_MIN           7
33579  
33580 -#define TRACING_MAP_KEYS_MAX           2
33581 +#define TRACING_MAP_KEYS_MAX           3
33582  #define TRACING_MAP_VALS_MAX           3
33583  #define TRACING_MAP_FIELDS_MAX         (TRACING_MAP_KEYS_MAX + \
33584                                          TRACING_MAP_VALS_MAX)
33585 +#define TRACING_MAP_VARS_MAX           16
33586  #define TRACING_MAP_SORT_KEYS_MAX      2
33587  
33588  typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
33589 @@ -137,6 +138,8 @@
33590  struct tracing_map_elt {
33591         struct tracing_map              *map;
33592         struct tracing_map_field        *fields;
33593 +       atomic64_t                      *vars;
33594 +       bool                            *var_set;
33595         void                            *key;
33596         void                            *private_data;
33597  };
33598 @@ -192,6 +195,7 @@
33599         int                             key_idx[TRACING_MAP_KEYS_MAX];
33600         unsigned int                    n_keys;
33601         struct tracing_map_sort_key     sort_key;
33602 +       unsigned int                    n_vars;
33603         atomic64_t                      hits;
33604         atomic64_t                      drops;
33605  };
33606 @@ -215,11 +219,6 @@
33607   *     Element allocation occurs before tracing begins, when the
33608   *     tracing_map_init() call is made by client code.
33609   *
33610 - * @elt_copy: At certain points in the lifetime of an element, it may
33611 - *     need to be copied.  The copy should include a copy of the
33612 - *     client-allocated data, which can be copied into the 'to'
33613 - *     element from the 'from' element.
33614 - *
33615   * @elt_free: When a tracing_map_elt is freed, this function is called
33616   *     and allows client-allocated per-element data to be freed.
33617   *
33618 @@ -233,8 +232,6 @@
33619   */
33620  struct tracing_map_ops {
33621         int                     (*elt_alloc)(struct tracing_map_elt *elt);
33622 -       void                    (*elt_copy)(struct tracing_map_elt *to,
33623 -                                           struct tracing_map_elt *from);
33624         void                    (*elt_free)(struct tracing_map_elt *elt);
33625         void                    (*elt_clear)(struct tracing_map_elt *elt);
33626         void                    (*elt_init)(struct tracing_map_elt *elt);
33627 @@ -248,6 +245,7 @@
33628  extern int tracing_map_init(struct tracing_map *map);
33629  
33630  extern int tracing_map_add_sum_field(struct tracing_map *map);
33631 +extern int tracing_map_add_var(struct tracing_map *map);
33632  extern int tracing_map_add_key_field(struct tracing_map *map,
33633                                      unsigned int offset,
33634                                      tracing_map_cmp_fn_t cmp_fn);
33635 @@ -267,7 +265,13 @@
33636  
33637  extern void tracing_map_update_sum(struct tracing_map_elt *elt,
33638                                    unsigned int i, u64 n);
33639 +extern void tracing_map_set_var(struct tracing_map_elt *elt,
33640 +                               unsigned int i, u64 n);
33641 +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
33642  extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
33643 +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
33644 +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
33645 +
33646  extern void tracing_map_set_field_descr(struct tracing_map *map,
33647                                         unsigned int i,
33648                                         unsigned int key_offset,
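Taken together, the two tracing_map hunks above give every tracing_map_elt an array of variables with set/is-set/read accessors, where tracing_map_read_var_once() also clears the set flag so the value is consumed at most once. A hedged userspace sketch of just that set/read-once semantic, with invented names (var_slot, var_set, var_read_once) and C11 atomics standing in for the kernel's atomic64_t:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct var_slot {
        _Atomic uint64_t val;
        _Atomic bool     set;
};

static void var_set(struct var_slot *v, uint64_t n)
{
        atomic_store(&v->val, n);
        atomic_store(&v->set, true);
}

static bool var_is_set(struct var_slot *v)
{
        return atomic_load(&v->set);
}

/* Read the value and drop back to the "not set" state, so a later
 * reader sees it at most once -- mirroring tracing_map_read_var_once(). */
static uint64_t var_read_once(struct var_slot *v)
{
        atomic_store(&v->set, false);
        return atomic_load(&v->val);
}

int main(void)
{
        struct var_slot ts = { 0 };

        var_set(&ts, 123456789ULL);
        if (var_is_set(&ts))
                printf("first read: %llu\n",
                       (unsigned long long)var_read_once(&ts));
        printf("still set? %s\n", var_is_set(&ts) ? "yes" : "no");
        return 0;
}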
33649 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/user.c linux-4.14/kernel/user.c
33650 --- linux-4.14.orig/kernel/user.c       2017-11-12 19:46:13.000000000 +0100
33651 +++ linux-4.14/kernel/user.c    2018-09-05 11:05:07.000000000 +0200
33652 @@ -162,11 +162,11 @@
33653         if (!up)
33654                 return;
33655  
33656 -       local_irq_save(flags);
33657 +       local_irq_save_nort(flags);
33658         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
33659                 free_user(up, flags);
33660         else
33661 -               local_irq_restore(flags);
33662 +               local_irq_restore_nort(flags);
33663  }
33664  
33665  struct user_struct *alloc_uid(kuid_t uid)
33666 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog.c linux-4.14/kernel/watchdog.c
33667 --- linux-4.14.orig/kernel/watchdog.c   2017-11-12 19:46:13.000000000 +0100
33668 +++ linux-4.14/kernel/watchdog.c        2018-09-05 11:05:07.000000000 +0200
33669 @@ -462,7 +462,7 @@
33670          * Start the timer first to prevent the NMI watchdog triggering
33671          * before the timer has a chance to fire.
33672          */
33673 -       hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
33674 +       hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
33675         hrtimer->function = watchdog_timer_fn;
33676         hrtimer_start(hrtimer, ns_to_ktime(sample_period),
33677                       HRTIMER_MODE_REL_PINNED);
33678 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/watchdog_hld.c linux-4.14/kernel/watchdog_hld.c
33679 --- linux-4.14.orig/kernel/watchdog_hld.c       2017-11-12 19:46:13.000000000 +0100
33680 +++ linux-4.14/kernel/watchdog_hld.c    2018-09-05 11:05:07.000000000 +0200
33681 @@ -24,6 +24,8 @@
33682  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
33683  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
33684  static DEFINE_PER_CPU(struct perf_event *, dead_event);
33685 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
33686 +
33687  static struct cpumask dead_events_mask;
33688  
33689  static unsigned long hardlockup_allcpu_dumped;
33690 @@ -134,6 +136,13 @@
33691                 /* only print hardlockups once */
33692                 if (__this_cpu_read(hard_watchdog_warn) == true)
33693                         return;
33694 +               /*
33695 +                * If early-printk is enabled then make sure we do not
33696 +                * lock up in printk() and kill console logging:
33697 +                */
33698 +               printk_kill();
33699 +
33700 +               raw_spin_lock(&watchdog_output_lock);
33701  
33702                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
33703                 print_modules();
33704 @@ -151,6 +160,7 @@
33705                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
33706                         trigger_allbutself_cpu_backtrace();
33707  
33708 +               raw_spin_unlock(&watchdog_output_lock);
33709                 if (hardlockup_panic)
33710                         nmi_panic(regs, "Hard LOCKUP");
33711  
33712 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue.c linux-4.14/kernel/workqueue.c
33713 --- linux-4.14.orig/kernel/workqueue.c  2018-09-05 11:03:22.000000000 +0200
33714 +++ linux-4.14/kernel/workqueue.c       2018-09-05 11:05:07.000000000 +0200
33715 @@ -49,6 +49,8 @@
33716  #include <linux/moduleparam.h>
33717  #include <linux/uaccess.h>
33718  #include <linux/nmi.h>
33719 +#include <linux/locallock.h>
33720 +#include <linux/delay.h>
33721  
33722  #include "workqueue_internal.h"
33723  
33724 @@ -123,11 +125,16 @@
33725   *    cpu or grabbing pool->lock is enough for read access.  If
33726   *    POOL_DISASSOCIATED is set, it's identical to L.
33727   *
33728 + *    On RT we need the extra protection via rt_lock_idle_list() for
33729 + *    the list manipulations against read access from
33730 + *    wq_worker_sleeping(). All other places are nicely serialized via
33731 + *    pool->lock.
33732 + *
33733   * A: pool->attach_mutex protected.
33734   *
33735   * PL: wq_pool_mutex protected.
33736   *
33737 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
33738 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
33739   *
33740   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
33741   *
33742 @@ -136,7 +143,7 @@
33743   *
33744   * WQ: wq->mutex protected.
33745   *
33746 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
33747 + * WR: wq->mutex protected for writes.  RCU protected for reads.
33748   *
33749   * MD: wq_mayday_lock protected.
33750   */
33751 @@ -186,7 +193,7 @@
33752         atomic_t                nr_running ____cacheline_aligned_in_smp;
33753  
33754         /*
33755 -        * Destruction of pool is sched-RCU protected to allow dereferences
33756 +        * Destruction of pool is RCU protected to allow dereferences
33757          * from get_work_pool().
33758          */
33759         struct rcu_head         rcu;
33760 @@ -215,7 +222,7 @@
33761         /*
33762          * Release of unbound pwq is punted to system_wq.  See put_pwq()
33763          * and pwq_unbound_release_workfn() for details.  pool_workqueue
33764 -        * itself is also sched-RCU protected so that the first pwq can be
33765 +        * itself is also RCU protected so that the first pwq can be
33766          * determined without grabbing wq->mutex.
33767          */
33768         struct work_struct      unbound_release_work;
33769 @@ -352,6 +359,8 @@
33770  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
33771  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
33772  
33773 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
33774 +
33775  static int worker_thread(void *__worker);
33776  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
33777  
33778 @@ -359,20 +368,20 @@
33779  #include <trace/events/workqueue.h>
33780  
33781  #define assert_rcu_or_pool_mutex()                                     \
33782 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
33783 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
33784                          !lockdep_is_held(&wq_pool_mutex),              \
33785 -                        "sched RCU or wq_pool_mutex should be held")
33786 +                        "RCU or wq_pool_mutex should be held")
33787  
33788  #define assert_rcu_or_wq_mutex(wq)                                     \
33789 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
33790 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
33791                          !lockdep_is_held(&wq->mutex),                  \
33792 -                        "sched RCU or wq->mutex should be held")
33793 +                        "RCU or wq->mutex should be held")
33794  
33795  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
33796 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
33797 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
33798                          !lockdep_is_held(&wq->mutex) &&                \
33799                          !lockdep_is_held(&wq_pool_mutex),              \
33800 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
33801 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
33802  
33803  #define for_each_cpu_worker_pool(pool, cpu)                            \
33804         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
33805 @@ -384,7 +393,7 @@
33806   * @pool: iteration cursor
33807   * @pi: integer used for iteration
33808   *
33809 - * This must be called either with wq_pool_mutex held or sched RCU read
33810 + * This must be called either with wq_pool_mutex held or RCU read
33811   * locked.  If the pool needs to be used beyond the locking in effect, the
33812   * caller is responsible for guaranteeing that the pool stays online.
33813   *
33814 @@ -416,7 +425,7 @@
33815   * @pwq: iteration cursor
33816   * @wq: the target workqueue
33817   *
33818 - * This must be called either with wq->mutex held or sched RCU read locked.
33819 + * This must be called either with wq->mutex held or RCU read locked.
33820   * If the pwq needs to be used beyond the locking in effect, the caller is
33821   * responsible for guaranteeing that the pwq stays online.
33822   *
33823 @@ -428,6 +437,31 @@
33824                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
33825                 else
33826  
33827 +#ifdef CONFIG_PREEMPT_RT_BASE
33828 +static inline void rt_lock_idle_list(struct worker_pool *pool)
33829 +{
33830 +       preempt_disable();
33831 +}
33832 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
33833 +{
33834 +       preempt_enable();
33835 +}
33836 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
33837 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
33838 +#else
33839 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
33840 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
33841 +static inline void sched_lock_idle_list(struct worker_pool *pool)
33842 +{
33843 +       spin_lock_irq(&pool->lock);
33844 +}
33845 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
33846 +{
33847 +       spin_unlock_irq(&pool->lock);
33848 +}
33849 +#endif
33850 +
33851 +
33852  #ifdef CONFIG_DEBUG_OBJECTS_WORK
33853  
33854  static struct debug_obj_descr work_debug_descr;
33855 @@ -552,7 +586,7 @@
33856   * @wq: the target workqueue
33857   * @node: the node ID
33858   *
33859 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
33860 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
33861   * read locked.
33862   * If the pwq needs to be used beyond the locking in effect, the caller is
33863   * responsible for guaranteeing that the pwq stays online.
33864 @@ -696,8 +730,8 @@
33865   * @work: the work item of interest
33866   *
33867   * Pools are created and destroyed under wq_pool_mutex, and allows read
33868 - * access under sched-RCU read lock.  As such, this function should be
33869 - * called under wq_pool_mutex or with preemption disabled.
33870 + * access under RCU read lock.  As such, this function should be
33871 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
33872   *
33873   * All fields of the returned pool are accessible as long as the above
33874   * mentioned locking is in effect.  If the returned pool needs to be used
33875 @@ -834,50 +868,45 @@
33876   */
33877  static void wake_up_worker(struct worker_pool *pool)
33878  {
33879 -       struct worker *worker = first_idle_worker(pool);
33880 +       struct worker *worker;
33881 +
33882 +       rt_lock_idle_list(pool);
33883 +
33884 +       worker = first_idle_worker(pool);
33885  
33886         if (likely(worker))
33887                 wake_up_process(worker->task);
33888 +
33889 +       rt_unlock_idle_list(pool);
33890  }
33891  
33892  /**
33893 - * wq_worker_waking_up - a worker is waking up
33894 + * wq_worker_running - a worker is running again
33895   * @task: task waking up
33896 - * @cpu: CPU @task is waking up to
33897 - *
33898 - * This function is called during try_to_wake_up() when a worker is
33899 - * being awoken.
33900   *
33901 - * CONTEXT:
33902 - * spin_lock_irq(rq->lock)
33903 + * This function is called when a worker returns from schedule()
33904   */
33905 -void wq_worker_waking_up(struct task_struct *task, int cpu)
33906 +void wq_worker_running(struct task_struct *task)
33907  {
33908         struct worker *worker = kthread_data(task);
33909  
33910 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
33911 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
33912 +       if (!worker->sleeping)
33913 +               return;
33914 +       if (!(worker->flags & WORKER_NOT_RUNNING))
33915                 atomic_inc(&worker->pool->nr_running);
33916 -       }
33917 +       worker->sleeping = 0;
33918  }
33919  
33920  /**
33921   * wq_worker_sleeping - a worker is going to sleep
33922   * @task: task going to sleep
33923   *
33924 - * This function is called during schedule() when a busy worker is
33925 - * going to sleep.  Worker on the same cpu can be woken up by
33926 - * returning pointer to its task.
33927 - *
33928 - * CONTEXT:
33929 - * spin_lock_irq(rq->lock)
33930 - *
33931 - * Return:
33932 - * Worker task on @cpu to wake up, %NULL if none.
33933 + * This function is called from schedule() when a busy worker is
33934 + * going to sleep.
33935   */
33936 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
33937 +void wq_worker_sleeping(struct task_struct *task)
33938  {
33939 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
33940 +       struct worker *worker = kthread_data(task);
33941         struct worker_pool *pool;
33942  
33943         /*
33944 @@ -886,29 +915,26 @@
33945          * checking NOT_RUNNING.
33946          */
33947         if (worker->flags & WORKER_NOT_RUNNING)
33948 -               return NULL;
33949 +               return;
33950  
33951         pool = worker->pool;
33952  
33953 -       /* this can only happen on the local cpu */
33954 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
33955 -               return NULL;
33956 +       if (WARN_ON_ONCE(worker->sleeping))
33957 +               return;
33958 +
33959 +       worker->sleeping = 1;
33960  
33961         /*
33962          * The counterpart of the following dec_and_test, implied mb,
33963          * worklist not empty test sequence is in insert_work().
33964          * Please read comment there.
33965 -        *
33966 -        * NOT_RUNNING is clear.  This means that we're bound to and
33967 -        * running on the local cpu w/ rq lock held and preemption
33968 -        * disabled, which in turn means that none else could be
33969 -        * manipulating idle_list, so dereferencing idle_list without pool
33970 -        * lock is safe.
33971          */
33972         if (atomic_dec_and_test(&pool->nr_running) &&
33973 -           !list_empty(&pool->worklist))
33974 -               to_wakeup = first_idle_worker(pool);
33975 -       return to_wakeup ? to_wakeup->task : NULL;
33976 +           !list_empty(&pool->worklist)) {
33977 +               sched_lock_idle_list(pool);
33978 +               wake_up_worker(pool);
33979 +               sched_unlock_idle_list(pool);
33980 +       }
33981  }
33982  
33983  /**
33984 @@ -1102,12 +1128,14 @@
33985  {
33986         if (pwq) {
33987                 /*
33988 -                * As both pwqs and pools are sched-RCU protected, the
33989 +                * As both pwqs and pools are RCU protected, the
33990                  * following lock operations are safe.
33991                  */
33992 -               spin_lock_irq(&pwq->pool->lock);
33993 +               rcu_read_lock();
33994 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
33995                 put_pwq(pwq);
33996 -               spin_unlock_irq(&pwq->pool->lock);
33997 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
33998 +               rcu_read_unlock();
33999         }
34000  }
34001  
34002 @@ -1211,7 +1239,7 @@
34003         struct worker_pool *pool;
34004         struct pool_workqueue *pwq;
34005  
34006 -       local_irq_save(*flags);
34007 +       local_lock_irqsave(pendingb_lock, *flags);
34008  
34009         /* try to steal the timer if it exists */
34010         if (is_dwork) {
34011 @@ -1230,6 +1258,7 @@
34012         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34013                 return 0;
34014  
34015 +       rcu_read_lock();
34016         /*
34017          * The queueing is in progress, or it is already queued. Try to
34018          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34019 @@ -1268,14 +1297,16 @@
34020                 set_work_pool_and_keep_pending(work, pool->id);
34021  
34022                 spin_unlock(&pool->lock);
34023 +               rcu_read_unlock();
34024                 return 1;
34025         }
34026         spin_unlock(&pool->lock);
34027  fail:
34028 -       local_irq_restore(*flags);
34029 +       rcu_read_unlock();
34030 +       local_unlock_irqrestore(pendingb_lock, *flags);
34031         if (work_is_canceling(work))
34032                 return -ENOENT;
34033 -       cpu_relax();
34034 +       cpu_chill();
34035         return -EAGAIN;
34036  }
34037  
34038 @@ -1377,7 +1408,7 @@
34039          * queued or lose PENDING.  Grabbing PENDING and queueing should
34040          * happen with IRQ disabled.
34041          */
34042 -       WARN_ON_ONCE(!irqs_disabled());
34043 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
34044  
34045         debug_work_activate(work);
34046  
34047 @@ -1385,6 +1416,7 @@
34048         if (unlikely(wq->flags & __WQ_DRAINING) &&
34049             WARN_ON_ONCE(!is_chained_work(wq)))
34050                 return;
34051 +       rcu_read_lock();
34052  retry:
34053         if (req_cpu == WORK_CPU_UNBOUND)
34054                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
34055 @@ -1441,10 +1473,8 @@
34056         /* pwq determined, queue */
34057         trace_workqueue_queue_work(req_cpu, pwq, work);
34058  
34059 -       if (WARN_ON(!list_empty(&work->entry))) {
34060 -               spin_unlock(&pwq->pool->lock);
34061 -               return;
34062 -       }
34063 +       if (WARN_ON(!list_empty(&work->entry)))
34064 +               goto out;
34065  
34066         pwq->nr_in_flight[pwq->work_color]++;
34067         work_flags = work_color_to_flags(pwq->work_color);
34068 @@ -1462,7 +1492,9 @@
34069  
34070         insert_work(pwq, work, worklist, work_flags);
34071  
34072 +out:
34073         spin_unlock(&pwq->pool->lock);
34074 +       rcu_read_unlock();
34075  }
34076  
34077  /**
34078 @@ -1482,14 +1514,14 @@
34079         bool ret = false;
34080         unsigned long flags;
34081  
34082 -       local_irq_save(flags);
34083 +       local_lock_irqsave(pendingb_lock,flags);
34084  
34085         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34086                 __queue_work(cpu, wq, work);
34087                 ret = true;
34088         }
34089  
34090 -       local_irq_restore(flags);
34091 +       local_unlock_irqrestore(pendingb_lock, flags);
34092         return ret;
34093  }
34094  EXPORT_SYMBOL(queue_work_on);
34095 @@ -1498,8 +1530,11 @@
34096  {
34097         struct delayed_work *dwork = (struct delayed_work *)__data;
34098  
34099 +       /* XXX */
34100 +       /* local_lock(pendingb_lock); */
34101         /* should have been called from irqsafe timer with irq already off */
34102         __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34103 +       /* local_unlock(pendingb_lock); */
34104  }
34105  EXPORT_SYMBOL(delayed_work_timer_fn);
34106  
34107 @@ -1555,14 +1590,14 @@
34108         unsigned long flags;
34109  
34110         /* read the comment in __queue_work() */
34111 -       local_irq_save(flags);
34112 +       local_lock_irqsave(pendingb_lock, flags);
34113  
34114         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
34115                 __queue_delayed_work(cpu, wq, dwork, delay);
34116                 ret = true;
34117         }
34118  
34119 -       local_irq_restore(flags);
34120 +       local_unlock_irqrestore(pendingb_lock, flags);
34121         return ret;
34122  }
34123  EXPORT_SYMBOL(queue_delayed_work_on);
34124 @@ -1597,7 +1632,7 @@
34125  
34126         if (likely(ret >= 0)) {
34127                 __queue_delayed_work(cpu, wq, dwork, delay);
34128 -               local_irq_restore(flags);
34129 +               local_unlock_irqrestore(pendingb_lock, flags);
34130         }
34131  
34132         /* -ENOENT from try_to_grab_pending() becomes %true */
34133 @@ -1630,7 +1665,9 @@
34134         worker->last_active = jiffies;
34135  
34136         /* idle_list is LIFO */
34137 +       rt_lock_idle_list(pool);
34138         list_add(&worker->entry, &pool->idle_list);
34139 +       rt_unlock_idle_list(pool);
34140  
34141         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
34142                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
34143 @@ -1663,7 +1700,9 @@
34144                 return;
34145         worker_clr_flags(worker, WORKER_IDLE);
34146         pool->nr_idle--;
34147 +       rt_lock_idle_list(pool);
34148         list_del_init(&worker->entry);
34149 +       rt_unlock_idle_list(pool);
34150  }
34151  
34152  static struct worker *alloc_worker(int node)
34153 @@ -1829,7 +1868,9 @@
34154         pool->nr_workers--;
34155         pool->nr_idle--;
34156  
34157 +       rt_lock_idle_list(pool);
34158         list_del_init(&worker->entry);
34159 +       rt_unlock_idle_list(pool);
34160         worker->flags |= WORKER_DIE;
34161         wake_up_process(worker->task);
34162  }
34163 @@ -2815,14 +2856,14 @@
34164  
34165         might_sleep();
34166  
34167 -       local_irq_disable();
34168 +       rcu_read_lock();
34169         pool = get_work_pool(work);
34170         if (!pool) {
34171 -               local_irq_enable();
34172 +               rcu_read_unlock();
34173                 return false;
34174         }
34175  
34176 -       spin_lock(&pool->lock);
34177 +       spin_lock_irq(&pool->lock);
34178         /* see the comment in try_to_grab_pending() with the same code */
34179         pwq = get_work_pwq(work);
34180         if (pwq) {
34181 @@ -2853,10 +2894,11 @@
34182                 lock_map_acquire(&pwq->wq->lockdep_map);
34183                 lock_map_release(&pwq->wq->lockdep_map);
34184         }
34185 -
34186 +       rcu_read_unlock();
34187         return true;
34188  already_gone:
34189         spin_unlock_irq(&pool->lock);
34190 +       rcu_read_unlock();
34191         return false;
34192  }
34193  
34194 @@ -2946,7 +2988,7 @@
34195  
34196         /* tell other tasks trying to grab @work to back off */
34197         mark_work_canceling(work);
34198 -       local_irq_restore(flags);
34199 +       local_unlock_irqrestore(pendingb_lock, flags);
34200  
34201         /*
34202          * This allows canceling during early boot.  We know that @work
34203 @@ -3007,10 +3049,10 @@
34204   */
34205  bool flush_delayed_work(struct delayed_work *dwork)
34206  {
34207 -       local_irq_disable();
34208 +       local_lock_irq(pendingb_lock);
34209         if (del_timer_sync(&dwork->timer))
34210                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
34211 -       local_irq_enable();
34212 +       local_unlock_irq(pendingb_lock);
34213         return flush_work(&dwork->work);
34214  }
34215  EXPORT_SYMBOL(flush_delayed_work);
34216 @@ -3028,7 +3070,7 @@
34217                 return false;
34218  
34219         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
34220 -       local_irq_restore(flags);
34221 +       local_unlock_irqrestore(pendingb_lock, flags);
34222         return ret;
34223  }
34224  
34225 @@ -3284,7 +3326,7 @@
34226   * put_unbound_pool - put a worker_pool
34227   * @pool: worker_pool to put
34228   *
34229 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
34230 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
34231   * safe manner.  get_unbound_pool() calls this function on its failure path
34232   * and this function should be able to release pools which went through,
34233   * successfully or not, init_worker_pool().
34234 @@ -3338,8 +3380,8 @@
34235         del_timer_sync(&pool->idle_timer);
34236         del_timer_sync(&pool->mayday_timer);
34237  
34238 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
34239 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
34240 +       /* RCU protected to allow dereferences from get_work_pool() */
34241 +       call_rcu(&pool->rcu, rcu_free_pool);
34242  }
34243  
34244  /**
34245 @@ -3446,14 +3488,14 @@
34246         put_unbound_pool(pool);
34247         mutex_unlock(&wq_pool_mutex);
34248  
34249 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
34250 +       call_rcu(&pwq->rcu, rcu_free_pwq);
34251  
34252         /*
34253          * If we're the last pwq going away, @wq is already dead and no one
34254          * is gonna access it anymore.  Schedule RCU free.
34255          */
34256         if (is_last)
34257 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
34258 +               call_rcu(&wq->rcu, rcu_free_wq);
34259  }
34260  
34261  /**
34262 @@ -4128,7 +4170,7 @@
34263                  * The base ref is never dropped on per-cpu pwqs.  Directly
34264                  * schedule RCU free.
34265                  */
34266 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
34267 +               call_rcu(&wq->rcu, rcu_free_wq);
34268         } else {
34269                 /*
34270                  * We're the sole accessor of @wq at this point.  Directly
34271 @@ -4238,7 +4280,8 @@
34272         struct pool_workqueue *pwq;
34273         bool ret;
34274  
34275 -       rcu_read_lock_sched();
34276 +       rcu_read_lock();
34277 +       preempt_disable();
34278  
34279         if (cpu == WORK_CPU_UNBOUND)
34280                 cpu = smp_processor_id();
34281 @@ -4249,7 +4292,8 @@
34282                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
34283  
34284         ret = !list_empty(&pwq->delayed_works);
34285 -       rcu_read_unlock_sched();
34286 +       preempt_enable();
34287 +       rcu_read_unlock();
34288  
34289         return ret;
34290  }
34291 @@ -4275,15 +4319,15 @@
34292         if (work_pending(work))
34293                 ret |= WORK_BUSY_PENDING;
34294  
34295 -       local_irq_save(flags);
34296 +       rcu_read_lock();
34297         pool = get_work_pool(work);
34298         if (pool) {
34299 -               spin_lock(&pool->lock);
34300 +               spin_lock_irqsave(&pool->lock, flags);
34301                 if (find_worker_executing_work(pool, work))
34302                         ret |= WORK_BUSY_RUNNING;
34303 -               spin_unlock(&pool->lock);
34304 +               spin_unlock_irqrestore(&pool->lock, flags);
34305         }
34306 -       local_irq_restore(flags);
34307 +       rcu_read_unlock();
34308  
34309         return ret;
34310  }
34311 @@ -4472,7 +4516,7 @@
34312         unsigned long flags;
34313         int pi;
34314  
34315 -       rcu_read_lock_sched();
34316 +       rcu_read_lock();
34317  
34318         pr_info("Showing busy workqueues and worker pools:\n");
34319  
34320 @@ -4537,7 +4581,7 @@
34321                 touch_nmi_watchdog();
34322         }
34323  
34324 -       rcu_read_unlock_sched();
34325 +       rcu_read_unlock();
34326  }
34327  
34328  /*
34329 @@ -4898,16 +4942,16 @@
34330                  * nr_active is monotonically decreasing.  It's safe
34331                  * to peek without lock.
34332                  */
34333 -               rcu_read_lock_sched();
34334 +               rcu_read_lock();
34335                 for_each_pwq(pwq, wq) {
34336                         WARN_ON_ONCE(pwq->nr_active < 0);
34337                         if (pwq->nr_active) {
34338                                 busy = true;
34339 -                               rcu_read_unlock_sched();
34340 +                               rcu_read_unlock();
34341                                 goto out_unlock;
34342                         }
34343                 }
34344 -               rcu_read_unlock_sched();
34345 +               rcu_read_unlock();
34346         }
34347  out_unlock:
34348         mutex_unlock(&wq_pool_mutex);
34349 @@ -5097,7 +5141,8 @@
34350         const char *delim = "";
34351         int node, written = 0;
34352  
34353 -       rcu_read_lock_sched();
34354 +       get_online_cpus();
34355 +       rcu_read_lock();
34356         for_each_node(node) {
34357                 written += scnprintf(buf + written, PAGE_SIZE - written,
34358                                      "%s%d:%d", delim, node,
34359 @@ -5105,7 +5150,8 @@
34360                 delim = " ";
34361         }
34362         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
34363 -       rcu_read_unlock_sched();
34364 +       rcu_read_unlock();
34365 +       put_online_cpus();
34366  
34367         return written;
34368  }
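Throughout the workqueue.c hunks above, the PENDING-bit critical sections swap local_irq_save()/restore() for the named local lock pendingb_lock, so a PREEMPT_RT build can turn them into a per-CPU sleeping lock while a stock build still just disables interrupts. A rough userspace model of that dual mapping -- MODEL_RT, the pthread mutex and the two helper names are invented stand-ins, not the kernel's locallock implementation:

/* Build the RT-like variant with: cc -DMODEL_RT -pthread model.c */
#include <pthread.h>
#include <stdio.h>

#ifdef MODEL_RT
static pthread_mutex_t pendingb_lock = PTHREAD_MUTEX_INITIALIZER;

static void local_lock_irqsave(unsigned long *flags)
{
        (void)flags;
        pthread_mutex_lock(&pendingb_lock);     /* sleeping lock, preemptible */
}

static void local_unlock_irqrestore(unsigned long *flags)
{
        (void)flags;
        pthread_mutex_unlock(&pendingb_lock);
}
#else
static void local_lock_irqsave(unsigned long *flags)
{
        *flags = 1;     /* stands in for local_irq_save() */
}

static void local_unlock_irqrestore(unsigned long *flags)
{
        (void)flags;    /* stands in for local_irq_restore() */
}
#endif

int main(void)
{
        unsigned long flags;

        local_lock_irqsave(&flags);
        puts("PENDING bit manipulation happens here");
        local_unlock_irqrestore(&flags);
        return 0;
}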
34369 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/kernel/workqueue_internal.h linux-4.14/kernel/workqueue_internal.h
34370 --- linux-4.14.orig/kernel/workqueue_internal.h 2017-11-12 19:46:13.000000000 +0100
34371 +++ linux-4.14/kernel/workqueue_internal.h      2018-09-05 11:05:07.000000000 +0200
34372 @@ -45,6 +45,7 @@
34373         unsigned long           last_active;    /* L: last active timestamp */
34374         unsigned int            flags;          /* X: flags */
34375         int                     id;             /* I: worker id */
34376 +       int                     sleeping;       /* None */
34377  
34378         /*
34379          * Opaque string set with work_set_desc().  Printed out with task
34380 @@ -70,7 +71,7 @@
34381   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
34382   * sched/core.c and workqueue.c.
34383   */
34384 -void wq_worker_waking_up(struct task_struct *task, int cpu);
34385 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
34386 +void wq_worker_running(struct task_struct *task);
34387 +void wq_worker_sleeping(struct task_struct *task);
34388  
34389  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
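With the hook change above, the scheduler no longer asks wq_worker_sleeping() for a task to wake under the runqueue lock; the worker marks itself sleeping, drops the pool's nr_running count and wakes an idle worker itself, and wq_worker_running() re-takes the count when it resumes. A simplified userspace sketch of that accounting (pool, worker and the helper names are invented; atomic_int stands in for the pool's atomic counter):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
        atomic_int nr_running;
        bool       worklist_nonempty;
};

struct worker {
        struct pool *pool;
        int          sleeping;
};

static void wake_up_one_idle_worker(struct pool *pool)
{
        (void)pool;
        puts("waking an idle worker");
}

/* Called when the worker blocks: hand its concurrency slot to another
 * worker if there is still work queued. */
static void worker_sleeping(struct worker *w)
{
        if (w->sleeping)
                return;                 /* already accounted for */
        w->sleeping = 1;
        if (atomic_fetch_sub(&w->pool->nr_running, 1) == 1 &&
            w->pool->worklist_nonempty)
                wake_up_one_idle_worker(w->pool);
}

/* Called when the worker returns from schedule(): take the slot back. */
static void worker_running(struct worker *w)
{
        if (!w->sleeping)
                return;
        atomic_fetch_add(&w->pool->nr_running, 1);
        w->sleeping = 0;
}

int main(void)
{
        struct pool p = { 1, true };
        struct worker w = { &p, 0 };

        worker_sleeping(&w);
        worker_running(&w);
        printf("nr_running = %d\n", atomic_load(&p.nr_running));
        return 0;
}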
34390 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/debugobjects.c linux-4.14/lib/debugobjects.c
34391 --- linux-4.14.orig/lib/debugobjects.c  2017-11-12 19:46:13.000000000 +0100
34392 +++ linux-4.14/lib/debugobjects.c       2018-09-05 11:05:07.000000000 +0200
34393 @@ -336,7 +336,10 @@
34394         struct debug_obj *obj;
34395         unsigned long flags;
34396  
34397 -       fill_pool();
34398 +#ifdef CONFIG_PREEMPT_RT_FULL
34399 +       if (preempt_count() == 0 && !irqs_disabled())
34400 +#endif
34401 +               fill_pool();
34402  
34403         db = get_bucket((unsigned long) addr);
34404  
34405 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/irq_poll.c linux-4.14/lib/irq_poll.c
34406 --- linux-4.14.orig/lib/irq_poll.c      2017-11-12 19:46:13.000000000 +0100
34407 +++ linux-4.14/lib/irq_poll.c   2018-09-05 11:05:07.000000000 +0200
34408 @@ -37,6 +37,7 @@
34409         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
34410         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34411         local_irq_restore(flags);
34412 +       preempt_check_resched_rt();
34413  }
34414  EXPORT_SYMBOL(irq_poll_sched);
34415  
34416 @@ -72,6 +73,7 @@
34417         local_irq_save(flags);
34418         __irq_poll_complete(iop);
34419         local_irq_restore(flags);
34420 +       preempt_check_resched_rt();
34421  }
34422  EXPORT_SYMBOL(irq_poll_complete);
34423  
34424 @@ -96,6 +98,7 @@
34425                 }
34426  
34427                 local_irq_enable();
34428 +               preempt_check_resched_rt();
34429  
34430                 /* Even though interrupts have been re-enabled, this
34431                  * access is safe because interrupts can only add new
34432 @@ -133,6 +136,7 @@
34433                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34434  
34435         local_irq_enable();
34436 +       preempt_check_resched_rt();
34437  }
34438  
34439  /**
34440 @@ -196,6 +200,7 @@
34441                          this_cpu_ptr(&blk_cpu_iopoll));
34442         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
34443         local_irq_enable();
34444 +       preempt_check_resched_rt();
34445  
34446         return 0;
34447  }
34448 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig linux-4.14/lib/Kconfig
34449 --- linux-4.14.orig/lib/Kconfig 2017-11-12 19:46:13.000000000 +0100
34450 +++ linux-4.14/lib/Kconfig      2018-09-05 11:05:07.000000000 +0200
34451 @@ -428,6 +428,7 @@
34452  
34453  config CPUMASK_OFFSTACK
34454         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
34455 +       depends on !PREEMPT_RT_FULL
34456         help
34457           Use dynamic allocation for cpumask_var_t, instead of putting
34458           them on the stack.  This is a bit more expensive, but avoids
34459 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/Kconfig.debug linux-4.14/lib/Kconfig.debug
34460 --- linux-4.14.orig/lib/Kconfig.debug   2018-09-05 11:03:22.000000000 +0200
34461 +++ linux-4.14/lib/Kconfig.debug        2018-09-05 11:05:07.000000000 +0200
34462 @@ -1197,7 +1197,7 @@
34463  
34464  config DEBUG_LOCKING_API_SELFTESTS
34465         bool "Locking API boot-time self-tests"
34466 -       depends on DEBUG_KERNEL
34467 +       depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
34468         help
34469           Say Y here if you want the kernel to run a short self-test during
34470           bootup. The self-test checks whether common types of locking bugs
34471 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/locking-selftest.c linux-4.14/lib/locking-selftest.c
34472 --- linux-4.14.orig/lib/locking-selftest.c      2017-11-12 19:46:13.000000000 +0100
34473 +++ linux-4.14/lib/locking-selftest.c   2018-09-05 11:05:07.000000000 +0200
34474 @@ -742,6 +742,8 @@
34475  #include "locking-selftest-spin-hardirq.h"
34476  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
34477  
34478 +#ifndef CONFIG_PREEMPT_RT_FULL
34479 +
34480  #include "locking-selftest-rlock-hardirq.h"
34481  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
34482  
34483 @@ -757,9 +759,12 @@
34484  #include "locking-selftest-wlock-softirq.h"
34485  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
34486  
34487 +#endif
34488 +
34489  #undef E1
34490  #undef E2
34491  
34492 +#ifndef CONFIG_PREEMPT_RT_FULL
34493  /*
34494   * Enabling hardirqs with a softirq-safe lock held:
34495   */
34496 @@ -792,6 +797,8 @@
34497  #undef E1
34498  #undef E2
34499  
34500 +#endif
34501 +
34502  /*
34503   * Enabling irqs with an irq-safe lock held:
34504   */
34505 @@ -815,6 +822,8 @@
34506  #include "locking-selftest-spin-hardirq.h"
34507  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
34508  
34509 +#ifndef CONFIG_PREEMPT_RT_FULL
34510 +
34511  #include "locking-selftest-rlock-hardirq.h"
34512  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
34513  
34514 @@ -830,6 +839,8 @@
34515  #include "locking-selftest-wlock-softirq.h"
34516  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
34517  
34518 +#endif
34519 +
34520  #undef E1
34521  #undef E2
34522  
34523 @@ -861,6 +872,8 @@
34524  #include "locking-selftest-spin-hardirq.h"
34525  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
34526  
34527 +#ifndef CONFIG_PREEMPT_RT_FULL
34528 +
34529  #include "locking-selftest-rlock-hardirq.h"
34530  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
34531  
34532 @@ -876,6 +889,8 @@
34533  #include "locking-selftest-wlock-softirq.h"
34534  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
34535  
34536 +#endif
34537 +
34538  #undef E1
34539  #undef E2
34540  #undef E3
34541 @@ -909,6 +924,8 @@
34542  #include "locking-selftest-spin-hardirq.h"
34543  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
34544  
34545 +#ifndef CONFIG_PREEMPT_RT_FULL
34546 +
34547  #include "locking-selftest-rlock-hardirq.h"
34548  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
34549  
34550 @@ -924,10 +941,14 @@
34551  #include "locking-selftest-wlock-softirq.h"
34552  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
34553  
34554 +#endif
34555 +
34556  #undef E1
34557  #undef E2
34558  #undef E3
34559  
34560 +#ifndef CONFIG_PREEMPT_RT_FULL
34561 +
34562  /*
34563   * read-lock / write-lock irq inversion.
34564   *
34565 @@ -990,6 +1011,10 @@
34566  #undef E2
34567  #undef E3
34568  
34569 +#endif
34570 +
34571 +#ifndef CONFIG_PREEMPT_RT_FULL
34572 +
34573  /*
34574   * read-lock / write-lock recursion that is actually safe.
34575   */
34576 @@ -1028,6 +1053,8 @@
34577  #undef E2
34578  #undef E3
34579  
34580 +#endif
34581 +
34582  /*
34583   * read-lock / write-lock recursion that is unsafe.
34584   */
34585 @@ -2057,6 +2084,7 @@
34586  
34587         printk("  --------------------------------------------------------------------------\n");
34588  
34589 +#ifndef CONFIG_PREEMPT_RT_FULL
34590         /*
34591          * irq-context testcases:
34592          */
34593 @@ -2069,6 +2097,28 @@
34594  
34595         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
34596  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
34597 +#else
34598 +       /* On -rt, we only do hardirq context test for raw spinlock */
34599 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
34600 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
34601 +
34602 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
34603 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
34604 +
34605 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
34606 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
34607 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
34608 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
34609 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
34610 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
34611 +
34612 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
34613 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
34614 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
34615 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
34616 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
34617 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
34618 +#endif
34619  
34620         ww_tests();
34621  
34622 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/percpu_ida.c linux-4.14/lib/percpu_ida.c
34623 --- linux-4.14.orig/lib/percpu_ida.c    2017-11-12 19:46:13.000000000 +0100
34624 +++ linux-4.14/lib/percpu_ida.c 2018-09-05 11:05:07.000000000 +0200
34625 @@ -27,6 +27,9 @@
34626  #include <linux/string.h>
34627  #include <linux/spinlock.h>
34628  #include <linux/percpu_ida.h>
34629 +#include <linux/locallock.h>
34630 +
34631 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
34632  
34633  struct percpu_ida_cpu {
34634         /*
34635 @@ -149,13 +152,13 @@
34636         unsigned long flags;
34637         int tag;
34638  
34639 -       local_irq_save(flags);
34640 +       local_lock_irqsave(irq_off_lock, flags);
34641         tags = this_cpu_ptr(pool->tag_cpu);
34642  
34643         /* Fastpath */
34644         tag = alloc_local_tag(tags);
34645         if (likely(tag >= 0)) {
34646 -               local_irq_restore(flags);
34647 +               local_unlock_irqrestore(irq_off_lock, flags);
34648                 return tag;
34649         }
34650  
34651 @@ -174,6 +177,7 @@
34652  
34653                 if (!tags->nr_free)
34654                         alloc_global_tags(pool, tags);
34655 +
34656                 if (!tags->nr_free)
34657                         steal_tags(pool, tags);
34658  
34659 @@ -185,7 +189,7 @@
34660                 }
34661  
34662                 spin_unlock(&pool->lock);
34663 -               local_irq_restore(flags);
34664 +               local_unlock_irqrestore(irq_off_lock, flags);
34665  
34666                 if (tag >= 0 || state == TASK_RUNNING)
34667                         break;
34668 @@ -197,7 +201,7 @@
34669  
34670                 schedule();
34671  
34672 -               local_irq_save(flags);
34673 +               local_lock_irqsave(irq_off_lock, flags);
34674                 tags = this_cpu_ptr(pool->tag_cpu);
34675         }
34676         if (state != TASK_RUNNING)
34677 @@ -222,7 +226,7 @@
34678  
34679         BUG_ON(tag >= pool->nr_tags);
34680  
34681 -       local_irq_save(flags);
34682 +       local_lock_irqsave(irq_off_lock, flags);
34683         tags = this_cpu_ptr(pool->tag_cpu);
34684  
34685         spin_lock(&tags->lock);
34686 @@ -254,7 +258,7 @@
34687                 spin_unlock(&pool->lock);
34688         }
34689  
34690 -       local_irq_restore(flags);
34691 +       local_unlock_irqrestore(irq_off_lock, flags);
34692  }
34693  EXPORT_SYMBOL_GPL(percpu_ida_free);
34694  
34695 @@ -346,7 +350,7 @@
34696         struct percpu_ida_cpu *remote;
34697         unsigned cpu, i, err = 0;
34698  
34699 -       local_irq_save(flags);
34700 +       local_lock_irqsave(irq_off_lock, flags);
34701         for_each_possible_cpu(cpu) {
34702                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
34703                 spin_lock(&remote->lock);
34704 @@ -368,7 +372,7 @@
34705         }
34706         spin_unlock(&pool->lock);
34707  out:
34708 -       local_irq_restore(flags);
34709 +       local_unlock_irqrestore(irq_off_lock, flags);
34710         return err;
34711  }
34712  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
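The percpu_ida hunks are a straight conversion from local_irq_save()/local_irq_restore() to the locallock primitives this series introduces (hence the #include <linux/locallock.h> above): on a non-RT build local_lock_irqsave() still disables interrupts, while on PREEMPT_RT it takes a per-CPU sleeping lock and disables migration, so the section stays preemptible. A minimal sketch of the same pattern on hypothetical per-CPU data (the lock and counter names are illustrative):

#include <linux/percpu.h>
#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(stats_lock);        /* hypothetical local lock */
static DEFINE_PER_CPU(unsigned long, stats_count);

static void stats_inc(void)
{
	unsigned long flags;

	/* irqs off on !RT; per-CPU rtmutex + migrate_disable() on RT */
	local_lock_irqsave(stats_lock, flags);
	__this_cpu_inc(stats_count);
	local_unlock_irqrestore(stats_lock, flags);
}
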
34713 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/radix-tree.c linux-4.14/lib/radix-tree.c
34714 --- linux-4.14.orig/lib/radix-tree.c    2018-09-05 11:03:25.000000000 +0200
34715 +++ linux-4.14/lib/radix-tree.c 2018-09-05 11:05:07.000000000 +0200
34716 @@ -37,7 +37,7 @@
34717  #include <linux/rcupdate.h>
34718  #include <linux/slab.h>
34719  #include <linux/string.h>
34720 -
34721 +#include <linux/locallock.h>
34722  
34723  /* Number of nodes in fully populated tree of given height */
34724  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
34725 @@ -86,6 +86,7 @@
34726         struct radix_tree_node *nodes;
34727  };
34728  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
34729 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
34730  
34731  static inline struct radix_tree_node *entry_to_node(void *ptr)
34732  {
34733 @@ -404,12 +405,13 @@
34734                  * succeed in getting a node here (and never reach
34735                  * kmem_cache_alloc)
34736                  */
34737 -               rtp = this_cpu_ptr(&radix_tree_preloads);
34738 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34739                 if (rtp->nr) {
34740                         ret = rtp->nodes;
34741                         rtp->nodes = ret->parent;
34742                         rtp->nr--;
34743                 }
34744 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
34745                 /*
34746                  * Update the allocation stack trace as this is more useful
34747                  * for debugging.
34748 @@ -475,14 +477,14 @@
34749          */
34750         gfp_mask &= ~__GFP_ACCOUNT;
34751  
34752 -       preempt_disable();
34753 +       local_lock(radix_tree_preloads_lock);
34754         rtp = this_cpu_ptr(&radix_tree_preloads);
34755         while (rtp->nr < nr) {
34756 -               preempt_enable();
34757 +               local_unlock(radix_tree_preloads_lock);
34758                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
34759                 if (node == NULL)
34760                         goto out;
34761 -               preempt_disable();
34762 +               local_lock(radix_tree_preloads_lock);
34763                 rtp = this_cpu_ptr(&radix_tree_preloads);
34764                 if (rtp->nr < nr) {
34765                         node->parent = rtp->nodes;
34766 @@ -524,7 +526,7 @@
34767         if (gfpflags_allow_blocking(gfp_mask))
34768                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
34769         /* Preloading doesn't help anything with this gfp mask, skip it */
34770 -       preempt_disable();
34771 +       local_lock(radix_tree_preloads_lock);
34772         return 0;
34773  }
34774  EXPORT_SYMBOL(radix_tree_maybe_preload);
34775 @@ -562,7 +564,7 @@
34776  
34777         /* Preloading doesn't help anything with this gfp mask, skip it */
34778         if (!gfpflags_allow_blocking(gfp_mask)) {
34779 -               preempt_disable();
34780 +               local_lock(radix_tree_preloads_lock);
34781                 return 0;
34782         }
34783  
34784 @@ -596,6 +598,12 @@
34785         return __radix_tree_preload(gfp_mask, nr_nodes);
34786  }
34787  
34788 +void radix_tree_preload_end(void)
34789 +{
34790 +       local_unlock(radix_tree_preloads_lock);
34791 +}
34792 +EXPORT_SYMBOL(radix_tree_preload_end);
34793 +
34794  static unsigned radix_tree_load_root(const struct radix_tree_root *root,
34795                 struct radix_tree_node **nodep, unsigned long *maxindex)
34796  {
34797 @@ -2105,10 +2113,16 @@
34798  void idr_preload(gfp_t gfp_mask)
34799  {
34800         if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
34801 -               preempt_disable();
34802 +               local_lock(radix_tree_preloads_lock);
34803  }
34804  EXPORT_SYMBOL(idr_preload);
34805  
34806 +void idr_preload_end(void)
34807 +{
34808 +       local_unlock(radix_tree_preloads_lock);
34809 +}
34810 +EXPORT_SYMBOL(idr_preload_end);
34811 +
34812  /**
34813   * ida_pre_get - reserve resources for ida allocation
34814   * @ida: ida handle
34815 @@ -2125,7 +2139,7 @@
34816          * to return to the ida_pre_get() step.
34817          */
34818         if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
34819 -               preempt_enable();
34820 +               local_unlock(radix_tree_preloads_lock);
34821  
34822         if (!this_cpu_read(ida_bitmap)) {
34823                 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
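In the radix-tree hunks the preempt_disable()-based preload protection becomes the radix_tree_preloads_lock local lock, and radix_tree_preload_end()/idr_preload_end() turn into exported functions that release it, so existing preload()/preload_end() pairs keep working unchanged. A hedged usage sketch; the tree, its external lock and the insert helper are hypothetical, while the preload API is the stock one:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(my_tree, GFP_ATOMIC);          /* hypothetical tree */
static DEFINE_SPINLOCK(my_tree_lock);            /* hypothetical external lock */

static int my_insert(unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);    /* may sleep, fills the per-CPU node pool */
	if (err)
		return err;

	spin_lock(&my_tree_lock);
	err = radix_tree_insert(&my_tree, index, item);
	spin_unlock(&my_tree_lock);

	/* with this patch: drops radix_tree_preloads_lock instead of preempt_enable() */
	radix_tree_preload_end();
	return err;
}
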
34824 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/scatterlist.c linux-4.14/lib/scatterlist.c
34825 --- linux-4.14.orig/lib/scatterlist.c   2017-11-12 19:46:13.000000000 +0100
34826 +++ linux-4.14/lib/scatterlist.c        2018-09-05 11:05:07.000000000 +0200
34827 @@ -620,7 +620,7 @@
34828                         flush_kernel_dcache_page(miter->page);
34829  
34830                 if (miter->__flags & SG_MITER_ATOMIC) {
34831 -                       WARN_ON_ONCE(preemptible());
34832 +                       WARN_ON_ONCE(!pagefault_disabled());
34833                         kunmap_atomic(miter->addr);
34834                 } else
34835                         kunmap(miter->page);
34836 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/smp_processor_id.c linux-4.14/lib/smp_processor_id.c
34837 --- linux-4.14.orig/lib/smp_processor_id.c      2017-11-12 19:46:13.000000000 +0100
34838 +++ linux-4.14/lib/smp_processor_id.c   2018-09-05 11:05:07.000000000 +0200
34839 @@ -23,7 +23,7 @@
34840          * Kernel threads bound to a single CPU can safely use
34841          * smp_processor_id():
34842          */
34843 -       if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
34844 +       if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
34845                 goto out;
34846  
34847         /*
34848 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/lib/timerqueue.c linux-4.14/lib/timerqueue.c
34849 --- linux-4.14.orig/lib/timerqueue.c    2017-11-12 19:46:13.000000000 +0100
34850 +++ linux-4.14/lib/timerqueue.c 2018-09-05 11:05:07.000000000 +0200
34851 @@ -33,8 +33,9 @@
34852   * @head: head of timerqueue
34853   * @node: timer node to be added
34854   *
34855 - * Adds the timer node to the timerqueue, sorted by the
34856 - * node's expires value.
34857 + * Adds the timer node to the timerqueue, sorted by the node's expires
34858 + * value. Returns true if the newly added timer is the first expiring timer in
34859 + * the queue.
34860   */
34861  bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
34862  {
34863 @@ -70,7 +71,8 @@
34864   * @head: head of timerqueue
34865   * @node: timer node to be removed
34866   *
34867 - * Removes the timer node from the timerqueue.
34868 + * Removes the timer node from the timerqueue. Returns true if the queue is
34869 + * not empty after the remove.
34870   */
34871  bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
34872  {
34873 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/localversion-rt linux-4.14/localversion-rt
34874 --- linux-4.14.orig/localversion-rt     1970-01-01 01:00:00.000000000 +0100
34875 +++ linux-4.14/localversion-rt  2018-09-05 11:05:07.000000000 +0200
34876 @@ -0,0 +1 @@
34877 +-rt40
34878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/backing-dev.c linux-4.14/mm/backing-dev.c
34879 --- linux-4.14.orig/mm/backing-dev.c    2018-09-05 11:03:25.000000000 +0200
34880 +++ linux-4.14/mm/backing-dev.c 2018-09-05 11:05:07.000000000 +0200
34881 @@ -470,9 +470,9 @@
34882  {
34883         unsigned long flags;
34884  
34885 -       local_irq_save(flags);
34886 +       local_irq_save_nort(flags);
34887         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
34888 -               local_irq_restore(flags);
34889 +               local_irq_restore_nort(flags);
34890                 return;
34891         }
34892  
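The backing-dev hunk switches to the *_nort helpers provided by the -rt series: local_irq_save_nort() behaves like local_irq_save() on a non-RT build and is a no-op on PREEMPT_RT, where interrupt handlers run in threads and the lock taken afterwards (cgwb_lock here, a sleeping lock on RT) already provides the needed exclusion. A minimal sketch of the same put-path shape on a hypothetical refcounted object:

#include <linux/atomic.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(obj_list_lock);           /* hypothetical list lock */

static void obj_put(atomic_t *refcnt)
{
	unsigned long flags;

	local_irq_save_nort(flags);              /* irqs off only on !RT */
	if (!atomic_dec_and_lock(refcnt, &obj_list_lock)) {
		local_irq_restore_nort(flags);
		return;
	}

	/* last reference dropped: tear the object down under obj_list_lock */
	spin_unlock(&obj_list_lock);
	local_irq_restore_nort(flags);
}
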
34893 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/compaction.c linux-4.14/mm/compaction.c
34894 --- linux-4.14.orig/mm/compaction.c     2017-11-12 19:46:13.000000000 +0100
34895 +++ linux-4.14/mm/compaction.c  2018-09-05 11:05:07.000000000 +0200
34896 @@ -1634,10 +1634,12 @@
34897                                 block_start_pfn(cc->migrate_pfn, cc->order);
34898  
34899                         if (cc->last_migrated_pfn < current_block_start) {
34900 -                               cpu = get_cpu();
34901 +                               cpu = get_cpu_light();
34902 +                               local_lock_irq(swapvec_lock);
34903                                 lru_add_drain_cpu(cpu);
34904 +                               local_unlock_irq(swapvec_lock);
34905                                 drain_local_pages(zone);
34906 -                               put_cpu();
34907 +                               put_cpu_light();
34908                                 /* No more flushing until we migrate again */
34909                                 cc->last_migrated_pfn = 0;
34910                         }
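get_cpu_light()/put_cpu_light() are the -rt replacements for get_cpu()/put_cpu(): on PREEMPT_RT they only disable migration, so the task stays on its CPU yet remains preemptible, and the per-CPU pagevecs drained by lru_add_drain_cpu() are instead serialized by swapvec_lock, the local lock this patch adds in mm/swap.c further below. The resulting shape, as an illustrative fragment (the wrapper function is hypothetical):

static void my_drain_this_cpu(void)
{
	int cpu;

	cpu = get_cpu_light();           /* migrate_disable() on RT, preempt_disable() otherwise */
	local_lock_irq(swapvec_lock);    /* protects this CPU's LRU pagevecs */
	lru_add_drain_cpu(cpu);
	local_unlock_irq(swapvec_lock);
	put_cpu_light();
}
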
34911 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/filemap.c linux-4.14/mm/filemap.c
34912 --- linux-4.14.orig/mm/filemap.c        2018-09-05 11:03:28.000000000 +0200
34913 +++ linux-4.14/mm/filemap.c     2018-09-05 11:05:07.000000000 +0200
34914 @@ -110,6 +110,7 @@
34915   * ->i_mmap_rwsem
34916   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
34917   */
34918 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
34919  
34920  static int page_cache_tree_insert(struct address_space *mapping,
34921                                   struct page *page, void **shadowp)
34922 @@ -133,8 +134,10 @@
34923                 if (shadowp)
34924                         *shadowp = p;
34925         }
34926 +       local_lock(shadow_nodes_lock);
34927         __radix_tree_replace(&mapping->page_tree, node, slot, page,
34928 -                            workingset_update_node, mapping);
34929 +                            __workingset_update_node, mapping);
34930 +       local_unlock(shadow_nodes_lock);
34931         mapping->nrpages++;
34932         return 0;
34933  }
34934 @@ -151,6 +154,7 @@
34935         VM_BUG_ON_PAGE(PageTail(page), page);
34936         VM_BUG_ON_PAGE(nr != 1 && shadow, page);
34937  
34938 +       local_lock(shadow_nodes_lock);
34939         for (i = 0; i < nr; i++) {
34940                 struct radix_tree_node *node;
34941                 void **slot;
34942 @@ -162,8 +166,9 @@
34943  
34944                 radix_tree_clear_tags(&mapping->page_tree, node, slot);
34945                 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
34946 -                                    workingset_update_node, mapping);
34947 +                                    __workingset_update_node, mapping);
34948         }
34949 +       local_unlock(shadow_nodes_lock);
34950  
34951         if (shadow) {
34952                 mapping->nrexceptional += nr;
34953 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/highmem.c linux-4.14/mm/highmem.c
34954 --- linux-4.14.orig/mm/highmem.c        2017-11-12 19:46:13.000000000 +0100
34955 +++ linux-4.14/mm/highmem.c     2018-09-05 11:05:07.000000000 +0200
34956 @@ -30,10 +30,11 @@
34957  #include <linux/kgdb.h>
34958  #include <asm/tlbflush.h>
34959  
34960 -
34961 +#ifndef CONFIG_PREEMPT_RT_FULL
34962  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34963  DEFINE_PER_CPU(int, __kmap_atomic_idx);
34964  #endif
34965 +#endif
34966  
34967  /*
34968   * Virtual_count is not a pure "count".
34969 @@ -108,8 +109,9 @@
34970  unsigned long totalhigh_pages __read_mostly;
34971  EXPORT_SYMBOL(totalhigh_pages);
34972  
34973 -
34974 +#ifndef CONFIG_PREEMPT_RT_FULL
34975  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
34976 +#endif
34977  
34978  unsigned int nr_free_highpages (void)
34979  {
34980 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/Kconfig linux-4.14/mm/Kconfig
34981 --- linux-4.14.orig/mm/Kconfig  2018-09-05 11:03:25.000000000 +0200
34982 +++ linux-4.14/mm/Kconfig       2018-09-05 11:05:07.000000000 +0200
34983 @@ -385,7 +385,7 @@
34984  
34985  config TRANSPARENT_HUGEPAGE
34986         bool "Transparent Hugepage Support"
34987 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
34988 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
34989         select COMPACTION
34990         select RADIX_TREE_MULTIORDER
34991         help
34992 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/memcontrol.c linux-4.14/mm/memcontrol.c
34993 --- linux-4.14.orig/mm/memcontrol.c     2018-09-05 11:03:25.000000000 +0200
34994 +++ linux-4.14/mm/memcontrol.c  2018-09-05 11:05:07.000000000 +0200
34995 @@ -69,6 +69,7 @@
34996  #include <net/sock.h>
34997  #include <net/ip.h>
34998  #include "slab.h"
34999 +#include <linux/locallock.h>
35000  
35001  #include <linux/uaccess.h>
35002  
35003 @@ -94,6 +95,8 @@
35004  #define do_swap_account                0
35005  #endif
35006  
35007 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35008 +
35009  /* Whether legacy memory+swap accounting is active */
35010  static bool do_memsw_account(void)
35011  {
35012 @@ -1831,7 +1834,7 @@
35013          * as well as workers from this path always operate on the local
35014          * per-cpu data. CPU up doesn't touch memcg_stock at all.
35015          */
35016 -       curcpu = get_cpu();
35017 +       curcpu = get_cpu_light();
35018         for_each_online_cpu(cpu) {
35019                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35020                 struct mem_cgroup *memcg;
35021 @@ -1851,7 +1854,7 @@
35022                 }
35023                 css_put(&memcg->css);
35024         }
35025 -       put_cpu();
35026 +       put_cpu_light();
35027         mutex_unlock(&percpu_charge_mutex);
35028  }
35029  
35030 @@ -4624,12 +4627,12 @@
35031  
35032         ret = 0;
35033  
35034 -       local_irq_disable();
35035 +       local_lock_irq(event_lock);
35036         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
35037         memcg_check_events(to, page);
35038         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
35039         memcg_check_events(from, page);
35040 -       local_irq_enable();
35041 +       local_unlock_irq(event_lock);
35042  out_unlock:
35043         unlock_page(page);
35044  out:
35045 @@ -5572,10 +5575,10 @@
35046  
35047         commit_charge(page, memcg, lrucare);
35048  
35049 -       local_irq_disable();
35050 +       local_lock_irq(event_lock);
35051         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
35052         memcg_check_events(memcg, page);
35053 -       local_irq_enable();
35054 +       local_unlock_irq(event_lock);
35055  
35056         if (do_memsw_account() && PageSwapCache(page)) {
35057                 swp_entry_t entry = { .val = page_private(page) };
35058 @@ -5644,7 +5647,7 @@
35059                 memcg_oom_recover(ug->memcg);
35060         }
35061  
35062 -       local_irq_save(flags);
35063 +       local_lock_irqsave(event_lock, flags);
35064         __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
35065         __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
35066         __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
35067 @@ -5652,7 +5655,7 @@
35068         __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
35069         __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
35070         memcg_check_events(ug->memcg, ug->dummy_page);
35071 -       local_irq_restore(flags);
35072 +       local_unlock_irqrestore(event_lock, flags);
35073  
35074         if (!mem_cgroup_is_root(ug->memcg))
35075                 css_put_many(&ug->memcg->css, nr_pages);
35076 @@ -5815,10 +5818,10 @@
35077  
35078         commit_charge(newpage, memcg, false);
35079  
35080 -       local_irq_save(flags);
35081 +       local_lock_irqsave(event_lock, flags);
35082         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
35083         memcg_check_events(memcg, newpage);
35084 -       local_irq_restore(flags);
35085 +       local_unlock_irqrestore(event_lock, flags);
35086  }
35087  
35088  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
35089 @@ -6010,6 +6013,7 @@
35090         struct mem_cgroup *memcg, *swap_memcg;
35091         unsigned int nr_entries;
35092         unsigned short oldid;
35093 +       unsigned long flags;
35094  
35095         VM_BUG_ON_PAGE(PageLRU(page), page);
35096         VM_BUG_ON_PAGE(page_count(page), page);
35097 @@ -6055,13 +6059,17 @@
35098          * important here to have the interrupts disabled because it is the
35099          * only synchronisation we have for udpating the per-CPU variables.
35100          */
35101 +       local_lock_irqsave(event_lock, flags);
35102 +#ifndef CONFIG_PREEMPT_RT_BASE
35103         VM_BUG_ON(!irqs_disabled());
35104 +#endif
35105         mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
35106                                      -nr_entries);
35107         memcg_check_events(memcg, page);
35108  
35109         if (!mem_cgroup_is_root(memcg))
35110                 css_put_many(&memcg->css, nr_entries);
35111 +       local_unlock_irqrestore(event_lock, flags);
35112  }
35113  
35114  /**
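In memcontrol the interrupt-disabled sections around the per-CPU charge statistics become the event_lock local lock, and the irqs_disabled() assertion in the swapout path is only compiled on non-RT, since on PREEMPT_RT the local lock no longer disables interrupts. The generic shape of such a conversion, sketched with a hypothetical lock and per-CPU counter rather than the memcg internals:

#include <linux/percpu.h>
#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(my_event_lock);     /* mirrors the event_lock added above */
static DEFINE_PER_CPU(long, my_events);          /* hypothetical statistic */

static void my_account_event(long delta)
{
	local_lock_irq(my_event_lock);   /* was local_irq_disable() */
	__this_cpu_add(my_events, delta);
	local_unlock_irq(my_event_lock); /* was local_irq_enable() */
}
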
35115 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/mmu_context.c linux-4.14/mm/mmu_context.c
35116 --- linux-4.14.orig/mm/mmu_context.c    2017-11-12 19:46:13.000000000 +0100
35117 +++ linux-4.14/mm/mmu_context.c 2018-09-05 11:05:07.000000000 +0200
35118 @@ -25,6 +25,7 @@
35119         struct task_struct *tsk = current;
35120  
35121         task_lock(tsk);
35122 +       preempt_disable_rt();
35123         active_mm = tsk->active_mm;
35124         if (active_mm != mm) {
35125                 mmgrab(mm);
35126 @@ -32,6 +33,7 @@
35127         }
35128         tsk->mm = mm;
35129         switch_mm(active_mm, mm, tsk);
35130 +       preempt_enable_rt();
35131         task_unlock(tsk);
35132  #ifdef finish_arch_post_lock_switch
35133         finish_arch_post_lock_switch();
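preempt_disable_rt()/preempt_enable_rt() disable preemption only on PREEMPT_RT builds; in use_mm() they keep the active_mm bookkeeping and switch_mm() call atomic on RT, where task_lock() alone (a sleeping lock there) no longer guarantees that. A tiny sketch of the helper pair around a hypothetical critical section:

#include <linux/preempt.h>

static void my_switch_step(void)
{
	preempt_disable_rt();            /* no-op on !RT, preempt_disable() on RT */
	/* ... per-CPU state that must not see a context switch on RT ... */
	preempt_enable_rt();
}
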
35134 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/page_alloc.c linux-4.14/mm/page_alloc.c
35135 --- linux-4.14.orig/mm/page_alloc.c     2018-09-05 11:03:25.000000000 +0200
35136 +++ linux-4.14/mm/page_alloc.c  2018-09-05 11:05:07.000000000 +0200
35137 @@ -61,6 +61,7 @@
35138  #include <linux/hugetlb.h>
35139  #include <linux/sched/rt.h>
35140  #include <linux/sched/mm.h>
35141 +#include <linux/locallock.h>
35142  #include <linux/page_owner.h>
35143  #include <linux/kthread.h>
35144  #include <linux/memcontrol.h>
35145 @@ -286,6 +287,18 @@
35146  EXPORT_SYMBOL(nr_online_nodes);
35147  #endif
35148  
35149 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
35150 +
35151 +#ifdef CONFIG_PREEMPT_RT_BASE
35152 +# define cpu_lock_irqsave(cpu, flags)          \
35153 +       local_lock_irqsave_on(pa_lock, flags, cpu)
35154 +# define cpu_unlock_irqrestore(cpu, flags)     \
35155 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
35156 +#else
35157 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
35158 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
35159 +#endif
35160 +
35161  int page_group_by_mobility_disabled __read_mostly;
35162  
35163  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
35164 @@ -1094,7 +1107,7 @@
35165  #endif /* CONFIG_DEBUG_VM */
35166  
35167  /*
35168 - * Frees a number of pages from the PCP lists
35169 + * Frees a number of pages which have been collected from the pcp lists.
35170   * Assumes all pages on list are in same zone, and of same order.
35171   * count is the number of pages to free.
35172   *
35173 @@ -1105,15 +1118,53 @@
35174   * pinned" detection logic.
35175   */
35176  static void free_pcppages_bulk(struct zone *zone, int count,
35177 -                                       struct per_cpu_pages *pcp)
35178 +                              struct list_head *list)
35179  {
35180 -       int migratetype = 0;
35181 -       int batch_free = 0;
35182         bool isolated_pageblocks;
35183 +       unsigned long flags;
35184  
35185 -       spin_lock(&zone->lock);
35186 +       spin_lock_irqsave(&zone->lock, flags);
35187         isolated_pageblocks = has_isolate_pageblock(zone);
35188  
35189 +       while (!list_empty(list)) {
35190 +               struct page *page;
35191 +               int mt; /* migratetype of the to-be-freed page */
35192 +
35193 +               page = list_first_entry(list, struct page, lru);
35194 +               /* must delete as __free_one_page list manipulates */
35195 +               list_del(&page->lru);
35196 +
35197 +               mt = get_pcppage_migratetype(page);
35198 +               /* MIGRATE_ISOLATE page should not go to pcplists */
35199 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35200 +               /* Pageblock could have been isolated meanwhile */
35201 +               if (unlikely(isolated_pageblocks))
35202 +                       mt = get_pageblock_migratetype(page);
35203 +
35204 +               if (bulkfree_pcp_prepare(page))
35205 +                       continue;
35206 +
35207 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35208 +               trace_mm_page_pcpu_drain(page, 0, mt);
35209 +               count--;
35210 +       }
35211 +       WARN_ON(count != 0);
35212 +       spin_unlock_irqrestore(&zone->lock, flags);
35213 +}
35214 +
35215 +/*
35216 + * Moves a number of pages from the PCP lists to free list which
35217 + * is freed outside of the locked region.
35218 + *
35219 + * Assumes all pages on list are in same zone, and of same order.
35220 + * count is the number of pages to free.
35221 + */
35222 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
35223 +                             struct list_head *dst)
35224 +{
35225 +       int migratetype = 0;
35226 +       int batch_free = 0;
35227 +
35228         while (count) {
35229                 struct page *page;
35230                 struct list_head *list;
35231 @@ -1129,7 +1180,7 @@
35232                         batch_free++;
35233                         if (++migratetype == MIGRATE_PCPTYPES)
35234                                 migratetype = 0;
35235 -                       list = &pcp->lists[migratetype];
35236 +                       list = &src->lists[migratetype];
35237                 } while (list_empty(list));
35238  
35239                 /* This is the only non-empty list. Free them all. */
35240 @@ -1137,27 +1188,12 @@
35241                         batch_free = count;
35242  
35243                 do {
35244 -                       int mt; /* migratetype of the to-be-freed page */
35245 -
35246                         page = list_last_entry(list, struct page, lru);
35247 -                       /* must delete as __free_one_page list manipulates */
35248                         list_del(&page->lru);
35249  
35250 -                       mt = get_pcppage_migratetype(page);
35251 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
35252 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
35253 -                       /* Pageblock could have been isolated meanwhile */
35254 -                       if (unlikely(isolated_pageblocks))
35255 -                               mt = get_pageblock_migratetype(page);
35256 -
35257 -                       if (bulkfree_pcp_prepare(page))
35258 -                               continue;
35259 -
35260 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
35261 -                       trace_mm_page_pcpu_drain(page, 0, mt);
35262 +                       list_add(&page->lru, dst);
35263                 } while (--count && --batch_free && !list_empty(list));
35264         }
35265 -       spin_unlock(&zone->lock);
35266  }
35267  
35268  static void free_one_page(struct zone *zone,
35269 @@ -1165,13 +1201,15 @@
35270                                 unsigned int order,
35271                                 int migratetype)
35272  {
35273 -       spin_lock(&zone->lock);
35274 +       unsigned long flags;
35275 +
35276 +       spin_lock_irqsave(&zone->lock, flags);
35277         if (unlikely(has_isolate_pageblock(zone) ||
35278                 is_migrate_isolate(migratetype))) {
35279                 migratetype = get_pfnblock_migratetype(page, pfn);
35280         }
35281         __free_one_page(page, pfn, zone, order, migratetype);
35282 -       spin_unlock(&zone->lock);
35283 +       spin_unlock_irqrestore(&zone->lock, flags);
35284  }
35285  
35286  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
35287 @@ -1257,10 +1295,10 @@
35288                 return;
35289  
35290         migratetype = get_pfnblock_migratetype(page, pfn);
35291 -       local_irq_save(flags);
35292 +       local_lock_irqsave(pa_lock, flags);
35293         __count_vm_events(PGFREE, 1 << order);
35294         free_one_page(page_zone(page), page, pfn, order, migratetype);
35295 -       local_irq_restore(flags);
35296 +       local_unlock_irqrestore(pa_lock, flags);
35297  }
35298  
35299  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
35300 @@ -2378,16 +2416,18 @@
35301  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
35302  {
35303         unsigned long flags;
35304 +       LIST_HEAD(dst);
35305         int to_drain, batch;
35306  
35307 -       local_irq_save(flags);
35308 +       local_lock_irqsave(pa_lock, flags);
35309         batch = READ_ONCE(pcp->batch);
35310         to_drain = min(pcp->count, batch);
35311         if (to_drain > 0) {
35312 -               free_pcppages_bulk(zone, to_drain, pcp);
35313 +               isolate_pcp_pages(to_drain, pcp, &dst);
35314                 pcp->count -= to_drain;
35315         }
35316 -       local_irq_restore(flags);
35317 +       local_unlock_irqrestore(pa_lock, flags);
35318 +       free_pcppages_bulk(zone, to_drain, &dst);
35319  }
35320  #endif
35321  
35322 @@ -2403,16 +2443,21 @@
35323         unsigned long flags;
35324         struct per_cpu_pageset *pset;
35325         struct per_cpu_pages *pcp;
35326 +       LIST_HEAD(dst);
35327 +       int count;
35328  
35329 -       local_irq_save(flags);
35330 +       cpu_lock_irqsave(cpu, flags);
35331         pset = per_cpu_ptr(zone->pageset, cpu);
35332  
35333         pcp = &pset->pcp;
35334 -       if (pcp->count) {
35335 -               free_pcppages_bulk(zone, pcp->count, pcp);
35336 +       count = pcp->count;
35337 +       if (count) {
35338 +               isolate_pcp_pages(count, pcp, &dst);
35339                 pcp->count = 0;
35340         }
35341 -       local_irq_restore(flags);
35342 +       cpu_unlock_irqrestore(cpu, flags);
35343 +       if (count)
35344 +               free_pcppages_bulk(zone, count, &dst);
35345  }
35346  
35347  /*
35348 @@ -2447,6 +2492,7 @@
35349                 drain_pages(cpu);
35350  }
35351  
35352 +#ifndef CONFIG_PREEMPT_RT_BASE
35353  static void drain_local_pages_wq(struct work_struct *work)
35354  {
35355         /*
35356 @@ -2460,6 +2506,7 @@
35357         drain_local_pages(NULL);
35358         preempt_enable();
35359  }
35360 +#endif
35361  
35362  /*
35363   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
35364 @@ -2526,7 +2573,14 @@
35365                 else
35366                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
35367         }
35368 -
35369 +#ifdef CONFIG_PREEMPT_RT_BASE
35370 +       for_each_cpu(cpu, &cpus_with_pcps) {
35371 +               if (zone)
35372 +                       drain_pages_zone(cpu, zone);
35373 +               else
35374 +                       drain_pages(cpu);
35375 +       }
35376 +#else
35377         for_each_cpu(cpu, &cpus_with_pcps) {
35378                 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
35379                 INIT_WORK(work, drain_local_pages_wq);
35380 @@ -2534,6 +2588,7 @@
35381         }
35382         for_each_cpu(cpu, &cpus_with_pcps)
35383                 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
35384 +#endif
35385  
35386         mutex_unlock(&pcpu_drain_mutex);
35387  }
35388 @@ -2610,7 +2665,7 @@
35389  
35390         migratetype = get_pfnblock_migratetype(page, pfn);
35391         set_pcppage_migratetype(page, migratetype);
35392 -       local_irq_save(flags);
35393 +       local_lock_irqsave(pa_lock, flags);
35394         __count_vm_event(PGFREE);
35395  
35396         /*
35397 @@ -2636,12 +2691,17 @@
35398         pcp->count++;
35399         if (pcp->count >= pcp->high) {
35400                 unsigned long batch = READ_ONCE(pcp->batch);
35401 -               free_pcppages_bulk(zone, batch, pcp);
35402 +               LIST_HEAD(dst);
35403 +
35404 +               isolate_pcp_pages(batch, pcp, &dst);
35405                 pcp->count -= batch;
35406 +               local_unlock_irqrestore(pa_lock, flags);
35407 +               free_pcppages_bulk(zone, batch, &dst);
35408 +               return;
35409         }
35410  
35411  out:
35412 -       local_irq_restore(flags);
35413 +       local_unlock_irqrestore(pa_lock, flags);
35414  }
35415  
35416  /*
35417 @@ -2789,7 +2849,7 @@
35418         struct page *page;
35419         unsigned long flags;
35420  
35421 -       local_irq_save(flags);
35422 +       local_lock_irqsave(pa_lock, flags);
35423         pcp = &this_cpu_ptr(zone->pageset)->pcp;
35424         list = &pcp->lists[migratetype];
35425         page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
35426 @@ -2797,7 +2857,7 @@
35427                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35428                 zone_statistics(preferred_zone, zone);
35429         }
35430 -       local_irq_restore(flags);
35431 +       local_unlock_irqrestore(pa_lock, flags);
35432         return page;
35433  }
35434  
35435 @@ -2824,7 +2884,7 @@
35436          * allocate greater than order-1 page units with __GFP_NOFAIL.
35437          */
35438         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
35439 -       spin_lock_irqsave(&zone->lock, flags);
35440 +       local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
35441  
35442         do {
35443                 page = NULL;
35444 @@ -2844,14 +2904,14 @@
35445  
35446         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
35447         zone_statistics(preferred_zone, zone);
35448 -       local_irq_restore(flags);
35449 +       local_unlock_irqrestore(pa_lock, flags);
35450  
35451  out:
35452         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
35453         return page;
35454  
35455  failed:
35456 -       local_irq_restore(flags);
35457 +       local_unlock_irqrestore(pa_lock, flags);
35458         return NULL;
35459  }
35460  
35461 @@ -6778,8 +6838,9 @@
35462  
35463  static int page_alloc_cpu_dead(unsigned int cpu)
35464  {
35465 -
35466 +       local_lock_irq_on(swapvec_lock, cpu);
35467         lru_add_drain_cpu(cpu);
35468 +       local_unlock_irq_on(swapvec_lock, cpu);
35469         drain_pages(cpu);
35470  
35471         /*
35472 @@ -7683,7 +7744,7 @@
35473         struct per_cpu_pageset *pset;
35474  
35475         /* avoid races with drain_pages()  */
35476 -       local_irq_save(flags);
35477 +       local_lock_irqsave(pa_lock, flags);
35478         if (zone->pageset != &boot_pageset) {
35479                 for_each_online_cpu(cpu) {
35480                         pset = per_cpu_ptr(zone->pageset, cpu);
35481 @@ -7692,7 +7753,7 @@
35482                 free_percpu(zone->pageset);
35483                 zone->pageset = &boot_pageset;
35484         }
35485 -       local_irq_restore(flags);
35486 +       local_unlock_irqrestore(pa_lock, flags);
35487  }
35488  
35489  #ifdef CONFIG_MEMORY_HOTREMOVE
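The page_alloc changes follow one theme: the per-CPU pagelist (pcp) paths move under the pa_lock local lock, and free_pcppages_bulk() is split so pages are first isolated onto a private list under pa_lock (isolate_pcp_pages()) and only then handed back to the buddy allocator under zone->lock, outside the per-CPU critical section. A condensed sketch of that two-stage shape, reduced to the list handling with hypothetical types:

#include <linux/list.h>

struct my_page { struct list_head lru; };        /* stand-in for struct page */

/* stage 1: detach a bounded amount of work under the per-CPU lock */
static void my_isolate(struct list_head *pcp_list, int count, struct list_head *dst)
{
	while (count-- && !list_empty(pcp_list))
		list_move(pcp_list->next, dst);
}

/* stage 2: do the expensive freeing under the shared zone lock, after
 * the per-CPU lock has been dropped */
static void my_free_bulk(struct list_head *dst)
{
	struct my_page *page, *tmp;

	list_for_each_entry_safe(page, tmp, dst, lru) {
		list_del(&page->lru);
		/* ... return the page to the buddy allocator ... */
	}
}
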
35490 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slab.h linux-4.14/mm/slab.h
35491 --- linux-4.14.orig/mm/slab.h   2018-09-05 11:03:25.000000000 +0200
35492 +++ linux-4.14/mm/slab.h        2018-09-05 11:05:07.000000000 +0200
35493 @@ -451,7 +451,11 @@
35494   * The slab lists for all objects.
35495   */
35496  struct kmem_cache_node {
35497 +#ifdef CONFIG_SLUB
35498 +       raw_spinlock_t list_lock;
35499 +#else
35500         spinlock_t list_lock;
35501 +#endif
35502  
35503  #ifdef CONFIG_SLAB
35504         struct list_head slabs_partial; /* partial list first, better asm code */
35505 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/slub.c linux-4.14/mm/slub.c
35506 --- linux-4.14.orig/mm/slub.c   2018-09-05 11:03:25.000000000 +0200
35507 +++ linux-4.14/mm/slub.c        2018-09-05 11:05:07.000000000 +0200
35508 @@ -1179,7 +1179,7 @@
35509         unsigned long uninitialized_var(flags);
35510         int ret = 0;
35511  
35512 -       spin_lock_irqsave(&n->list_lock, flags);
35513 +       raw_spin_lock_irqsave(&n->list_lock, flags);
35514         slab_lock(page);
35515  
35516         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
35517 @@ -1214,7 +1214,7 @@
35518                          bulk_cnt, cnt);
35519  
35520         slab_unlock(page);
35521 -       spin_unlock_irqrestore(&n->list_lock, flags);
35522 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35523         if (!ret)
35524                 slab_fix(s, "Object at 0x%p not freed", object);
35525         return ret;
35526 @@ -1342,6 +1342,12 @@
35527  
35528  #endif /* CONFIG_SLUB_DEBUG */
35529  
35530 +struct slub_free_list {
35531 +       raw_spinlock_t          lock;
35532 +       struct list_head        list;
35533 +};
35534 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
35535 +
35536  /*
35537   * Hooks for other subsystems that check memory allocations. In a typical
35538   * production configuration these hooks all should produce no code at all.
35539 @@ -1561,10 +1567,17 @@
35540         void *start, *p;
35541         int idx, order;
35542         bool shuffle;
35543 +       bool enableirqs = false;
35544  
35545         flags &= gfp_allowed_mask;
35546  
35547         if (gfpflags_allow_blocking(flags))
35548 +               enableirqs = true;
35549 +#ifdef CONFIG_PREEMPT_RT_FULL
35550 +       if (system_state > SYSTEM_BOOTING)
35551 +               enableirqs = true;
35552 +#endif
35553 +       if (enableirqs)
35554                 local_irq_enable();
35555  
35556         flags |= s->allocflags;
35557 @@ -1623,7 +1636,7 @@
35558         page->frozen = 1;
35559  
35560  out:
35561 -       if (gfpflags_allow_blocking(flags))
35562 +       if (enableirqs)
35563                 local_irq_disable();
35564         if (!page)
35565                 return NULL;
35566 @@ -1681,6 +1694,16 @@
35567         __free_pages(page, order);
35568  }
35569  
35570 +static void free_delayed(struct list_head *h)
35571 +{
35572 +       while(!list_empty(h)) {
35573 +               struct page *page = list_first_entry(h, struct page, lru);
35574 +
35575 +               list_del(&page->lru);
35576 +               __free_slab(page->slab_cache, page);
35577 +       }
35578 +}
35579 +
35580  #define need_reserve_slab_rcu                                          \
35581         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
35582  
35583 @@ -1712,6 +1735,12 @@
35584                 }
35585  
35586                 call_rcu(head, rcu_free_slab);
35587 +       } else if (irqs_disabled()) {
35588 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
35589 +
35590 +               raw_spin_lock(&f->lock);
35591 +               list_add(&page->lru, &f->list);
35592 +               raw_spin_unlock(&f->lock);
35593         } else
35594                 __free_slab(s, page);
35595  }
35596 @@ -1819,7 +1848,7 @@
35597         if (!n || !n->nr_partial)
35598                 return NULL;
35599  
35600 -       spin_lock(&n->list_lock);
35601 +       raw_spin_lock(&n->list_lock);
35602         list_for_each_entry_safe(page, page2, &n->partial, lru) {
35603                 void *t;
35604  
35605 @@ -1844,7 +1873,7 @@
35606                         break;
35607  
35608         }
35609 -       spin_unlock(&n->list_lock);
35610 +       raw_spin_unlock(&n->list_lock);
35611         return object;
35612  }
35613  
35614 @@ -2090,7 +2119,7 @@
35615                          * that acquire_slab() will see a slab page that
35616                          * is frozen
35617                          */
35618 -                       spin_lock(&n->list_lock);
35619 +                       raw_spin_lock(&n->list_lock);
35620                 }
35621         } else {
35622                 m = M_FULL;
35623 @@ -2101,7 +2130,7 @@
35624                          * slabs from diagnostic functions will not see
35625                          * any frozen slabs.
35626                          */
35627 -                       spin_lock(&n->list_lock);
35628 +                       raw_spin_lock(&n->list_lock);
35629                 }
35630         }
35631  
35632 @@ -2136,7 +2165,7 @@
35633                 goto redo;
35634  
35635         if (lock)
35636 -               spin_unlock(&n->list_lock);
35637 +               raw_spin_unlock(&n->list_lock);
35638  
35639         if (m == M_FREE) {
35640                 stat(s, DEACTIVATE_EMPTY);
35641 @@ -2171,10 +2200,10 @@
35642                 n2 = get_node(s, page_to_nid(page));
35643                 if (n != n2) {
35644                         if (n)
35645 -                               spin_unlock(&n->list_lock);
35646 +                               raw_spin_unlock(&n->list_lock);
35647  
35648                         n = n2;
35649 -                       spin_lock(&n->list_lock);
35650 +                       raw_spin_lock(&n->list_lock);
35651                 }
35652  
35653                 do {
35654 @@ -2203,7 +2232,7 @@
35655         }
35656  
35657         if (n)
35658 -               spin_unlock(&n->list_lock);
35659 +               raw_spin_unlock(&n->list_lock);
35660  
35661         while (discard_page) {
35662                 page = discard_page;
35663 @@ -2242,14 +2271,21 @@
35664                         pobjects = oldpage->pobjects;
35665                         pages = oldpage->pages;
35666                         if (drain && pobjects > s->cpu_partial) {
35667 +                               struct slub_free_list *f;
35668                                 unsigned long flags;
35669 +                               LIST_HEAD(tofree);
35670                                 /*
35671                                  * partial array is full. Move the existing
35672                                  * set to the per node partial list.
35673                                  */
35674                                 local_irq_save(flags);
35675                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
35676 +                               f = this_cpu_ptr(&slub_free_list);
35677 +                               raw_spin_lock(&f->lock);
35678 +                               list_splice_init(&f->list, &tofree);
35679 +                               raw_spin_unlock(&f->lock);
35680                                 local_irq_restore(flags);
35681 +                               free_delayed(&tofree);
35682                                 oldpage = NULL;
35683                                 pobjects = 0;
35684                                 pages = 0;
35685 @@ -2319,7 +2355,22 @@
35686  
35687  static void flush_all(struct kmem_cache *s)
35688  {
35689 +       LIST_HEAD(tofree);
35690 +       int cpu;
35691 +
35692         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
35693 +       for_each_online_cpu(cpu) {
35694 +               struct slub_free_list *f;
35695 +
35696 +               if (!has_cpu_slab(cpu, s))
35697 +                       continue;
35698 +
35699 +               f = &per_cpu(slub_free_list, cpu);
35700 +               raw_spin_lock_irq(&f->lock);
35701 +               list_splice_init(&f->list, &tofree);
35702 +               raw_spin_unlock_irq(&f->lock);
35703 +               free_delayed(&tofree);
35704 +       }
35705  }
35706  
35707  /*
35708 @@ -2374,10 +2425,10 @@
35709         unsigned long x = 0;
35710         struct page *page;
35711  
35712 -       spin_lock_irqsave(&n->list_lock, flags);
35713 +       raw_spin_lock_irqsave(&n->list_lock, flags);
35714         list_for_each_entry(page, &n->partial, lru)
35715                 x += get_count(page);
35716 -       spin_unlock_irqrestore(&n->list_lock, flags);
35717 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35718         return x;
35719  }
35720  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
35721 @@ -2515,8 +2566,10 @@
35722   * already disabled (which is the case for bulk allocation).
35723   */
35724  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
35725 -                         unsigned long addr, struct kmem_cache_cpu *c)
35726 +                         unsigned long addr, struct kmem_cache_cpu *c,
35727 +                         struct list_head *to_free)
35728  {
35729 +       struct slub_free_list *f;
35730         void *freelist;
35731         struct page *page;
35732  
35733 @@ -2572,6 +2625,13 @@
35734         VM_BUG_ON(!c->page->frozen);
35735         c->freelist = get_freepointer(s, freelist);
35736         c->tid = next_tid(c->tid);
35737 +
35738 +out:
35739 +       f = this_cpu_ptr(&slub_free_list);
35740 +       raw_spin_lock(&f->lock);
35741 +       list_splice_init(&f->list, to_free);
35742 +       raw_spin_unlock(&f->lock);
35743 +
35744         return freelist;
35745  
35746  new_slab:
35747 @@ -2587,7 +2647,7 @@
35748  
35749         if (unlikely(!freelist)) {
35750                 slab_out_of_memory(s, gfpflags, node);
35751 -               return NULL;
35752 +               goto out;
35753         }
35754  
35755         page = c->page;
35756 @@ -2600,7 +2660,7 @@
35757                 goto new_slab;  /* Slab failed checks. Next slab needed */
35758  
35759         deactivate_slab(s, page, get_freepointer(s, freelist), c);
35760 -       return freelist;
35761 +       goto out;
35762  }
35763  
35764  /*
35765 @@ -2612,6 +2672,7 @@
35766  {
35767         void *p;
35768         unsigned long flags;
35769 +       LIST_HEAD(tofree);
35770  
35771         local_irq_save(flags);
35772  #ifdef CONFIG_PREEMPT
35773 @@ -2623,8 +2684,9 @@
35774         c = this_cpu_ptr(s->cpu_slab);
35775  #endif
35776  
35777 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
35778 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
35779         local_irq_restore(flags);
35780 +       free_delayed(&tofree);
35781         return p;
35782  }
35783  
35784 @@ -2810,7 +2872,7 @@
35785  
35786         do {
35787                 if (unlikely(n)) {
35788 -                       spin_unlock_irqrestore(&n->list_lock, flags);
35789 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35790                         n = NULL;
35791                 }
35792                 prior = page->freelist;
35793 @@ -2842,7 +2904,7 @@
35794                                  * Otherwise the list_lock will synchronize with
35795                                  * other processors updating the list of slabs.
35796                                  */
35797 -                               spin_lock_irqsave(&n->list_lock, flags);
35798 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
35799  
35800                         }
35801                 }
35802 @@ -2884,7 +2946,7 @@
35803                 add_partial(n, page, DEACTIVATE_TO_TAIL);
35804                 stat(s, FREE_ADD_PARTIAL);
35805         }
35806 -       spin_unlock_irqrestore(&n->list_lock, flags);
35807 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35808         return;
35809  
35810  slab_empty:
35811 @@ -2899,7 +2961,7 @@
35812                 remove_full(s, n, page);
35813         }
35814  
35815 -       spin_unlock_irqrestore(&n->list_lock, flags);
35816 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35817         stat(s, FREE_SLAB);
35818         discard_slab(s, page);
35819  }
35820 @@ -3104,6 +3166,7 @@
35821                           void **p)
35822  {
35823         struct kmem_cache_cpu *c;
35824 +       LIST_HEAD(to_free);
35825         int i;
35826  
35827         /* memcg and kmem_cache debug support */
35828 @@ -3127,7 +3190,7 @@
35829                          * of re-populating per CPU c->freelist
35830                          */
35831                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
35832 -                                           _RET_IP_, c);
35833 +                                           _RET_IP_, c, &to_free);
35834                         if (unlikely(!p[i]))
35835                                 goto error;
35836  
35837 @@ -3139,6 +3202,7 @@
35838         }
35839         c->tid = next_tid(c->tid);
35840         local_irq_enable();
35841 +       free_delayed(&to_free);
35842  
35843         /* Clear memory outside IRQ disabled fastpath loop */
35844         if (unlikely(flags & __GFP_ZERO)) {
35845 @@ -3153,6 +3217,7 @@
35846         return i;
35847  error:
35848         local_irq_enable();
35849 +       free_delayed(&to_free);
35850         slab_post_alloc_hook(s, flags, i, p);
35851         __kmem_cache_free_bulk(s, i, p);
35852         return 0;
35853 @@ -3286,7 +3351,7 @@
35854  init_kmem_cache_node(struct kmem_cache_node *n)
35855  {
35856         n->nr_partial = 0;
35857 -       spin_lock_init(&n->list_lock);
35858 +       raw_spin_lock_init(&n->list_lock);
35859         INIT_LIST_HEAD(&n->partial);
35860  #ifdef CONFIG_SLUB_DEBUG
35861         atomic_long_set(&n->nr_slabs, 0);
35862 @@ -3640,6 +3705,10 @@
35863                                                         const char *text)
35864  {
35865  #ifdef CONFIG_SLUB_DEBUG
35866 +#ifdef CONFIG_PREEMPT_RT_BASE
35867 +       /* XXX move out of irq-off section */
35868 +       slab_err(s, page, text, s->name);
35869 +#else
35870         void *addr = page_address(page);
35871         void *p;
35872         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
35873 @@ -3660,6 +3729,7 @@
35874         slab_unlock(page);
35875         kfree(map);
35876  #endif
35877 +#endif
35878  }
35879  
35880  /*
35881 @@ -3673,7 +3743,7 @@
35882         struct page *page, *h;
35883  
35884         BUG_ON(irqs_disabled());
35885 -       spin_lock_irq(&n->list_lock);
35886 +       raw_spin_lock_irq(&n->list_lock);
35887         list_for_each_entry_safe(page, h, &n->partial, lru) {
35888                 if (!page->inuse) {
35889                         remove_partial(n, page);
35890 @@ -3683,7 +3753,7 @@
35891                         "Objects remaining in %s on __kmem_cache_shutdown()");
35892                 }
35893         }
35894 -       spin_unlock_irq(&n->list_lock);
35895 +       raw_spin_unlock_irq(&n->list_lock);
35896  
35897         list_for_each_entry_safe(page, h, &discard, lru)
35898                 discard_slab(s, page);
35899 @@ -3927,7 +3997,7 @@
35900                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
35901                         INIT_LIST_HEAD(promote + i);
35902  
35903 -               spin_lock_irqsave(&n->list_lock, flags);
35904 +               raw_spin_lock_irqsave(&n->list_lock, flags);
35905  
35906                 /*
35907                  * Build lists of slabs to discard or promote.
35908 @@ -3958,7 +4028,7 @@
35909                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
35910                         list_splice(promote + i, &n->partial);
35911  
35912 -               spin_unlock_irqrestore(&n->list_lock, flags);
35913 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
35914  
35915                 /* Release empty slabs */
35916                 list_for_each_entry_safe(page, t, &discard, lru)
35917 @@ -4171,6 +4241,12 @@
35918  {
35919         static __initdata struct kmem_cache boot_kmem_cache,
35920                 boot_kmem_cache_node;
35921 +       int cpu;
35922 +
35923 +       for_each_possible_cpu(cpu) {
35924 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
35925 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
35926 +       }
35927  
35928         if (debug_guardpage_minorder())
35929                 slub_max_order = 0;
35930 @@ -4379,7 +4455,7 @@
35931         struct page *page;
35932         unsigned long flags;
35933  
35934 -       spin_lock_irqsave(&n->list_lock, flags);
35935 +       raw_spin_lock_irqsave(&n->list_lock, flags);
35936  
35937         list_for_each_entry(page, &n->partial, lru) {
35938                 validate_slab_slab(s, page, map);
35939 @@ -4401,7 +4477,7 @@
35940                        s->name, count, atomic_long_read(&n->nr_slabs));
35941  
35942  out:
35943 -       spin_unlock_irqrestore(&n->list_lock, flags);
35944 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
35945         return count;
35946  }
35947  
35948 @@ -4589,12 +4665,12 @@
35949                 if (!atomic_long_read(&n->nr_slabs))
35950                         continue;
35951  
35952 -               spin_lock_irqsave(&n->list_lock, flags);
35953 +               raw_spin_lock_irqsave(&n->list_lock, flags);
35954                 list_for_each_entry(page, &n->partial, lru)
35955                         process_slab(&t, s, page, alloc, map);
35956                 list_for_each_entry(page, &n->full, lru)
35957                         process_slab(&t, s, page, alloc, map);
35958 -               spin_unlock_irqrestore(&n->list_lock, flags);
35959 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
35960         }
35961  
35962         for (i = 0; i < t.count; i++) {
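The SLUB conversion makes kmem_cache_node::list_lock a raw spinlock and defers __free_slab() whenever it would otherwise run with interrupts disabled: the slab page is parked on the per-CPU slub_free_list and released later by free_delayed() once interrupts are enabled again, which is why free_delayed(&tofree) appears right after local_irq_restore()/local_irq_enable() in the allocation paths. A compact sketch of the defer-then-flush idea on a hypothetical object type:

#include <linux/list.h>
#include <linux/spinlock.h>

struct deferred { struct list_head node; };      /* hypothetical deferred object */

static LIST_HEAD(deferred_list);                 /* per-CPU in the real patch */
static DEFINE_RAW_SPINLOCK(deferred_lock);

/* called with irqs off: only queue, never do the expensive free here */
static void defer_free(struct deferred *obj)
{
	raw_spin_lock(&deferred_lock);
	list_add(&obj->node, &deferred_list);
	raw_spin_unlock(&deferred_lock);
}

/* called with irqs enabled: drain the queue and really free */
static void flush_deferred(void (*do_free)(struct deferred *))
{
	LIST_HEAD(tofree);
	struct deferred *obj, *tmp;

	raw_spin_lock_irq(&deferred_lock);
	list_splice_init(&deferred_list, &tofree);
	raw_spin_unlock_irq(&deferred_lock);

	list_for_each_entry_safe(obj, tmp, &tofree, node) {
		list_del(&obj->node);
		do_free(obj);
	}
}
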
35963 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/swap.c linux-4.14/mm/swap.c
35964 --- linux-4.14.orig/mm/swap.c   2017-11-12 19:46:13.000000000 +0100
35965 +++ linux-4.14/mm/swap.c        2018-09-05 11:05:07.000000000 +0200
35966 @@ -32,6 +32,7 @@
35967  #include <linux/memcontrol.h>
35968  #include <linux/gfp.h>
35969  #include <linux/uio.h>
35970 +#include <linux/locallock.h>
35971  #include <linux/hugetlb.h>
35972  #include <linux/page_idle.h>
35973  
35974 @@ -50,6 +51,8 @@
35975  #ifdef CONFIG_SMP
35976  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
35977  #endif
35978 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
35979 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
35980  
35981  /*
35982   * This path almost never happens for VM activity - pages are normally
35983 @@ -252,11 +255,11 @@
35984                 unsigned long flags;
35985  
35986                 get_page(page);
35987 -               local_irq_save(flags);
35988 +               local_lock_irqsave(rotate_lock, flags);
35989                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
35990                 if (!pagevec_add(pvec, page) || PageCompound(page))
35991                         pagevec_move_tail(pvec);
35992 -               local_irq_restore(flags);
35993 +               local_unlock_irqrestore(rotate_lock, flags);
35994         }
35995  }
35996  
35997 @@ -306,12 +309,13 @@
35998  {
35999         page = compound_head(page);
36000         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36001 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36002 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
36003 +                                                      activate_page_pvecs);
36004  
36005                 get_page(page);
36006                 if (!pagevec_add(pvec, page) || PageCompound(page))
36007                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
36008 -               put_cpu_var(activate_page_pvecs);
36009 +               put_locked_var(swapvec_lock, activate_page_pvecs);
36010         }
36011  }
36012  
36013 @@ -338,7 +342,7 @@
36014  
36015  static void __lru_cache_activate_page(struct page *page)
36016  {
36017 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36018 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36019         int i;
36020  
36021         /*
36022 @@ -360,7 +364,7 @@
36023                 }
36024         }
36025  
36026 -       put_cpu_var(lru_add_pvec);
36027 +       put_locked_var(swapvec_lock, lru_add_pvec);
36028  }
36029  
36030  /*
36031 @@ -402,12 +406,12 @@
36032  
36033  static void __lru_cache_add(struct page *page)
36034  {
36035 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36036 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36037  
36038         get_page(page);
36039         if (!pagevec_add(pvec, page) || PageCompound(page))
36040                 __pagevec_lru_add(pvec);
36041 -       put_cpu_var(lru_add_pvec);
36042 +       put_locked_var(swapvec_lock, lru_add_pvec);
36043  }
36044  
36045  /**
36046 @@ -613,9 +617,15 @@
36047                 unsigned long flags;
36048  
36049                 /* No harm done if a racing interrupt already did this */
36050 -               local_irq_save(flags);
36051 +#ifdef CONFIG_PREEMPT_RT_BASE
36052 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
36053                 pagevec_move_tail(pvec);
36054 -               local_irq_restore(flags);
36055 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
36056 +#else
36057 +               local_lock_irqsave(rotate_lock, flags);
36058 +               pagevec_move_tail(pvec);
36059 +               local_unlock_irqrestore(rotate_lock, flags);
36060 +#endif
36061         }
36062  
36063         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
36064 @@ -647,11 +657,12 @@
36065                 return;
36066  
36067         if (likely(get_page_unless_zero(page))) {
36068 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
36069 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
36070 +                                                      lru_deactivate_file_pvecs);
36071  
36072                 if (!pagevec_add(pvec, page) || PageCompound(page))
36073                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
36074 -               put_cpu_var(lru_deactivate_file_pvecs);
36075 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
36076         }
36077  }
36078  
36079 @@ -666,21 +677,32 @@
36080  {
36081         if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
36082             !PageSwapCache(page) && !PageUnevictable(page)) {
36083 -               struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
36084 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
36085 +                                                      lru_lazyfree_pvecs);
36086  
36087                 get_page(page);
36088                 if (!pagevec_add(pvec, page) || PageCompound(page))
36089                         pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
36090 -               put_cpu_var(lru_lazyfree_pvecs);
36091 +               put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
36092         }
36093  }
36094  
36095  void lru_add_drain(void)
36096  {
36097 -       lru_add_drain_cpu(get_cpu());
36098 -       put_cpu();
36099 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
36100 +       local_unlock_cpu(swapvec_lock);
36101  }
36102  
36103 +#ifdef CONFIG_PREEMPT_RT_BASE
36104 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36105 +{
36106 +       local_lock_on(swapvec_lock, cpu);
36107 +       lru_add_drain_cpu(cpu);
36108 +       local_unlock_on(swapvec_lock, cpu);
36109 +}
36110 +
36111 +#else
36112 +
36113  static void lru_add_drain_per_cpu(struct work_struct *dummy)
36114  {
36115         lru_add_drain();
36116 @@ -688,6 +710,16 @@
36117  
36118  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
36119  
36120 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
36121 +{
36122 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36123 +
36124 +       INIT_WORK(work, lru_add_drain_per_cpu);
36125 +       queue_work_on(cpu, mm_percpu_wq, work);
36126 +       cpumask_set_cpu(cpu, has_work);
36127 +}
36128 +#endif
36129 +
36130  void lru_add_drain_all_cpuslocked(void)
36131  {
36132         static DEFINE_MUTEX(lock);
36133 @@ -705,21 +737,19 @@
36134         cpumask_clear(&has_work);
36135  
36136         for_each_online_cpu(cpu) {
36137 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
36138  
36139                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
36140                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
36141                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
36142                     pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
36143 -                   need_activate_page_drain(cpu)) {
36144 -                       INIT_WORK(work, lru_add_drain_per_cpu);
36145 -                       queue_work_on(cpu, mm_percpu_wq, work);
36146 -                       cpumask_set_cpu(cpu, &has_work);
36147 -               }
36148 +                   need_activate_page_drain(cpu))
36149 +                       remote_lru_add_drain(cpu, &has_work);
36150         }
36151  
36152 +#ifndef CONFIG_PREEMPT_RT_BASE
36153         for_each_cpu(cpu, &has_work)
36154                 flush_work(&per_cpu(lru_add_drain_work, cpu));
36155 +#endif
36156  
36157         mutex_unlock(&lock);
36158  }
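
The mm/swap.c hunks above convert the per-CPU pagevecs from get_cpu_var()/put_cpu_var(), which disable preemption, to the -rt locallock API: a named per-CPU lock (swapvec_lock) serializes access while the section stays preemptible on PREEMPT_RT. A minimal sketch of the pattern, assuming the locallock.h helpers introduced elsewhere in this patch (DEFINE_LOCAL_IRQ_LOCK, get_locked_var, put_locked_var); my_lock and my_pvecs are made-up names for illustration:

        /* Sketch only: kernel context assumed; my_lock/my_pvecs are placeholders. */
        #include <linux/locallock.h>
        #include <linux/pagevec.h>

        static DEFINE_LOCAL_IRQ_LOCK(my_lock);
        static DEFINE_PER_CPU(struct pagevec, my_pvecs);

        static void my_cache_add(struct page *page)
        {
                /* On !RT this behaves like get_cpu_var() (preemption off);
                 * on PREEMPT_RT it takes a per-CPU sleeping lock instead,
                 * so the section remains preemptible. */
                struct pagevec *pvec = &get_locked_var(my_lock, my_pvecs);

                get_page(page);
                if (!pagevec_add(pvec, page))
                        __pagevec_lru_add(pvec);
                put_locked_var(my_lock, my_pvecs);
        }
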
36159 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/truncate.c linux-4.14/mm/truncate.c
36160 --- linux-4.14.orig/mm/truncate.c       2017-11-12 19:46:13.000000000 +0100
36161 +++ linux-4.14/mm/truncate.c    2018-09-05 11:05:07.000000000 +0200
36162 @@ -41,8 +41,10 @@
36163                 goto unlock;
36164         if (*slot != entry)
36165                 goto unlock;
36166 +       local_lock(shadow_nodes_lock);
36167         __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
36168 -                            workingset_update_node, mapping);
36169 +                            __workingset_update_node, mapping);
36170 +       local_unlock(shadow_nodes_lock);
36171         mapping->nrexceptional--;
36172  unlock:
36173         spin_unlock_irq(&mapping->tree_lock);
36174 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmalloc.c linux-4.14/mm/vmalloc.c
36175 --- linux-4.14.orig/mm/vmalloc.c        2018-09-05 11:03:25.000000000 +0200
36176 +++ linux-4.14/mm/vmalloc.c     2018-09-05 11:05:07.000000000 +0200
36177 @@ -865,7 +865,7 @@
36178         struct vmap_block *vb;
36179         struct vmap_area *va;
36180         unsigned long vb_idx;
36181 -       int node, err;
36182 +       int node, err, cpu;
36183         void *vaddr;
36184  
36185         node = numa_node_id();
36186 @@ -908,11 +908,12 @@
36187         BUG_ON(err);
36188         radix_tree_preload_end();
36189  
36190 -       vbq = &get_cpu_var(vmap_block_queue);
36191 +       cpu = get_cpu_light();
36192 +       vbq = this_cpu_ptr(&vmap_block_queue);
36193         spin_lock(&vbq->lock);
36194         list_add_tail_rcu(&vb->free_list, &vbq->free);
36195         spin_unlock(&vbq->lock);
36196 -       put_cpu_var(vmap_block_queue);
36197 +       put_cpu_light();
36198  
36199         return vaddr;
36200  }
36201 @@ -981,6 +982,7 @@
36202         struct vmap_block *vb;
36203         void *vaddr = NULL;
36204         unsigned int order;
36205 +       int cpu;
36206  
36207         BUG_ON(offset_in_page(size));
36208         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
36209 @@ -995,7 +997,8 @@
36210         order = get_order(size);
36211  
36212         rcu_read_lock();
36213 -       vbq = &get_cpu_var(vmap_block_queue);
36214 +       cpu = get_cpu_light();
36215 +       vbq = this_cpu_ptr(&vmap_block_queue);
36216         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
36217                 unsigned long pages_off;
36218  
36219 @@ -1018,7 +1021,7 @@
36220                 break;
36221         }
36222  
36223 -       put_cpu_var(vmap_block_queue);
36224 +       put_cpu_light();
36225         rcu_read_unlock();
36226  
36227         /* Allocate new block if nothing was found */
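
In the vmalloc.c hunks, get_cpu_var(vmap_block_queue) becomes get_cpu_light() plus this_cpu_ptr(): on PREEMPT_RT get_cpu_light() only disables migration rather than preemption, which is sufficient here because the per-CPU queue is additionally protected by its own spinlock (vbq->lock). A sketch of the idiom, reusing the structure names from the hunk purely for illustration and assuming the get_cpu_light()/put_cpu_light() helpers added by this patch:

        /* Sketch; struct vmap_block_queue/vmap_block are file-local in mm/vmalloc.c. */
        static DEFINE_PER_CPU(struct vmap_block_queue, my_vbq);

        static void my_add_block(struct vmap_block *vb)
        {
                struct vmap_block_queue *vbq;

                get_cpu_light();                /* migrate_disable() on RT, get_cpu() otherwise */
                vbq = this_cpu_ptr(&my_vbq);
                spin_lock(&vbq->lock);          /* the spinlock, not preemption, protects the list */
                list_add_tail_rcu(&vb->free_list, &vbq->free);
                spin_unlock(&vbq->lock);
                put_cpu_light();
        }
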
36228 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/vmstat.c linux-4.14/mm/vmstat.c
36229 --- linux-4.14.orig/mm/vmstat.c 2017-11-12 19:46:13.000000000 +0100
36230 +++ linux-4.14/mm/vmstat.c      2018-09-05 11:05:07.000000000 +0200
36231 @@ -249,6 +249,7 @@
36232         long x;
36233         long t;
36234  
36235 +       preempt_disable_rt();
36236         x = delta + __this_cpu_read(*p);
36237  
36238         t = __this_cpu_read(pcp->stat_threshold);
36239 @@ -258,6 +259,7 @@
36240                 x = 0;
36241         }
36242         __this_cpu_write(*p, x);
36243 +       preempt_enable_rt();
36244  }
36245  EXPORT_SYMBOL(__mod_zone_page_state);
36246  
36247 @@ -269,6 +271,7 @@
36248         long x;
36249         long t;
36250  
36251 +       preempt_disable_rt();
36252         x = delta + __this_cpu_read(*p);
36253  
36254         t = __this_cpu_read(pcp->stat_threshold);
36255 @@ -278,6 +281,7 @@
36256                 x = 0;
36257         }
36258         __this_cpu_write(*p, x);
36259 +       preempt_enable_rt();
36260  }
36261  EXPORT_SYMBOL(__mod_node_page_state);
36262  
36263 @@ -310,6 +314,7 @@
36264         s8 __percpu *p = pcp->vm_stat_diff + item;
36265         s8 v, t;
36266  
36267 +       preempt_disable_rt();
36268         v = __this_cpu_inc_return(*p);
36269         t = __this_cpu_read(pcp->stat_threshold);
36270         if (unlikely(v > t)) {
36271 @@ -318,6 +323,7 @@
36272                 zone_page_state_add(v + overstep, zone, item);
36273                 __this_cpu_write(*p, -overstep);
36274         }
36275 +       preempt_enable_rt();
36276  }
36277  
36278  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36279 @@ -326,6 +332,7 @@
36280         s8 __percpu *p = pcp->vm_node_stat_diff + item;
36281         s8 v, t;
36282  
36283 +       preempt_disable_rt();
36284         v = __this_cpu_inc_return(*p);
36285         t = __this_cpu_read(pcp->stat_threshold);
36286         if (unlikely(v > t)) {
36287 @@ -334,6 +341,7 @@
36288                 node_page_state_add(v + overstep, pgdat, item);
36289                 __this_cpu_write(*p, -overstep);
36290         }
36291 +       preempt_enable_rt();
36292  }
36293  
36294  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
36295 @@ -354,6 +362,7 @@
36296         s8 __percpu *p = pcp->vm_stat_diff + item;
36297         s8 v, t;
36298  
36299 +       preempt_disable_rt();
36300         v = __this_cpu_dec_return(*p);
36301         t = __this_cpu_read(pcp->stat_threshold);
36302         if (unlikely(v < - t)) {
36303 @@ -362,6 +371,7 @@
36304                 zone_page_state_add(v - overstep, zone, item);
36305                 __this_cpu_write(*p, overstep);
36306         }
36307 +       preempt_enable_rt();
36308  }
36309  
36310  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
36311 @@ -370,6 +380,7 @@
36312         s8 __percpu *p = pcp->vm_node_stat_diff + item;
36313         s8 v, t;
36314  
36315 +       preempt_disable_rt();
36316         v = __this_cpu_dec_return(*p);
36317         t = __this_cpu_read(pcp->stat_threshold);
36318         if (unlikely(v < - t)) {
36319 @@ -378,6 +389,7 @@
36320                 node_page_state_add(v - overstep, pgdat, item);
36321                 __this_cpu_write(*p, overstep);
36322         }
36323 +       preempt_enable_rt();
36324  }
36325  
36326  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
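
The vmstat counters rely on __this_cpu_read()/__this_cpu_write() pairs that are only safe if the task cannot be preempted between the read and the write. preempt_disable_rt()/preempt_enable_rt() expand to real preemption toggles only on PREEMPT_RT and to nothing otherwise, closing the window that RT's preemptible spinlocks would otherwise open. A hedged sketch of the same pattern on a made-up per-CPU counter:

        /* Illustration only: my_stat_diff and the threshold are hypothetical. */
        static DEFINE_PER_CPU(long, my_stat_diff);

        static void my_mod_stat(long delta)
        {
                long x;

                preempt_disable_rt();           /* no-op on !RT */
                x = delta + __this_cpu_read(my_stat_diff);
                if (unlikely(abs(x) > 125)) {
                        /* fold x into a global counter here, then reset the delta */
                        x = 0;
                }
                __this_cpu_write(my_stat_diff, x);
                preempt_enable_rt();
        }
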
36327 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/workingset.c linux-4.14/mm/workingset.c
36328 --- linux-4.14.orig/mm/workingset.c     2017-11-12 19:46:13.000000000 +0100
36329 +++ linux-4.14/mm/workingset.c  2018-09-05 11:05:07.000000000 +0200
36330 @@ -338,9 +338,10 @@
36331   * point where they would still be useful.
36332   */
36333  
36334 -static struct list_lru shadow_nodes;
36335 +static struct list_lru __shadow_nodes;
36336 +DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
36337  
36338 -void workingset_update_node(struct radix_tree_node *node, void *private)
36339 +void __workingset_update_node(struct radix_tree_node *node, void *private)
36340  {
36341         struct address_space *mapping = private;
36342  
36343 @@ -358,10 +359,10 @@
36344          */
36345         if (node->count && node->count == node->exceptional) {
36346                 if (list_empty(&node->private_list))
36347 -                       list_lru_add(&shadow_nodes, &node->private_list);
36348 +                       list_lru_add(&__shadow_nodes, &node->private_list);
36349         } else {
36350                 if (!list_empty(&node->private_list))
36351 -                       list_lru_del(&shadow_nodes, &node->private_list);
36352 +                       list_lru_del(&__shadow_nodes, &node->private_list);
36353         }
36354  }
36355  
36356 @@ -373,9 +374,9 @@
36357         unsigned long cache;
36358  
36359         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36360 -       local_irq_disable();
36361 -       nodes = list_lru_shrink_count(&shadow_nodes, sc);
36362 -       local_irq_enable();
36363 +       local_lock_irq(shadow_nodes_lock);
36364 +       nodes = list_lru_shrink_count(&__shadow_nodes, sc);
36365 +       local_unlock_irq(shadow_nodes_lock);
36366  
36367         /*
36368          * Approximate a reasonable limit for the radix tree nodes
36369 @@ -475,15 +476,15 @@
36370                 goto out_invalid;
36371         inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
36372         __radix_tree_delete_node(&mapping->page_tree, node,
36373 -                                workingset_update_node, mapping);
36374 +                                __workingset_update_node, mapping);
36375  
36376  out_invalid:
36377         spin_unlock(&mapping->tree_lock);
36378         ret = LRU_REMOVED_RETRY;
36379  out:
36380 -       local_irq_enable();
36381 +       local_unlock_irq(shadow_nodes_lock);
36382         cond_resched();
36383 -       local_irq_disable();
36384 +       local_lock_irq(shadow_nodes_lock);
36385         spin_lock(lru_lock);
36386         return ret;
36387  }
36388 @@ -494,9 +495,9 @@
36389         unsigned long ret;
36390  
36391         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
36392 -       local_irq_disable();
36393 -       ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
36394 -       local_irq_enable();
36395 +       local_lock_irq(shadow_nodes_lock);
36396 +       ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
36397 +       local_unlock_irq(shadow_nodes_lock);
36398         return ret;
36399  }
36400  
36401 @@ -534,7 +535,7 @@
36402         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
36403                timestamp_bits, max_order, bucket_order);
36404  
36405 -       ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
36406 +       ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
36407         if (ret)
36408                 goto err;
36409         ret = register_shrinker(&workingset_shadow_shrinker);
36410 @@ -542,7 +543,7 @@
36411                 goto err_list_lru;
36412         return 0;
36413  err_list_lru:
36414 -       list_lru_destroy(&shadow_nodes);
36415 +       list_lru_destroy(&__shadow_nodes);
36416  err:
36417         return ret;
36418  }
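
Together with the mm/truncate.c hunk further up, workingset.c replaces the bare local_irq_disable()/enable() around the shadow-node list_lru with a named local lock (shadow_nodes_lock) and renames the update callback, so call sites that already hold the lock pass the lock-free __workingset_update_node(). A caller-side sketch of the resulting locking rule, mirroring the truncate.c hunk (function name is hypothetical):

        /* Sketch: the caller takes shadow_nodes_lock itself and uses the
         * lock-free __workingset_update_node() callback. */
        static void my_clear_shadow_entry(struct address_space *mapping,
                                          struct radix_tree_node *node,
                                          void **slot)
        {
                local_lock(shadow_nodes_lock);
                __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
                                     __workingset_update_node, mapping);
                local_unlock(shadow_nodes_lock);
        }
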
36419 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/mm/zsmalloc.c linux-4.14/mm/zsmalloc.c
36420 --- linux-4.14.orig/mm/zsmalloc.c       2018-09-05 11:03:25.000000000 +0200
36421 +++ linux-4.14/mm/zsmalloc.c    2018-09-05 11:05:07.000000000 +0200
36422 @@ -53,6 +53,7 @@
36423  #include <linux/mount.h>
36424  #include <linux/migrate.h>
36425  #include <linux/pagemap.h>
36426 +#include <linux/locallock.h>
36427  
36428  #define ZSPAGE_MAGIC   0x58
36429  
36430 @@ -70,9 +71,22 @@
36431   */
36432  #define ZS_MAX_ZSPAGE_ORDER 2
36433  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
36434 -
36435  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
36436  
36437 +#ifdef CONFIG_PREEMPT_RT_FULL
36438 +
36439 +struct zsmalloc_handle {
36440 +       unsigned long addr;
36441 +       struct mutex lock;
36442 +};
36443 +
36444 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
36445 +
36446 +#else
36447 +
36448 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
36449 +#endif
36450 +
36451  /*
36452   * Object location (<PFN>, <obj_idx>) is encoded as
36453   * as single (unsigned long) handle value.
36454 @@ -320,7 +334,7 @@
36455  
36456  static int create_cache(struct zs_pool *pool)
36457  {
36458 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
36459 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
36460                                         0, 0, NULL);
36461         if (!pool->handle_cachep)
36462                 return 1;
36463 @@ -344,9 +358,26 @@
36464  
36465  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
36466  {
36467 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
36468 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36469 +       void *p;
36470 +
36471 +       p = kmem_cache_alloc(pool->handle_cachep,
36472 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
36473 +#ifdef CONFIG_PREEMPT_RT_FULL
36474 +       if (p) {
36475 +               struct zsmalloc_handle *zh = p;
36476 +
36477 +               mutex_init(&zh->lock);
36478 +       }
36479 +#endif
36480 +       return (unsigned long)p;
36481 +}
36482 +
36483 +#ifdef CONFIG_PREEMPT_RT_FULL
36484 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
36485 +{
36486 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
36487  }
36488 +#endif
36489  
36490  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
36491  {
36492 @@ -366,12 +397,18 @@
36493  
36494  static void record_obj(unsigned long handle, unsigned long obj)
36495  {
36496 +#ifdef CONFIG_PREEMPT_RT_FULL
36497 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36498 +
36499 +       WRITE_ONCE(zh->addr, obj);
36500 +#else
36501         /*
36502          * lsb of @obj represents handle lock while other bits
36503          * represent object value the handle is pointing so
36504          * updating shouldn't do store tearing.
36505          */
36506         WRITE_ONCE(*(unsigned long *)handle, obj);
36507 +#endif
36508  }
36509  
36510  /* zpool driver */
36511 @@ -460,6 +497,7 @@
36512  
36513  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
36514  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
36515 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
36516  
36517  static bool is_zspage_isolated(struct zspage *zspage)
36518  {
36519 @@ -898,7 +936,13 @@
36520  
36521  static unsigned long handle_to_obj(unsigned long handle)
36522  {
36523 +#ifdef CONFIG_PREEMPT_RT_FULL
36524 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36525 +
36526 +       return zh->addr;
36527 +#else
36528         return *(unsigned long *)handle;
36529 +#endif
36530  }
36531  
36532  static unsigned long obj_to_head(struct page *page, void *obj)
36533 @@ -912,22 +956,46 @@
36534  
36535  static inline int testpin_tag(unsigned long handle)
36536  {
36537 +#ifdef CONFIG_PREEMPT_RT_FULL
36538 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36539 +
36540 +       return mutex_is_locked(&zh->lock);
36541 +#else
36542         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
36543 +#endif
36544  }
36545  
36546  static inline int trypin_tag(unsigned long handle)
36547  {
36548 +#ifdef CONFIG_PREEMPT_RT_FULL
36549 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36550 +
36551 +       return mutex_trylock(&zh->lock);
36552 +#else
36553         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
36554 +#endif
36555  }
36556  
36557  static void pin_tag(unsigned long handle)
36558  {
36559 +#ifdef CONFIG_PREEMPT_RT_FULL
36560 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36561 +
36562 +       return mutex_lock(&zh->lock);
36563 +#else
36564         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
36565 +#endif
36566  }
36567  
36568  static void unpin_tag(unsigned long handle)
36569  {
36570 +#ifdef CONFIG_PREEMPT_RT_FULL
36571 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
36572 +
36573 +       return mutex_unlock(&zh->lock);
36574 +#else
36575         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
36576 +#endif
36577  }
36578  
36579  static void reset_page(struct page *page)
36580 @@ -1365,7 +1433,7 @@
36581         class = pool->size_class[class_idx];
36582         off = (class->size * obj_idx) & ~PAGE_MASK;
36583  
36584 -       area = &get_cpu_var(zs_map_area);
36585 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
36586         area->vm_mm = mm;
36587         if (off + class->size <= PAGE_SIZE) {
36588                 /* this object is contained entirely within a page */
36589 @@ -1419,7 +1487,7 @@
36590  
36591                 __zs_unmap_object(area, pages, off, class->size);
36592         }
36593 -       put_cpu_var(zs_map_area);
36594 +       put_locked_var(zs_map_area_lock, zs_map_area);
36595  
36596         migrate_read_unlock(zspage);
36597         unpin_tag(handle);
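
On PREEMPT_RT_FULL the zsmalloc handle is widened from a bare unsigned long to a small struct carrying a mutex, because the upstream bit spinlock on HANDLE_PIN_BIT cannot be taken by a sleeping-lock kernel. Pinning a handle then becomes a plain mutex operation. A condensed sketch of the two variants side by side, using only names introduced in the hunks above (not a drop-in replacement):

        #ifdef CONFIG_PREEMPT_RT_FULL
        static void my_pin_tag(unsigned long handle)
        {
                struct zsmalloc_handle *zh = zs_get_pure_handle(handle);

                mutex_lock(&zh->lock);          /* sleeping lock, RT-safe */
        }
        #else
        static void my_pin_tag(unsigned long handle)
        {
                bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
        }
        #endif
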
36598 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/9p/trans_xen.c linux-4.14/net/9p/trans_xen.c
36599 --- linux-4.14.orig/net/9p/trans_xen.c  2018-09-05 11:03:25.000000000 +0200
36600 +++ linux-4.14/net/9p/trans_xen.c       2018-09-05 11:05:07.000000000 +0200
36601 @@ -38,7 +38,6 @@
36602  
36603  #include <linux/module.h>
36604  #include <linux/spinlock.h>
36605 -#include <linux/rwlock.h>
36606  #include <net/9p/9p.h>
36607  #include <net/9p/client.h>
36608  #include <net/9p/transport.h>
36609 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/bluetooth/hci_sock.c linux-4.14/net/bluetooth/hci_sock.c
36610 --- linux-4.14.orig/net/bluetooth/hci_sock.c    2017-11-12 19:46:13.000000000 +0100
36611 +++ linux-4.14/net/bluetooth/hci_sock.c 2018-09-05 11:05:07.000000000 +0200
36612 @@ -251,15 +251,13 @@
36613  }
36614  
36615  /* Send frame to sockets with specific channel */
36616 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36617 -                        int flag, struct sock *skip_sk)
36618 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36619 +                                 int flag, struct sock *skip_sk)
36620  {
36621         struct sock *sk;
36622  
36623         BT_DBG("channel %u len %d", channel, skb->len);
36624  
36625 -       read_lock(&hci_sk_list.lock);
36626 -
36627         sk_for_each(sk, &hci_sk_list.head) {
36628                 struct sk_buff *nskb;
36629  
36630 @@ -285,6 +283,13 @@
36631                         kfree_skb(nskb);
36632         }
36633  
36634 +}
36635 +
36636 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
36637 +                        int flag, struct sock *skip_sk)
36638 +{
36639 +       read_lock(&hci_sk_list.lock);
36640 +       __hci_send_to_channel(channel, skb, flag, skip_sk);
36641         read_unlock(&hci_sk_list.lock);
36642  }
36643  
36644 @@ -388,8 +393,8 @@
36645                 hdr->index = index;
36646                 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
36647  
36648 -               hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36649 -                                   HCI_SOCK_TRUSTED, NULL);
36650 +               __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
36651 +                                     HCI_SOCK_TRUSTED, NULL);
36652                 kfree_skb(skb);
36653         }
36654  
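
hci_send_to_channel() is split into a lock-free __hci_send_to_channel() plus a thin wrapper that takes hci_sk_list.lock, apparently so the monitor path in the second hunk, which already runs under that read lock, does not take it recursively (recursive rwlock readers are a problem once RT turns rwlocks into rt-mutex-based locks). A generic sketch of the "__ variant assumes the lock is held" split, with placeholder names:

        /* Sketch of the pattern; my_list_lock and my_broadcast() are placeholders. */
        static DEFINE_RWLOCK(my_list_lock);

        static void __my_broadcast(struct sk_buff *skb)
        {
                /* walk the socket list; caller holds my_list_lock for reading */
        }

        static void my_broadcast(struct sk_buff *skb)
        {
                read_lock(&my_list_lock);
                __my_broadcast(skb);
                read_unlock(&my_list_lock);
        }
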
36655 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/can/bcm.c linux-4.14/net/can/bcm.c
36656 --- linux-4.14.orig/net/can/bcm.c       2017-11-12 19:46:13.000000000 +0100
36657 +++ linux-4.14/net/can/bcm.c    2018-09-05 11:05:07.000000000 +0200
36658 @@ -102,7 +102,6 @@
36659         unsigned long frames_abs, frames_filtered;
36660         struct bcm_timeval ival1, ival2;
36661         struct hrtimer timer, thrtimer;
36662 -       struct tasklet_struct tsklet, thrtsklet;
36663         ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
36664         int rx_ifindex;
36665         int cfsiz;
36666 @@ -364,25 +363,34 @@
36667         }
36668  }
36669  
36670 -static void bcm_tx_start_timer(struct bcm_op *op)
36671 +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
36672  {
36673 +       ktime_t ival;
36674 +
36675         if (op->kt_ival1 && op->count)
36676 -               hrtimer_start(&op->timer,
36677 -                             ktime_add(ktime_get(), op->kt_ival1),
36678 -                             HRTIMER_MODE_ABS);
36679 +               ival = op->kt_ival1;
36680         else if (op->kt_ival2)
36681 -               hrtimer_start(&op->timer,
36682 -                             ktime_add(ktime_get(), op->kt_ival2),
36683 -                             HRTIMER_MODE_ABS);
36684 +               ival = op->kt_ival2;
36685 +       else
36686 +               return false;
36687 +
36688 +       hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
36689 +       return true;
36690  }
36691  
36692 -static void bcm_tx_timeout_tsklet(unsigned long data)
36693 +static void bcm_tx_start_timer(struct bcm_op *op)
36694  {
36695 -       struct bcm_op *op = (struct bcm_op *)data;
36696 +       if (bcm_tx_set_expiry(op, &op->timer))
36697 +               hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
36698 +}
36699 +
36700 +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
36701 +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36702 +{
36703 +       struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36704         struct bcm_msg_head msg_head;
36705  
36706         if (op->kt_ival1 && (op->count > 0)) {
36707 -
36708                 op->count--;
36709                 if (!op->count && (op->flags & TX_COUNTEVT)) {
36710  
36711 @@ -399,22 +407,12 @@
36712                 }
36713                 bcm_can_tx(op);
36714  
36715 -       } else if (op->kt_ival2)
36716 +       } else if (op->kt_ival2) {
36717                 bcm_can_tx(op);
36718 +       }
36719  
36720 -       bcm_tx_start_timer(op);
36721 -}
36722 -
36723 -/*
36724 - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
36725 - */
36726 -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
36727 -{
36728 -       struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36729 -
36730 -       tasklet_schedule(&op->tsklet);
36731 -
36732 -       return HRTIMER_NORESTART;
36733 +       return bcm_tx_set_expiry(op, &op->timer) ?
36734 +               HRTIMER_RESTART : HRTIMER_NORESTART;
36735  }
36736  
36737  /*
36738 @@ -480,7 +478,7 @@
36739                 /* do not send the saved data - only start throttle timer */
36740                 hrtimer_start(&op->thrtimer,
36741                               ktime_add(op->kt_lastmsg, op->kt_ival2),
36742 -                             HRTIMER_MODE_ABS);
36743 +                             HRTIMER_MODE_ABS_SOFT);
36744                 return;
36745         }
36746  
36747 @@ -539,14 +537,21 @@
36748                 return;
36749  
36750         if (op->kt_ival1)
36751 -               hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
36752 +               hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
36753  }
36754  
36755 -static void bcm_rx_timeout_tsklet(unsigned long data)
36756 +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
36757 +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36758  {
36759 -       struct bcm_op *op = (struct bcm_op *)data;
36760 +       struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36761         struct bcm_msg_head msg_head;
36762  
36763 +       /* if user wants to be informed, when cyclic CAN-Messages come back */
36764 +       if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36765 +               /* clear received CAN frames to indicate 'nothing received' */
36766 +               memset(op->last_frames, 0, op->nframes * op->cfsiz);
36767 +       }
36768 +
36769         /* create notification to user */
36770         msg_head.opcode  = RX_TIMEOUT;
36771         msg_head.flags   = op->flags;
36772 @@ -557,25 +562,6 @@
36773         msg_head.nframes = 0;
36774  
36775         bcm_send_to_user(op, &msg_head, NULL, 0);
36776 -}
36777 -
36778 -/*
36779 - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
36780 - */
36781 -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
36782 -{
36783 -       struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
36784 -
36785 -       /* schedule before NET_RX_SOFTIRQ */
36786 -       tasklet_hi_schedule(&op->tsklet);
36787 -
36788 -       /* no restart of the timer is done here! */
36789 -
36790 -       /* if user wants to be informed, when cyclic CAN-Messages come back */
36791 -       if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
36792 -               /* clear received CAN frames to indicate 'nothing received' */
36793 -               memset(op->last_frames, 0, op->nframes * op->cfsiz);
36794 -       }
36795  
36796         return HRTIMER_NORESTART;
36797  }
36798 @@ -583,14 +569,12 @@
36799  /*
36800   * bcm_rx_do_flush - helper for bcm_rx_thr_flush
36801   */
36802 -static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
36803 -                                 unsigned int index)
36804 +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
36805  {
36806         struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
36807  
36808         if ((op->last_frames) && (lcf->flags & RX_THR)) {
36809 -               if (update)
36810 -                       bcm_rx_changed(op, lcf);
36811 +               bcm_rx_changed(op, lcf);
36812                 return 1;
36813         }
36814         return 0;
36815 @@ -598,11 +582,8 @@
36816  
36817  /*
36818   * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
36819 - *
36820 - * update == 0 : just check if throttled data is available  (any irq context)
36821 - * update == 1 : check and send throttled data to userspace (soft_irq context)
36822   */
36823 -static int bcm_rx_thr_flush(struct bcm_op *op, int update)
36824 +static int bcm_rx_thr_flush(struct bcm_op *op)
36825  {
36826         int updated = 0;
36827  
36828 @@ -611,24 +592,16 @@
36829  
36830                 /* for MUX filter we start at index 1 */
36831                 for (i = 1; i < op->nframes; i++)
36832 -                       updated += bcm_rx_do_flush(op, update, i);
36833 +                       updated += bcm_rx_do_flush(op, i);
36834  
36835         } else {
36836                 /* for RX_FILTER_ID and simple filter */
36837 -               updated += bcm_rx_do_flush(op, update, 0);
36838 +               updated += bcm_rx_do_flush(op, 0);
36839         }
36840  
36841         return updated;
36842  }
36843  
36844 -static void bcm_rx_thr_tsklet(unsigned long data)
36845 -{
36846 -       struct bcm_op *op = (struct bcm_op *)data;
36847 -
36848 -       /* push the changed data to the userspace */
36849 -       bcm_rx_thr_flush(op, 1);
36850 -}
36851 -
36852  /*
36853   * bcm_rx_thr_handler - the time for blocked content updates is over now:
36854   *                      Check for throttled data and send it to the userspace
36855 @@ -637,9 +610,7 @@
36856  {
36857         struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
36858  
36859 -       tasklet_schedule(&op->thrtsklet);
36860 -
36861 -       if (bcm_rx_thr_flush(op, 0)) {
36862 +       if (bcm_rx_thr_flush(op)) {
36863                 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
36864                 return HRTIMER_RESTART;
36865         } else {
36866 @@ -735,23 +706,8 @@
36867  
36868  static void bcm_remove_op(struct bcm_op *op)
36869  {
36870 -       if (op->tsklet.func) {
36871 -               while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
36872 -                      test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
36873 -                      hrtimer_active(&op->timer)) {
36874 -                       hrtimer_cancel(&op->timer);
36875 -                       tasklet_kill(&op->tsklet);
36876 -               }
36877 -       }
36878 -
36879 -       if (op->thrtsklet.func) {
36880 -               while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
36881 -                      test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
36882 -                      hrtimer_active(&op->thrtimer)) {
36883 -                       hrtimer_cancel(&op->thrtimer);
36884 -                       tasklet_kill(&op->thrtsklet);
36885 -               }
36886 -       }
36887 +       hrtimer_cancel(&op->timer);
36888 +       hrtimer_cancel(&op->thrtimer);
36889  
36890         if ((op->frames) && (op->frames != &op->sframe))
36891                 kfree(op->frames);
36892 @@ -979,15 +935,13 @@
36893                 op->ifindex = ifindex;
36894  
36895                 /* initialize uninitialized (kzalloc) structure */
36896 -               hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36897 +               hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36898 +                            HRTIMER_MODE_REL_SOFT);
36899                 op->timer.function = bcm_tx_timeout_handler;
36900  
36901 -               /* initialize tasklet for tx countevent notification */
36902 -               tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
36903 -                            (unsigned long) op);
36904 -
36905                 /* currently unused in tx_ops */
36906 -               hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36907 +               hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36908 +                            HRTIMER_MODE_REL_SOFT);
36909  
36910                 /* add this bcm_op to the list of the tx_ops */
36911                 list_add(&op->list, &bo->tx_ops);
36912 @@ -1150,20 +1104,14 @@
36913                 op->rx_ifindex = ifindex;
36914  
36915                 /* initialize uninitialized (kzalloc) structure */
36916 -               hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36917 +               hrtimer_init(&op->timer, CLOCK_MONOTONIC,
36918 +                            HRTIMER_MODE_REL_SOFT);
36919                 op->timer.function = bcm_rx_timeout_handler;
36920  
36921 -               /* initialize tasklet for rx timeout notification */
36922 -               tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
36923 -                            (unsigned long) op);
36924 -
36925 -               hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
36926 +               hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
36927 +                            HRTIMER_MODE_REL_SOFT);
36928                 op->thrtimer.function = bcm_rx_thr_handler;
36929  
36930 -               /* initialize tasklet for rx throttle handling */
36931 -               tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
36932 -                            (unsigned long) op);
36933 -
36934                 /* add this bcm_op to the list of the rx_ops */
36935                 list_add(&op->list, &bo->rx_ops);
36936  
36937 @@ -1209,12 +1157,12 @@
36938                          */
36939                         op->kt_lastmsg = 0;
36940                         hrtimer_cancel(&op->thrtimer);
36941 -                       bcm_rx_thr_flush(op, 1);
36942 +                       bcm_rx_thr_flush(op);
36943                 }
36944  
36945                 if ((op->flags & STARTTIMER) && op->kt_ival1)
36946                         hrtimer_start(&op->timer, op->kt_ival1,
36947 -                                     HRTIMER_MODE_REL);
36948 +                                     HRTIMER_MODE_REL_SOFT);
36949         }
36950  
36951         /* now we can register for can_ids, if we added a new bcm_op */
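
The CAN broadcast manager drops its tasklets entirely: the hrtimers are switched to the _SOFT modes, so the handlers already run in softirq context and can call bcm_can_tx()/bcm_send_to_user() directly, and periodic behaviour is expressed by re-arming from the handler (hrtimer_set_expires() or hrtimer_forward() plus HRTIMER_RESTART) instead of bouncing through a tasklet. A minimal sketch of a self-rearming soft hrtimer; the period and payload are made up, and HRTIMER_MODE_REL_SOFT assumes the softirq-hrtimer support this series backports to 4.14:

        static struct hrtimer my_timer;
        static u64 my_period_ns = 100 * NSEC_PER_MSEC;  /* hypothetical period */

        static void my_do_work(void)
        {
                /* payload; runs in softirq context, no tasklet needed */
        }

        static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
        {
                my_do_work();
                hrtimer_forward_now(t, ns_to_ktime(my_period_ns));
                return HRTIMER_RESTART;
        }

        static void my_timer_setup(void)
        {
                hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
                my_timer.function = my_timer_fn;
                hrtimer_start(&my_timer, ns_to_ktime(my_period_ns),
                              HRTIMER_MODE_REL_SOFT);
        }
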
36952 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/dev.c linux-4.14/net/core/dev.c
36953 --- linux-4.14.orig/net/core/dev.c      2018-09-05 11:03:25.000000000 +0200
36954 +++ linux-4.14/net/core/dev.c   2018-09-05 11:05:07.000000000 +0200
36955 @@ -195,6 +195,7 @@
36956  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
36957  
36958  static seqcount_t devnet_rename_seq;
36959 +static DEFINE_MUTEX(devnet_rename_mutex);
36960  
36961  static inline void dev_base_seq_inc(struct net *net)
36962  {
36963 @@ -217,14 +218,14 @@
36964  static inline void rps_lock(struct softnet_data *sd)
36965  {
36966  #ifdef CONFIG_RPS
36967 -       spin_lock(&sd->input_pkt_queue.lock);
36968 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
36969  #endif
36970  }
36971  
36972  static inline void rps_unlock(struct softnet_data *sd)
36973  {
36974  #ifdef CONFIG_RPS
36975 -       spin_unlock(&sd->input_pkt_queue.lock);
36976 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
36977  #endif
36978  }
36979  
36980 @@ -920,7 +921,8 @@
36981         strcpy(name, dev->name);
36982         rcu_read_unlock();
36983         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
36984 -               cond_resched();
36985 +               mutex_lock(&devnet_rename_mutex);
36986 +               mutex_unlock(&devnet_rename_mutex);
36987                 goto retry;
36988         }
36989  
36990 @@ -1189,20 +1191,17 @@
36991         if (dev->flags & IFF_UP)
36992                 return -EBUSY;
36993  
36994 -       write_seqcount_begin(&devnet_rename_seq);
36995 +       mutex_lock(&devnet_rename_mutex);
36996 +       __raw_write_seqcount_begin(&devnet_rename_seq);
36997  
36998 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
36999 -               write_seqcount_end(&devnet_rename_seq);
37000 -               return 0;
37001 -       }
37002 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
37003 +               goto outunlock;
37004  
37005         memcpy(oldname, dev->name, IFNAMSIZ);
37006  
37007         err = dev_get_valid_name(net, dev, newname);
37008 -       if (err < 0) {
37009 -               write_seqcount_end(&devnet_rename_seq);
37010 -               return err;
37011 -       }
37012 +       if (err < 0)
37013 +               goto outunlock;
37014  
37015         if (oldname[0] && !strchr(oldname, '%'))
37016                 netdev_info(dev, "renamed from %s\n", oldname);
37017 @@ -1215,11 +1214,12 @@
37018         if (ret) {
37019                 memcpy(dev->name, oldname, IFNAMSIZ);
37020                 dev->name_assign_type = old_assign_type;
37021 -               write_seqcount_end(&devnet_rename_seq);
37022 -               return ret;
37023 +               err = ret;
37024 +               goto outunlock;
37025         }
37026  
37027 -       write_seqcount_end(&devnet_rename_seq);
37028 +       __raw_write_seqcount_end(&devnet_rename_seq);
37029 +       mutex_unlock(&devnet_rename_mutex);
37030  
37031         netdev_adjacent_rename_links(dev, oldname);
37032  
37033 @@ -1240,7 +1240,8 @@
37034                 /* err >= 0 after dev_alloc_name() or stores the first errno */
37035                 if (err >= 0) {
37036                         err = ret;
37037 -                       write_seqcount_begin(&devnet_rename_seq);
37038 +                       mutex_lock(&devnet_rename_mutex);
37039 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
37040                         memcpy(dev->name, oldname, IFNAMSIZ);
37041                         memcpy(oldname, newname, IFNAMSIZ);
37042                         dev->name_assign_type = old_assign_type;
37043 @@ -1253,6 +1254,11 @@
37044         }
37045  
37046         return err;
37047 +
37048 +outunlock:
37049 +       __raw_write_seqcount_end(&devnet_rename_seq);
37050 +       mutex_unlock(&devnet_rename_mutex);
37051 +       return err;
37052  }
37053  
37054  /**
37055 @@ -2438,6 +2444,7 @@
37056         sd->output_queue_tailp = &q->next_sched;
37057         raise_softirq_irqoff(NET_TX_SOFTIRQ);
37058         local_irq_restore(flags);
37059 +       preempt_check_resched_rt();
37060  }
37061  
37062  void __netif_schedule(struct Qdisc *q)
37063 @@ -2500,6 +2507,7 @@
37064         __this_cpu_write(softnet_data.completion_queue, skb);
37065         raise_softirq_irqoff(NET_TX_SOFTIRQ);
37066         local_irq_restore(flags);
37067 +       preempt_check_resched_rt();
37068  }
37069  EXPORT_SYMBOL(__dev_kfree_skb_irq);
37070  
37071 @@ -3175,7 +3183,11 @@
37072          * This permits qdisc->running owner to get the lock more
37073          * often and dequeue packets faster.
37074          */
37075 +#ifdef CONFIG_PREEMPT_RT_FULL
37076 +       contended = true;
37077 +#else
37078         contended = qdisc_is_running(q);
37079 +#endif
37080         if (unlikely(contended))
37081                 spin_lock(&q->busylock);
37082  
37083 @@ -3246,8 +3258,10 @@
37084  #define skb_update_prio(skb)
37085  #endif
37086  
37087 +#ifndef CONFIG_PREEMPT_RT_FULL
37088  DEFINE_PER_CPU(int, xmit_recursion);
37089  EXPORT_SYMBOL(xmit_recursion);
37090 +#endif
37091  
37092  /**
37093   *     dev_loopback_xmit - loop back @skb
37094 @@ -3487,9 +3501,12 @@
37095         if (dev->flags & IFF_UP) {
37096                 int cpu = smp_processor_id(); /* ok because BHs are off */
37097  
37098 +#ifdef CONFIG_PREEMPT_RT_FULL
37099 +               if (txq->xmit_lock_owner != current) {
37100 +#else
37101                 if (txq->xmit_lock_owner != cpu) {
37102 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
37103 -                                    XMIT_RECURSION_LIMIT))
37104 +#endif
37105 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
37106                                 goto recursion_alert;
37107  
37108                         skb = validate_xmit_skb(skb, dev);
37109 @@ -3499,9 +3516,9 @@
37110                         HARD_TX_LOCK(dev, txq, cpu);
37111  
37112                         if (!netif_xmit_stopped(txq)) {
37113 -                               __this_cpu_inc(xmit_recursion);
37114 +                               xmit_rec_inc();
37115                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
37116 -                               __this_cpu_dec(xmit_recursion);
37117 +                               xmit_rec_dec();
37118                                 if (dev_xmit_complete(rc)) {
37119                                         HARD_TX_UNLOCK(dev, txq);
37120                                         goto out;
37121 @@ -3882,6 +3899,7 @@
37122         rps_unlock(sd);
37123  
37124         local_irq_restore(flags);
37125 +       preempt_check_resched_rt();
37126  
37127         atomic_long_inc(&skb->dev->rx_dropped);
37128         kfree_skb(skb);
37129 @@ -4034,7 +4052,7 @@
37130                 struct rps_dev_flow voidflow, *rflow = &voidflow;
37131                 int cpu;
37132  
37133 -               preempt_disable();
37134 +               migrate_disable();
37135                 rcu_read_lock();
37136  
37137                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
37138 @@ -4044,14 +4062,14 @@
37139                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
37140  
37141                 rcu_read_unlock();
37142 -               preempt_enable();
37143 +               migrate_enable();
37144         } else
37145  #endif
37146         {
37147                 unsigned int qtail;
37148  
37149 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
37150 -               put_cpu();
37151 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
37152 +               put_cpu_light();
37153         }
37154         return ret;
37155  }
37156 @@ -4085,11 +4103,9 @@
37157  
37158         trace_netif_rx_ni_entry(skb);
37159  
37160 -       preempt_disable();
37161 +       local_bh_disable();
37162         err = netif_rx_internal(skb);
37163 -       if (local_softirq_pending())
37164 -               do_softirq();
37165 -       preempt_enable();
37166 +       local_bh_enable();
37167  
37168         return err;
37169  }
37170 @@ -4607,7 +4623,7 @@
37171         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
37172                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37173                         __skb_unlink(skb, &sd->input_pkt_queue);
37174 -                       kfree_skb(skb);
37175 +                       __skb_queue_tail(&sd->tofree_queue, skb);
37176                         input_queue_head_incr(sd);
37177                 }
37178         }
37179 @@ -4617,11 +4633,14 @@
37180         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
37181                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
37182                         __skb_unlink(skb, &sd->process_queue);
37183 -                       kfree_skb(skb);
37184 +                       __skb_queue_tail(&sd->tofree_queue, skb);
37185                         input_queue_head_incr(sd);
37186                 }
37187         }
37188 +       if (!skb_queue_empty(&sd->tofree_queue))
37189 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
37190         local_bh_enable();
37191 +
37192  }
37193  
37194  static void flush_all_backlogs(void)
37195 @@ -5131,12 +5150,14 @@
37196                 sd->rps_ipi_list = NULL;
37197  
37198                 local_irq_enable();
37199 +               preempt_check_resched_rt();
37200  
37201                 /* Send pending IPI's to kick RPS processing on remote cpus. */
37202                 net_rps_send_ipi(remsd);
37203         } else
37204  #endif
37205                 local_irq_enable();
37206 +       preempt_check_resched_rt();
37207  }
37208  
37209  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
37210 @@ -5166,7 +5187,9 @@
37211         while (again) {
37212                 struct sk_buff *skb;
37213  
37214 +               local_irq_disable();
37215                 while ((skb = __skb_dequeue(&sd->process_queue))) {
37216 +                       local_irq_enable();
37217                         rcu_read_lock();
37218                         __netif_receive_skb(skb);
37219                         rcu_read_unlock();
37220 @@ -5174,9 +5197,9 @@
37221                         if (++work >= quota)
37222                                 return work;
37223  
37224 +                       local_irq_disable();
37225                 }
37226  
37227 -               local_irq_disable();
37228                 rps_lock(sd);
37229                 if (skb_queue_empty(&sd->input_pkt_queue)) {
37230                         /*
37231 @@ -5214,6 +5237,7 @@
37232         local_irq_save(flags);
37233         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37234         local_irq_restore(flags);
37235 +       preempt_check_resched_rt();
37236  }
37237  EXPORT_SYMBOL(__napi_schedule);
37238  
37239 @@ -5250,6 +5274,7 @@
37240  }
37241  EXPORT_SYMBOL(napi_schedule_prep);
37242  
37243 +#ifndef CONFIG_PREEMPT_RT_FULL
37244  /**
37245   * __napi_schedule_irqoff - schedule for receive
37246   * @n: entry to schedule
37247 @@ -5261,6 +5286,7 @@
37248         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
37249  }
37250  EXPORT_SYMBOL(__napi_schedule_irqoff);
37251 +#endif
37252  
37253  bool napi_complete_done(struct napi_struct *n, int work_done)
37254  {
37255 @@ -5615,13 +5641,21 @@
37256         unsigned long time_limit = jiffies +
37257                 usecs_to_jiffies(netdev_budget_usecs);
37258         int budget = netdev_budget;
37259 +       struct sk_buff_head tofree_q;
37260 +       struct sk_buff *skb;
37261         LIST_HEAD(list);
37262         LIST_HEAD(repoll);
37263  
37264 +       __skb_queue_head_init(&tofree_q);
37265 +
37266         local_irq_disable();
37267 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
37268         list_splice_init(&sd->poll_list, &list);
37269         local_irq_enable();
37270  
37271 +       while ((skb = __skb_dequeue(&tofree_q)))
37272 +               kfree_skb(skb);
37273 +
37274         for (;;) {
37275                 struct napi_struct *n;
37276  
37277 @@ -5651,7 +5685,7 @@
37278         list_splice_tail(&repoll, &list);
37279         list_splice(&list, &sd->poll_list);
37280         if (!list_empty(&sd->poll_list))
37281 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
37282 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
37283  
37284         net_rps_action_and_irq_enable(sd);
37285  out:
37286 @@ -7478,7 +7512,7 @@
37287         /* Initialize queue lock */
37288         spin_lock_init(&queue->_xmit_lock);
37289         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
37290 -       queue->xmit_lock_owner = -1;
37291 +       netdev_queue_clear_owner(queue);
37292         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
37293         queue->dev = dev;
37294  #ifdef CONFIG_BQL
37295 @@ -8418,6 +8452,7 @@
37296  
37297         raise_softirq_irqoff(NET_TX_SOFTIRQ);
37298         local_irq_enable();
37299 +       preempt_check_resched_rt();
37300  
37301  #ifdef CONFIG_RPS
37302         remsd = oldsd->rps_ipi_list;
37303 @@ -8431,10 +8466,13 @@
37304                 netif_rx_ni(skb);
37305                 input_queue_head_incr(oldsd);
37306         }
37307 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
37308 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
37309                 netif_rx_ni(skb);
37310                 input_queue_head_incr(oldsd);
37311         }
37312 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
37313 +               kfree_skb(skb);
37314 +       }
37315  
37316         return 0;
37317  }
37318 @@ -8738,8 +8776,9 @@
37319  
37320                 INIT_WORK(flush, flush_backlog);
37321  
37322 -               skb_queue_head_init(&sd->input_pkt_queue);
37323 -               skb_queue_head_init(&sd->process_queue);
37324 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
37325 +               skb_queue_head_init_raw(&sd->process_queue);
37326 +               skb_queue_head_init_raw(&sd->tofree_queue);
37327                 INIT_LIST_HEAD(&sd->poll_list);
37328                 sd->output_queue_tailp = &sd->output_queue;
37329  #ifdef CONFIG_RPS
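
net/core/dev.c collects several RT conversions: the RPS locks become raw, the xmit_recursion counter moves behind xmit_rec_*() helpers, flushed skbs are parked on a tofree_queue, and dev_change_name()/netdev_get_name() serialize the devnet_rename_seq writer with a mutex so a reader that hits a retry sleeps on that mutex instead of spinning against a preempted writer. A sketch of that seqcount-plus-mutex pattern on a hypothetical name buffer (my_* names are placeholders):

        /* __raw_write_seqcount_begin() skips the preempt/lockdep annotations
         * because the external mutex already provides writer exclusion. */
        static seqcount_t my_seq = SEQCNT_ZERO(my_seq);
        static DEFINE_MUTEX(my_seq_mutex);
        static char my_name[16];

        static void my_set_name(const char *new)
        {
                mutex_lock(&my_seq_mutex);
                __raw_write_seqcount_begin(&my_seq);
                strlcpy(my_name, new, sizeof(my_name));
                __raw_write_seqcount_end(&my_seq);
                mutex_unlock(&my_seq_mutex);
        }

        static void my_get_name(char *buf)    /* buf must hold sizeof(my_name) bytes */
        {
                unsigned int seq;

        retry:
                seq = raw_seqcount_begin(&my_seq);
                strlcpy(buf, my_name, sizeof(my_name));
                if (read_seqcount_retry(&my_seq, seq)) {
                        /* writer active: block on the mutex instead of spinning */
                        mutex_lock(&my_seq_mutex);
                        mutex_unlock(&my_seq_mutex);
                        goto retry;
                }
        }
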
37330 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/filter.c linux-4.14/net/core/filter.c
37331 --- linux-4.14.orig/net/core/filter.c   2018-09-05 11:03:25.000000000 +0200
37332 +++ linux-4.14/net/core/filter.c        2018-09-05 11:05:07.000000000 +0200
37333 @@ -1696,7 +1696,7 @@
37334  {
37335         int ret;
37336  
37337 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
37338 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
37339                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
37340                 kfree_skb(skb);
37341                 return -ENETDOWN;
37342 @@ -1704,9 +1704,9 @@
37343  
37344         skb->dev = dev;
37345  
37346 -       __this_cpu_inc(xmit_recursion);
37347 +       xmit_rec_inc();
37348         ret = dev_queue_xmit(skb);
37349 -       __this_cpu_dec(xmit_recursion);
37350 +       xmit_rec_dec();
37351  
37352         return ret;
37353  }
37354 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_estimator.c linux-4.14/net/core/gen_estimator.c
37355 --- linux-4.14.orig/net/core/gen_estimator.c    2018-09-05 11:03:25.000000000 +0200
37356 +++ linux-4.14/net/core/gen_estimator.c 2018-09-05 11:05:07.000000000 +0200
37357 @@ -46,7 +46,7 @@
37358  struct net_rate_estimator {
37359         struct gnet_stats_basic_packed  *bstats;
37360         spinlock_t              *stats_lock;
37361 -       seqcount_t              *running;
37362 +       net_seqlock_t           *running;
37363         struct gnet_stats_basic_cpu __percpu *cpu_bstats;
37364         u8                      ewma_log;
37365         u8                      intvl_log; /* period : (250ms << intvl_log) */
37366 @@ -129,7 +129,7 @@
37367                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
37368                       struct net_rate_estimator __rcu **rate_est,
37369                       spinlock_t *stats_lock,
37370 -                     seqcount_t *running,
37371 +                     net_seqlock_t *running,
37372                       struct nlattr *opt)
37373  {
37374         struct gnet_estimator *parm = nla_data(opt);
37375 @@ -222,7 +222,7 @@
37376                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
37377                           struct net_rate_estimator __rcu **rate_est,
37378                           spinlock_t *stats_lock,
37379 -                         seqcount_t *running, struct nlattr *opt)
37380 +                         net_seqlock_t *running, struct nlattr *opt)
37381  {
37382         return gen_new_estimator(bstats, cpu_bstats, rate_est,
37383                                  stats_lock, running, opt);
37384 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/gen_stats.c linux-4.14/net/core/gen_stats.c
37385 --- linux-4.14.orig/net/core/gen_stats.c        2018-09-05 11:03:25.000000000 +0200
37386 +++ linux-4.14/net/core/gen_stats.c     2018-09-05 11:05:07.000000000 +0200
37387 @@ -142,7 +142,7 @@
37388  }
37389  
37390  void
37391 -__gnet_stats_copy_basic(const seqcount_t *running,
37392 +__gnet_stats_copy_basic(net_seqlock_t *running,
37393                         struct gnet_stats_basic_packed *bstats,
37394                         struct gnet_stats_basic_cpu __percpu *cpu,
37395                         struct gnet_stats_basic_packed *b)
37396 @@ -155,10 +155,10 @@
37397         }
37398         do {
37399                 if (running)
37400 -                       seq = read_seqcount_begin(running);
37401 +                       seq = net_seq_begin(running);
37402                 bstats->bytes = b->bytes;
37403                 bstats->packets = b->packets;
37404 -       } while (running && read_seqcount_retry(running, seq));
37405 +       } while (running && net_seq_retry(running, seq));
37406  }
37407  EXPORT_SYMBOL(__gnet_stats_copy_basic);
37408  
37409 @@ -176,7 +176,7 @@
37410   * if the room in the socket buffer was not sufficient.
37411   */
37412  int
37413 -gnet_stats_copy_basic(const seqcount_t *running,
37414 +gnet_stats_copy_basic(net_seqlock_t *running,
37415                       struct gnet_dump *d,
37416                       struct gnet_stats_basic_cpu __percpu *cpu,
37417                       struct gnet_stats_basic_packed *b)
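
gen_estimator.c and gen_stats.c switch the qdisc "running" counter from seqcount_t to net_seqlock_t with net_seq_begin()/net_seq_retry() accessors; the intent is that on PREEMPT_RT the type can be backed by a real seqlock so readers block rather than spin on a preempted writer, while on !RT it stays a plain seqcount. A read-side sketch using the accessor names from the hunks (the net_seqlock_t definition itself lives in a header added by this patch):

        static void my_copy_basic(net_seqlock_t *running,
                                  struct gnet_stats_basic_packed *dst,
                                  const struct gnet_stats_basic_packed *src)
        {
                unsigned int seq;

                do {
                        seq = net_seq_begin(running);
                        dst->bytes   = src->bytes;
                        dst->packets = src->packets;
                } while (net_seq_retry(running, seq));
        }
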
37418 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/pktgen.c linux-4.14/net/core/pktgen.c
37419 --- linux-4.14.orig/net/core/pktgen.c   2017-11-12 19:46:13.000000000 +0100
37420 +++ linux-4.14/net/core/pktgen.c        2018-09-05 11:05:07.000000000 +0200
37421 @@ -2252,7 +2252,8 @@
37422         s64 remaining;
37423         struct hrtimer_sleeper t;
37424  
37425 -       hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
37426 +       hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
37427 +                                     current);
37428         hrtimer_set_expires(&t.timer, spin_until);
37429  
37430         remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
37431 @@ -2267,7 +2268,6 @@
37432                 } while (ktime_compare(end_time, spin_until) < 0);
37433         } else {
37434                 /* see do_nanosleep */
37435 -               hrtimer_init_sleeper(&t, current);
37436                 do {
37437                         set_current_state(TASK_INTERRUPTIBLE);
37438                         hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
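
pktgen's spin() now uses hrtimer_init_sleeper_on_stack(), which this series introduces to combine hrtimer_init_on_stack() with attaching the sleeping task up front, making the later hrtimer_init_sleeper() call redundant. A sketch of a bounded absolute-deadline sleep with the new helper; the signature is taken from the hunk above and my_sleep_until() is hypothetical:

        static void my_sleep_until(ktime_t deadline)
        {
                struct hrtimer_sleeper t;

                hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
                                              current);
                hrtimer_set_expires(&t.timer, deadline);

                do {
                        set_current_state(TASK_INTERRUPTIBLE);
                        hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
                        if (likely(t.task))
                                schedule();     /* woken by the sleeper callback */
                        hrtimer_cancel(&t.timer);
                } while (t.task && !signal_pending(current));

                __set_current_state(TASK_RUNNING);
                destroy_hrtimer_on_stack(&t.timer);
        }
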
37439 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/skbuff.c linux-4.14/net/core/skbuff.c
37440 --- linux-4.14.orig/net/core/skbuff.c   2018-09-05 11:03:25.000000000 +0200
37441 +++ linux-4.14/net/core/skbuff.c        2018-09-05 11:05:07.000000000 +0200
37442 @@ -63,6 +63,7 @@
37443  #include <linux/errqueue.h>
37444  #include <linux/prefetch.h>
37445  #include <linux/if_vlan.h>
37446 +#include <linux/locallock.h>
37447  
37448  #include <net/protocol.h>
37449  #include <net/dst.h>
37450 @@ -330,6 +331,8 @@
37451  
37452  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
37453  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
37454 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
37455 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
37456  
37457  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37458  {
37459 @@ -337,10 +340,10 @@
37460         unsigned long flags;
37461         void *data;
37462  
37463 -       local_irq_save(flags);
37464 +       local_lock_irqsave(netdev_alloc_lock, flags);
37465         nc = this_cpu_ptr(&netdev_alloc_cache);
37466         data = page_frag_alloc(nc, fragsz, gfp_mask);
37467 -       local_irq_restore(flags);
37468 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
37469         return data;
37470  }
37471  
37472 @@ -359,9 +362,13 @@
37473  
37474  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
37475  {
37476 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37477 +       struct napi_alloc_cache *nc;
37478 +       void *data;
37479  
37480 -       return page_frag_alloc(&nc->page, fragsz, gfp_mask);
37481 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37482 +       data =  page_frag_alloc(&nc->page, fragsz, gfp_mask);
37483 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37484 +       return data;
37485  }
37486  
37487  void *napi_alloc_frag(unsigned int fragsz)
37488 @@ -408,13 +415,13 @@
37489         if (sk_memalloc_socks())
37490                 gfp_mask |= __GFP_MEMALLOC;
37491  
37492 -       local_irq_save(flags);
37493 +       local_lock_irqsave(netdev_alloc_lock, flags);
37494  
37495         nc = this_cpu_ptr(&netdev_alloc_cache);
37496         data = page_frag_alloc(nc, len, gfp_mask);
37497         pfmemalloc = nc->pfmemalloc;
37498  
37499 -       local_irq_restore(flags);
37500 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
37501  
37502         if (unlikely(!data))
37503                 return NULL;
37504 @@ -455,9 +462,10 @@
37505  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
37506                                  gfp_t gfp_mask)
37507  {
37508 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37509 +       struct napi_alloc_cache *nc;
37510         struct sk_buff *skb;
37511         void *data;
37512 +       bool pfmemalloc;
37513  
37514         len += NET_SKB_PAD + NET_IP_ALIGN;
37515  
37516 @@ -475,7 +483,10 @@
37517         if (sk_memalloc_socks())
37518                 gfp_mask |= __GFP_MEMALLOC;
37519  
37520 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37521         data = page_frag_alloc(&nc->page, len, gfp_mask);
37522 +       pfmemalloc = nc->page.pfmemalloc;
37523 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37524         if (unlikely(!data))
37525                 return NULL;
37526  
37527 @@ -486,7 +497,7 @@
37528         }
37529  
37530         /* use OR instead of assignment to avoid clearing of bits in mask */
37531 -       if (nc->page.pfmemalloc)
37532 +       if (pfmemalloc)
37533                 skb->pfmemalloc = 1;
37534         skb->head_frag = 1;
37535  
37536 @@ -718,23 +729,26 @@
37537  
37538  void __kfree_skb_flush(void)
37539  {
37540 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37541 +       struct napi_alloc_cache *nc;
37542  
37543 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37544         /* flush skb_cache if containing objects */
37545         if (nc->skb_count) {
37546                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
37547                                      nc->skb_cache);
37548                 nc->skb_count = 0;
37549         }
37550 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37551  }
37552  
37553  static inline void _kfree_skb_defer(struct sk_buff *skb)
37554  {
37555 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
37556 +       struct napi_alloc_cache *nc;
37557  
37558         /* drop skb->head and call any destructors for packet */
37559         skb_release_all(skb);
37560  
37561 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37562         /* record skb to CPU local list */
37563         nc->skb_cache[nc->skb_count++] = skb;
37564  
37565 @@ -749,6 +763,7 @@
37566                                      nc->skb_cache);
37567                 nc->skb_count = 0;
37568         }
37569 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
37570  }
37571  void __kfree_skb_defer(struct sk_buff *skb)
37572  {
37573 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/core/sock.c linux-4.14/net/core/sock.c
37574 --- linux-4.14.orig/net/core/sock.c     2018-09-05 11:03:25.000000000 +0200
37575 +++ linux-4.14/net/core/sock.c  2018-09-05 11:05:07.000000000 +0200
37576 @@ -2757,12 +2757,11 @@
37577         if (sk->sk_lock.owned)
37578                 __lock_sock(sk);
37579         sk->sk_lock.owned = 1;
37580 -       spin_unlock(&sk->sk_lock.slock);
37581 +       spin_unlock_bh(&sk->sk_lock.slock);
37582         /*
37583          * The sk_lock has mutex_lock() semantics here:
37584          */
37585         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
37586 -       local_bh_enable();
37587  }
37588  EXPORT_SYMBOL(lock_sock_nested);
37589  
37590 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/icmp.c linux-4.14/net/ipv4/icmp.c
37591 --- linux-4.14.orig/net/ipv4/icmp.c     2018-09-05 11:03:25.000000000 +0200
37592 +++ linux-4.14/net/ipv4/icmp.c  2018-09-05 11:05:07.000000000 +0200
37593 @@ -77,6 +77,7 @@
37594  #include <linux/string.h>
37595  #include <linux/netfilter_ipv4.h>
37596  #include <linux/slab.h>
37597 +#include <linux/locallock.h>
37598  #include <net/snmp.h>
37599  #include <net/ip.h>
37600  #include <net/route.h>
37601 @@ -204,6 +205,8 @@
37602   *
37603   *     On SMP we have one ICMP socket per-cpu.
37604   */
37605 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
37606 +
37607  static struct sock *icmp_sk(struct net *net)
37608  {
37609         return *this_cpu_ptr(net->ipv4.icmp_sk);
37610 @@ -214,12 +217,16 @@
37611  {
37612         struct sock *sk;
37613  
37614 +       if (!local_trylock(icmp_sk_lock))
37615 +               return NULL;
37616 +
37617         sk = icmp_sk(net);
37618  
37619         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
37620                 /* This can happen if the output path signals a
37621                  * dst_link_failure() for an outgoing ICMP packet.
37622                  */
37623 +               local_unlock(icmp_sk_lock);
37624                 return NULL;
37625         }
37626         return sk;
37627 @@ -228,6 +235,7 @@
37628  static inline void icmp_xmit_unlock(struct sock *sk)
37629  {
37630         spin_unlock(&sk->sk_lock.slock);
37631 +       local_unlock(icmp_sk_lock);
37632  }
37633  
37634  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
37635 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/ipv4/tcp_ipv4.c linux-4.14/net/ipv4/tcp_ipv4.c
37636 --- linux-4.14.orig/net/ipv4/tcp_ipv4.c 2018-09-05 11:03:25.000000000 +0200
37637 +++ linux-4.14/net/ipv4/tcp_ipv4.c      2018-09-05 11:05:07.000000000 +0200
37638 @@ -62,6 +62,7 @@
37639  #include <linux/init.h>
37640  #include <linux/times.h>
37641  #include <linux/slab.h>
37642 +#include <linux/locallock.h>
37643  
37644  #include <net/net_namespace.h>
37645  #include <net/icmp.h>
37646 @@ -580,6 +581,7 @@
37647  }
37648  EXPORT_SYMBOL(tcp_v4_send_check);
37649  
37650 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
37651  /*
37652   *     This routine will send an RST to the other tcp.
37653   *
37654 @@ -710,6 +712,7 @@
37655         arg.tos = ip_hdr(skb)->tos;
37656         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
37657         local_bh_disable();
37658 +       local_lock(tcp_sk_lock);
37659         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37660                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
37661                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37662 @@ -717,6 +720,7 @@
37663  
37664         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
37665         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
37666 +       local_unlock(tcp_sk_lock);
37667         local_bh_enable();
37668  
37669  #ifdef CONFIG_TCP_MD5SIG
37670 @@ -796,12 +800,14 @@
37671         arg.tos = tos;
37672         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
37673         local_bh_disable();
37674 +       local_lock(tcp_sk_lock);
37675         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
37676                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
37677                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
37678                               &arg, arg.iov[0].iov_len);
37679  
37680         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
37681 +       local_unlock(tcp_sk_lock);
37682         local_bh_enable();
37683  }
37684  
37685 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/Kconfig linux-4.14/net/Kconfig
37686 --- linux-4.14.orig/net/Kconfig 2017-11-12 19:46:13.000000000 +0100
37687 +++ linux-4.14/net/Kconfig      2018-09-05 11:05:07.000000000 +0200
37688 @@ -272,7 +272,7 @@
37689  
37690  config NET_RX_BUSY_POLL
37691         bool
37692 -       default y
37693 +       default y if !PREEMPT_RT_FULL
37694  
37695  config BQL
37696         bool
37697 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/mac80211/rx.c linux-4.14/net/mac80211/rx.c
37698 --- linux-4.14.orig/net/mac80211/rx.c   2018-09-05 11:03:25.000000000 +0200
37699 +++ linux-4.14/net/mac80211/rx.c        2018-09-05 11:05:07.000000000 +0200
37700 @@ -4252,7 +4252,7 @@
37701         struct ieee80211_supported_band *sband;
37702         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
37703  
37704 -       WARN_ON_ONCE(softirq_count() == 0);
37705 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
37706  
37707         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
37708                 goto drop;
37709 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/netfilter/core.c linux-4.14/net/netfilter/core.c
37710 --- linux-4.14.orig/net/netfilter/core.c        2017-11-12 19:46:13.000000000 +0100
37711 +++ linux-4.14/net/netfilter/core.c     2018-09-05 11:05:07.000000000 +0200
37712 @@ -21,6 +21,7 @@
37713  #include <linux/inetdevice.h>
37714  #include <linux/proc_fs.h>
37715  #include <linux/mutex.h>
37716 +#include <linux/locallock.h>
37717  #include <linux/mm.h>
37718  #include <linux/rcupdate.h>
37719  #include <net/net_namespace.h>
37720 @@ -28,6 +29,11 @@
37721  
37722  #include "nf_internals.h"
37723  
37724 +#ifdef CONFIG_PREEMPT_RT_BASE
37725 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
37726 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
37727 +#endif
37728 +
37729  static DEFINE_MUTEX(afinfo_mutex);
37730  
37731  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
37732 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/packet/af_packet.c linux-4.14/net/packet/af_packet.c
37733 --- linux-4.14.orig/net/packet/af_packet.c      2018-09-05 11:03:25.000000000 +0200
37734 +++ linux-4.14/net/packet/af_packet.c   2018-09-05 11:05:07.000000000 +0200
37735 @@ -63,6 +63,7 @@
37736  #include <linux/if_packet.h>
37737  #include <linux/wireless.h>
37738  #include <linux/kernel.h>
37739 +#include <linux/delay.h>
37740  #include <linux/kmod.h>
37741  #include <linux/slab.h>
37742  #include <linux/vmalloc.h>
37743 @@ -707,7 +708,7 @@
37744         if (BLOCK_NUM_PKTS(pbd)) {
37745                 while (atomic_read(&pkc->blk_fill_in_prog)) {
37746                         /* Waiting for skb_copy_bits to finish... */
37747 -                       cpu_relax();
37748 +                       cpu_chill();
37749                 }
37750         }
37751  
37752 @@ -969,7 +970,7 @@
37753                 if (!(status & TP_STATUS_BLK_TMO)) {
37754                         while (atomic_read(&pkc->blk_fill_in_prog)) {
37755                                 /* Waiting for skb_copy_bits to finish... */
37756 -                               cpu_relax();
37757 +                               cpu_chill();
37758                         }
37759                 }
37760                 prb_close_block(pkc, pbd, po, status);
37761 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rds/ib_rdma.c linux-4.14/net/rds/ib_rdma.c
37762 --- linux-4.14.orig/net/rds/ib_rdma.c   2017-11-12 19:46:13.000000000 +0100
37763 +++ linux-4.14/net/rds/ib_rdma.c        2018-09-05 11:05:07.000000000 +0200
37764 @@ -34,6 +34,7 @@
37765  #include <linux/slab.h>
37766  #include <linux/rculist.h>
37767  #include <linux/llist.h>
37768 +#include <linux/delay.h>
37769  
37770  #include "rds_single_path.h"
37771  #include "ib_mr.h"
37772 @@ -210,7 +211,7 @@
37773         for_each_online_cpu(cpu) {
37774                 flag = &per_cpu(clean_list_grace, cpu);
37775                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
37776 -                       cpu_relax();
37777 +                       cpu_chill();
37778         }
37779  }
37780  
37781 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/rxrpc/security.c linux-4.14/net/rxrpc/security.c
37782 --- linux-4.14.orig/net/rxrpc/security.c        2017-11-12 19:46:13.000000000 +0100
37783 +++ linux-4.14/net/rxrpc/security.c     2018-09-05 11:05:07.000000000 +0200
37784 @@ -19,9 +19,6 @@
37785  #include <keys/rxrpc-type.h>
37786  #include "ar-internal.h"
37787  
37788 -static LIST_HEAD(rxrpc_security_methods);
37789 -static DECLARE_RWSEM(rxrpc_security_sem);
37790 -
37791  static const struct rxrpc_security *rxrpc_security_types[] = {
37792         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
37793  #ifdef CONFIG_RXKAD
37794 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_api.c linux-4.14/net/sched/sch_api.c
37795 --- linux-4.14.orig/net/sched/sch_api.c 2017-11-12 19:46:13.000000000 +0100
37796 +++ linux-4.14/net/sched/sch_api.c      2018-09-05 11:05:07.000000000 +0200
37797 @@ -1081,7 +1081,7 @@
37798                         rcu_assign_pointer(sch->stab, stab);
37799                 }
37800                 if (tca[TCA_RATE]) {
37801 -                       seqcount_t *running;
37802 +                       net_seqlock_t *running;
37803  
37804                         err = -EOPNOTSUPP;
37805                         if (sch->flags & TCQ_F_MQROOT)
37806 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sched/sch_generic.c linux-4.14/net/sched/sch_generic.c
37807 --- linux-4.14.orig/net/sched/sch_generic.c     2018-09-05 11:03:25.000000000 +0200
37808 +++ linux-4.14/net/sched/sch_generic.c  2018-09-05 11:05:07.000000000 +0200
37809 @@ -429,7 +429,11 @@
37810         .ops            =       &noop_qdisc_ops,
37811         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
37812         .dev_queue      =       &noop_netdev_queue,
37813 +#ifdef CONFIG_PREEMPT_RT_BASE
37814 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
37815 +#else
37816         .running        =       SEQCNT_ZERO(noop_qdisc.running),
37817 +#endif
37818         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
37819  };
37820  EXPORT_SYMBOL(noop_qdisc);
37821 @@ -628,9 +632,17 @@
37822         lockdep_set_class(&sch->busylock,
37823                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
37824  
37825 +#ifdef CONFIG_PREEMPT_RT_BASE
37826 +       seqlock_init(&sch->running);
37827 +       lockdep_set_class(&sch->running.seqcount,
37828 +                         dev->qdisc_running_key ?: &qdisc_running_key);
37829 +       lockdep_set_class(&sch->running.lock,
37830 +                         dev->qdisc_running_key ?: &qdisc_running_key);
37831 +#else
37832         seqcount_init(&sch->running);
37833         lockdep_set_class(&sch->running,
37834                           dev->qdisc_running_key ?: &qdisc_running_key);
37835 +#endif
37836  
37837         sch->ops = ops;
37838         sch->enqueue = ops->enqueue;
37839 @@ -933,7 +945,7 @@
37840         /* Wait for outstanding qdisc_run calls. */
37841         list_for_each_entry(dev, head, close_list) {
37842                 while (some_qdisc_is_busy(dev))
37843 -                       yield();
37844 +                       msleep(1);
37845                 /* The new qdisc is assigned at this point so we can safely
37846                  * unwind stale skb lists and qdisc statistics
37847                  */
37848 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/sunrpc/svc_xprt.c linux-4.14/net/sunrpc/svc_xprt.c
37849 --- linux-4.14.orig/net/sunrpc/svc_xprt.c       2017-11-12 19:46:13.000000000 +0100
37850 +++ linux-4.14/net/sunrpc/svc_xprt.c    2018-09-05 11:05:07.000000000 +0200
37851 @@ -396,7 +396,7 @@
37852                 goto out;
37853         }
37854  
37855 -       cpu = get_cpu();
37856 +       cpu = get_cpu_light();
37857         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
37858  
37859         atomic_long_inc(&pool->sp_stats.packets);
37860 @@ -432,7 +432,7 @@
37861  
37862                 atomic_long_inc(&pool->sp_stats.threads_woken);
37863                 wake_up_process(rqstp->rq_task);
37864 -               put_cpu();
37865 +               put_cpu_light();
37866                 goto out;
37867         }
37868         rcu_read_unlock();
37869 @@ -453,7 +453,7 @@
37870                 goto redo_search;
37871         }
37872         rqstp = NULL;
37873 -       put_cpu();
37874 +       put_cpu_light();
37875  out:
37876         trace_svc_xprt_do_enqueue(xprt, rqstp);
37877  }
37878 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/net/xfrm/xfrm_state.c linux-4.14/net/xfrm/xfrm_state.c
37879 --- linux-4.14.orig/net/xfrm/xfrm_state.c       2018-09-05 11:03:25.000000000 +0200
37880 +++ linux-4.14/net/xfrm/xfrm_state.c    2018-09-05 11:05:07.000000000 +0200
37881 @@ -427,7 +427,7 @@
37882  
37883  static void xfrm_state_gc_destroy(struct xfrm_state *x)
37884  {
37885 -       tasklet_hrtimer_cancel(&x->mtimer);
37886 +       hrtimer_cancel(&x->mtimer);
37887         del_timer_sync(&x->rtimer);
37888         kfree(x->aead);
37889         kfree(x->aalg);
37890 @@ -472,8 +472,8 @@
37891  
37892  static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
37893  {
37894 -       struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
37895 -       struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
37896 +       struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
37897 +       enum hrtimer_restart ret = HRTIMER_NORESTART;
37898         unsigned long now = get_seconds();
37899         long next = LONG_MAX;
37900         int warn = 0;
37901 @@ -537,7 +537,8 @@
37902                 km_state_expired(x, 0, 0);
37903  resched:
37904         if (next != LONG_MAX) {
37905 -               tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
37906 +               hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
37907 +               ret = HRTIMER_RESTART;
37908         }
37909  
37910         goto out;
37911 @@ -554,7 +555,7 @@
37912  
37913  out:
37914         spin_unlock(&x->lock);
37915 -       return HRTIMER_NORESTART;
37916 +       return ret;
37917  }
37918  
37919  static void xfrm_replay_timer_handler(unsigned long data);
37920 @@ -573,8 +574,8 @@
37921                 INIT_HLIST_NODE(&x->bydst);
37922                 INIT_HLIST_NODE(&x->bysrc);
37923                 INIT_HLIST_NODE(&x->byspi);
37924 -               tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
37925 -                                       CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
37926 +               hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
37927 +               x->mtimer.function = xfrm_timer_handler;
37928                 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
37929                                 (unsigned long)x);
37930                 x->curlft.add_time = get_seconds();
37931 @@ -1031,7 +1032,9 @@
37932                                 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37933                         }
37934                         x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37935 -                       tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37936 +                       hrtimer_start(&x->mtimer,
37937 +                                     ktime_set(net->xfrm.sysctl_acq_expires, 0),
37938 +                                     HRTIMER_MODE_REL_SOFT);
37939                         net->xfrm.state_num++;
37940                         xfrm_hash_grow_check(net, x->bydst.next != NULL);
37941                         spin_unlock_bh(&net->xfrm.xfrm_state_lock);
37942 @@ -1142,7 +1145,7 @@
37943                 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
37944         }
37945  
37946 -       tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37947 +       hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
37948         if (x->replay_maxage)
37949                 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
37950  
37951 @@ -1246,7 +1249,9 @@
37952                 x->mark.m = m->m;
37953                 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
37954                 xfrm_state_hold(x);
37955 -               tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
37956 +               hrtimer_start(&x->mtimer,
37957 +                             ktime_set(net->xfrm.sysctl_acq_expires, 0),
37958 +                             HRTIMER_MODE_REL_SOFT);
37959                 list_add(&x->km.all, &net->xfrm.state_all);
37960                 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
37961                 h = xfrm_src_hash(net, daddr, saddr, family);
37962 @@ -1546,7 +1551,8 @@
37963                 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
37964                 x1->km.dying = 0;
37965  
37966 -               tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
37967 +               hrtimer_start(&x1->mtimer, ktime_set(1, 0),
37968 +                             HRTIMER_MODE_REL_SOFT);
37969                 if (x1->curlft.use_time)
37970                         xfrm_state_check_expire(x1);
37971  
37972 @@ -1570,7 +1576,7 @@
37973         if (x->curlft.bytes >= x->lft.hard_byte_limit ||
37974             x->curlft.packets >= x->lft.hard_packet_limit) {
37975                 x->km.state = XFRM_STATE_EXPIRED;
37976 -               tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
37977 +               hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
37978                 return -EINVAL;
37979         }
37980  
37981 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/samples/trace_events/trace-events-sample.c linux-4.14/samples/trace_events/trace-events-sample.c
37982 --- linux-4.14.orig/samples/trace_events/trace-events-sample.c  2017-11-12 19:46:13.000000000 +0100
37983 +++ linux-4.14/samples/trace_events/trace-events-sample.c       2018-09-05 11:05:07.000000000 +0200
37984 @@ -33,7 +33,7 @@
37985  
37986         /* Silly tracepoints */
37987         trace_foo_bar("hello", cnt, array, random_strings[len],
37988 -                     &current->cpus_allowed);
37989 +                     current->cpus_ptr);
37990  
37991         trace_foo_with_template_simple("HELLO", cnt);
37992  
37993 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/scripts/mkcompile_h linux-4.14/scripts/mkcompile_h
37994 --- linux-4.14.orig/scripts/mkcompile_h 2017-11-12 19:46:13.000000000 +0100
37995 +++ linux-4.14/scripts/mkcompile_h      2018-09-05 11:05:07.000000000 +0200
37996 @@ -5,7 +5,8 @@
37997  ARCH=$2
37998  SMP=$3
37999  PREEMPT=$4
38000 -CC=$5
38001 +RT=$5
38002 +CC=$6
38003  
38004  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
38005  
38006 @@ -58,6 +59,7 @@
38007  CONFIG_FLAGS=""
38008  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
38009  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
38010 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
38011  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
38012  
38013  # Truncate to maximum length
38014 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/include/path.h linux-4.14/security/apparmor/include/path.h
38015 --- linux-4.14.orig/security/apparmor/include/path.h    2017-11-12 19:46:13.000000000 +0100
38016 +++ linux-4.14/security/apparmor/include/path.h 2018-09-05 11:05:07.000000000 +0200
38017 @@ -39,9 +39,10 @@
38018  };
38019  
38020  #include <linux/percpu.h>
38021 -#include <linux/preempt.h>
38022 +#include <linux/locallock.h>
38023  
38024  DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
38025 +DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38026  
38027  #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
38028  #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
38029 @@ -55,12 +56,24 @@
38030  
38031  #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
38032  
38033 -#ifdef CONFIG_DEBUG_PREEMPT
38034 +#ifdef CONFIG_PREEMPT_RT_BASE
38035 +
38036 +static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
38037 +{
38038 +       struct local_irq_lock *lv;
38039 +
38040 +       lv = this_cpu_ptr(&aa_buffers_lock);
38041 +       WARN_ONCE(lv->owner != current,
38042 +                 "__get_buffer without aa_buffers_lock\n");
38043 +}
38044 +
38045 +#elif defined(CONFIG_DEBUG_PREEMPT)
38046  #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
38047  #else
38048  #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
38049  #endif
38050  
38051 +
38052  #define __get_buffer(N) ({                                     \
38053         struct aa_buffers *__cpu_var; \
38054         AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled");  \
38055 @@ -73,14 +86,14 @@
38056  
38057  #define get_buffers(X...)      \
38058  do {                           \
38059 -       preempt_disable();      \
38060 +       local_lock(aa_buffers_lock);    \
38061         __get_buffers(X);       \
38062  } while (0)
38063  
38064  #define put_buffers(X, Y...)   \
38065  do {                           \
38066         __put_buffers(X, Y);    \
38067 -       preempt_enable();       \
38068 +       local_unlock(aa_buffers_lock);  \
38069  } while (0)
38070  
38071  #endif /* __AA_PATH_H */
38072 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/security/apparmor/lsm.c linux-4.14/security/apparmor/lsm.c
38073 --- linux-4.14.orig/security/apparmor/lsm.c     2017-11-12 19:46:13.000000000 +0100
38074 +++ linux-4.14/security/apparmor/lsm.c  2018-09-05 11:05:07.000000000 +0200
38075 @@ -44,7 +44,7 @@
38076  int apparmor_initialized;
38077  
38078  DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
38079 -
38080 +DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
38081  
38082  /*
38083   * LSM hook functions
38084 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/core/pcm_native.c linux-4.14/sound/core/pcm_native.c
38085 --- linux-4.14.orig/sound/core/pcm_native.c     2018-09-05 11:03:25.000000000 +0200
38086 +++ linux-4.14/sound/core/pcm_native.c  2018-09-05 11:05:07.000000000 +0200
38087 @@ -148,7 +148,7 @@
38088  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
38089  {
38090         if (!substream->pcm->nonatomic)
38091 -               local_irq_disable();
38092 +               local_irq_disable_nort();
38093         snd_pcm_stream_lock(substream);
38094  }
38095  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
38096 @@ -163,7 +163,7 @@
38097  {
38098         snd_pcm_stream_unlock(substream);
38099         if (!substream->pcm->nonatomic)
38100 -               local_irq_enable();
38101 +               local_irq_enable_nort();
38102  }
38103  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
38104  
38105 @@ -171,7 +171,7 @@
38106  {
38107         unsigned long flags = 0;
38108         if (!substream->pcm->nonatomic)
38109 -               local_irq_save(flags);
38110 +               local_irq_save_nort(flags);
38111         snd_pcm_stream_lock(substream);
38112         return flags;
38113  }
38114 @@ -189,7 +189,7 @@
38115  {
38116         snd_pcm_stream_unlock(substream);
38117         if (!substream->pcm->nonatomic)
38118 -               local_irq_restore(flags);
38119 +               local_irq_restore_nort(flags);
38120  }
38121  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
38122  
38123 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/sound/drivers/dummy.c linux-4.14/sound/drivers/dummy.c
38124 --- linux-4.14.orig/sound/drivers/dummy.c       2017-11-12 19:46:13.000000000 +0100
38125 +++ linux-4.14/sound/drivers/dummy.c    2018-09-05 11:05:07.000000000 +0200
38126 @@ -376,17 +376,9 @@
38127         ktime_t period_time;
38128         atomic_t running;
38129         struct hrtimer timer;
38130 -       struct tasklet_struct tasklet;
38131         struct snd_pcm_substream *substream;
38132  };
38133  
38134 -static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
38135 -{
38136 -       struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
38137 -       if (atomic_read(&dpcm->running))
38138 -               snd_pcm_period_elapsed(dpcm->substream);
38139 -}
38140 -
38141  static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
38142  {
38143         struct dummy_hrtimer_pcm *dpcm;
38144 @@ -394,7 +386,14 @@
38145         dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
38146         if (!atomic_read(&dpcm->running))
38147                 return HRTIMER_NORESTART;
38148 -       tasklet_schedule(&dpcm->tasklet);
38149 +       /*
38150 +        * In cases of XRUN and draining, this calls .trigger to stop PCM
38151 +        * substream.
38152 +        */
38153 +       snd_pcm_period_elapsed(dpcm->substream);
38154 +       if (!atomic_read(&dpcm->running))
38155 +               return HRTIMER_NORESTART;
38156 +
38157         hrtimer_forward_now(timer, dpcm->period_time);
38158         return HRTIMER_RESTART;
38159  }
38160 @@ -404,7 +403,7 @@
38161         struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38162  
38163         dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
38164 -       hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
38165 +       hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
38166         atomic_set(&dpcm->running, 1);
38167         return 0;
38168  }
38169 @@ -414,14 +413,14 @@
38170         struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
38171  
38172         atomic_set(&dpcm->running, 0);
38173 -       hrtimer_cancel(&dpcm->timer);
38174 +       if (!hrtimer_callback_running(&dpcm->timer))
38175 +               hrtimer_cancel(&dpcm->timer);
38176         return 0;
38177  }
38178  
38179  static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
38180  {
38181         hrtimer_cancel(&dpcm->timer);
38182 -       tasklet_kill(&dpcm->tasklet);
38183  }
38184  
38185  static snd_pcm_uframes_t
38186 @@ -466,12 +465,10 @@
38187         if (!dpcm)
38188                 return -ENOMEM;
38189         substream->runtime->private_data = dpcm;
38190 -       hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
38191 +       hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
38192         dpcm->timer.function = dummy_hrtimer_callback;
38193         dpcm->substream = substream;
38194         atomic_set(&dpcm->running, 0);
38195 -       tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
38196 -                    (unsigned long)dpcm);
38197         return 0;
38198  }
38199  
38200 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions linux-4.14/tools/testing/selftests/ftrace/test.d/functions
38201 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/functions     2018-09-05 11:03:25.000000000 +0200
38202 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/functions  2018-09-05 11:05:07.000000000 +0200
38203 @@ -70,6 +70,13 @@
38204      echo 0 > events/enable
38205  }
38206  
38207 +clear_synthetic_events() { # reset all current synthetic events
38208 +    grep -v ^# synthetic_events |
38209 +    while read line; do
38210 +        echo "!$line" >> synthetic_events
38211 +    done
38212 +}
38213 +
38214  initialize_ftrace() { # Reset ftrace to initial-state
38215  # As the initial state, ftrace will be set to nop tracer,
38216  # no events, no triggers, no filters, no function filters,
38217 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
38218 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc 1970-01-01 01:00:00.000000000 +0100
38219 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc      2018-09-05 11:05:07.000000000 +0200
38220 @@ -0,0 +1,39 @@
38221 +#!/bin/sh
38222 +# description: event trigger - test extended error support
38223 +
38224 +
38225 +do_reset() {
38226 +    reset_trigger
38227 +    echo > set_event
38228 +    clear_trace
38229 +}
38230 +
38231 +fail() { #msg
38232 +    do_reset
38233 +    echo $1
38234 +    exit_fail
38235 +}
38236 +
38237 +if [ ! -f set_event ]; then
38238 +    echo "event tracing is not supported"
38239 +    exit_unsupported
38240 +fi
38241 +
38242 +if [ ! -f synthetic_events ]; then
38243 +    echo "synthetic event is not supported"
38244 +    exit_unsupported
38245 +fi
38246 +
38247 +reset_tracer
38248 +do_reset
38249 +
38250 +echo "Test extended error support"
38251 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38252 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger &>/dev/null
38253 +if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
38254 +    fail "Failed to generate extended error in histogram"
38255 +fi
38256 +
38257 +do_reset
38258 +
38259 +exit 0
38260 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
38261 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc 1970-01-01 01:00:00.000000000 +0100
38262 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc      2018-09-05 11:05:07.000000000 +0200
38263 @@ -0,0 +1,54 @@
38264 +#!/bin/sh
38265 +# description: event trigger - test field variable support
38266 +
38267 +do_reset() {
38268 +    reset_trigger
38269 +    echo > set_event
38270 +    clear_trace
38271 +}
38272 +
38273 +fail() { #msg
38274 +    do_reset
38275 +    echo $1
38276 +    exit_fail
38277 +}
38278 +
38279 +if [ ! -f set_event ]; then
38280 +    echo "event tracing is not supported"
38281 +    exit_unsupported
38282 +fi
38283 +
38284 +if [ ! -f synthetic_events ]; then
38285 +    echo "synthetic event is not supported"
38286 +    exit_unsupported
38287 +fi
38288 +
38289 +clear_synthetic_events
38290 +reset_tracer
38291 +do_reset
38292 +
38293 +echo "Test field variable support"
38294 +
38295 +echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
38296 +echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38297 +echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38298 +echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
38299 +
38300 +ping localhost -c 3
38301 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38302 +    fail "Failed to create inter-event histogram"
38303 +fi
38304 +
38305 +if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38306 +    fail "Failed to create histogram with field variable"
38307 +fi
38308 +
38309 +echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38310 +
38311 +if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
38312 +    fail "Failed to remove histogram with field variable"
38313 +fi
38314 +
38315 +do_reset
38316 +
38317 +exit 0
38318 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
38319 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc      1970-01-01 01:00:00.000000000 +0100
38320 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc   2018-09-05 11:05:07.000000000 +0200
38321 @@ -0,0 +1,58 @@
38322 +#!/bin/sh
38323 +# description: event trigger - test inter-event combined histogram trigger
38324 +
38325 +do_reset() {
38326 +    reset_trigger
38327 +    echo > set_event
38328 +    clear_trace
38329 +}
38330 +
38331 +fail() { #msg
38332 +    do_reset
38333 +    echo $1
38334 +    exit_fail
38335 +}
38336 +
38337 +if [ ! -f set_event ]; then
38338 +    echo "event tracing is not supported"
38339 +    exit_unsupported
38340 +fi
38341 +
38342 +if [ ! -f synthetic_events ]; then
38343 +    echo "synthetic event is not supported"
38344 +    exit_unsupported
38345 +fi
38346 +
38347 +reset_tracer
38348 +do_reset
38349 +clear_synthetic_events
38350 +
38351 +echo "Test create synthetic event"
38352 +
38353 +echo 'waking_latency  u64 lat pid_t pid' > synthetic_events
38354 +if [ ! -d events/synthetic/waking_latency ]; then
38355 +    fail "Failed to create waking_latency synthetic event"
38356 +fi
38357 +
38358 +echo "Test combined histogram"
38359 +
38360 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
38361 +echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
38362 +echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
38363 +
38364 +echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
38365 +echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
38366 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
38367 +
38368 +echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
38369 +echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
38370 +echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
38371 +
38372 +ping localhost -c 3
38373 +if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
38374 +    fail "Failed to create combined histogram"
38375 +fi
38376 +
38377 +do_reset
38378 +
38379 +exit 0
38380 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
38381 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc    1970-01-01 01:00:00.000000000 +0100
38382 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc 2018-09-05 11:05:07.000000000 +0200
38383 @@ -0,0 +1,50 @@
38384 +#!/bin/sh
38385 +# description: event trigger - test inter-event histogram trigger onmatch action
38386 +
38387 +do_reset() {
38388 +    reset_trigger
38389 +    echo > set_event
38390 +    clear_trace
38391 +}
38392 +
38393 +fail() { #msg
38394 +    do_reset
38395 +    echo $1
38396 +    exit_fail
38397 +}
38398 +
38399 +if [ ! -f set_event ]; then
38400 +    echo "event tracing is not supported"
38401 +    exit_unsupported
38402 +fi
38403 +
38404 +if [ ! -f synthetic_events ]; then
38405 +    echo "synthetic event is not supported"
38406 +    exit_unsupported
38407 +fi
38408 +
38409 +clear_synthetic_events
38410 +reset_tracer
38411 +do_reset
38412 +
38413 +echo "Test create synthetic event"
38414 +
38415 +echo 'wakeup_latency  u64 lat pid_t pid char comm[16]' > synthetic_events
38416 +if [ ! -d events/synthetic/wakeup_latency ]; then
38417 +    fail "Failed to create wakeup_latency synthetic event"
38418 +fi
38419 +
38420 +echo "Test create histogram for synthetic event"
38421 +echo "Test histogram variables,simple expression support and onmatch action"
38422 +
38423 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38424 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
38425 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38426 +ping localhost -c 5
38427 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
38428 +    fail "Failed to create onmatch action inter-event histogram"
38429 +fi
38430 +
38431 +do_reset
38432 +
38433 +exit 0
38434 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
38435 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc      1970-01-01 01:00:00.000000000 +0100
38436 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc   2018-09-05 11:05:07.000000000 +0200
38437 @@ -0,0 +1,50 @@
38438 +#!/bin/sh
38439 +# description: event trigger - test inter-event histogram trigger onmatch-onmax action
38440 +
38441 +do_reset() {
38442 +    reset_trigger
38443 +    echo > set_event
38444 +    clear_trace
38445 +}
38446 +
38447 +fail() { #msg
38448 +    do_reset
38449 +    echo $1
38450 +    exit_fail
38451 +}
38452 +
38453 +if [ ! -f set_event ]; then
38454 +    echo "event tracing is not supported"
38455 +    exit_unsupported
38456 +fi
38457 +
38458 +if [ ! -f synthetic_events ]; then
38459 +    echo "synthetic event is not supported"
38460 +    exit_unsupported
38461 +fi
38462 +
38463 +clear_synthetic_events
38464 +reset_tracer
38465 +do_reset
38466 +
38467 +echo "Test create synthetic event"
38468 +
38469 +echo 'wakeup_latency  u64 lat pid_t pid char comm[16]' > synthetic_events
38470 +if [ ! -d events/synthetic/wakeup_latency ]; then
38471 +    fail "Failed to create wakeup_latency synthetic event"
38472 +fi
38473 +
38474 +echo "Test create histogram for synthetic event"
38475 +echo "Test histogram variables,simple expression support and onmatch-onmax action"
38476 +
38477 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38478 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38479 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
38480 +ping localhost -c 5
38481 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
38482 +    fail "Failed to create onmatch-onmax action inter-event histogram"
38483 +fi
38484 +
38485 +do_reset
38486 +
38487 +exit 0
38488 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
38489 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc      1970-01-01 01:00:00.000000000 +0100
38490 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc   2018-09-05 11:05:07.000000000 +0200
38491 @@ -0,0 +1,48 @@
38492 +#!/bin/sh
38493 +# description: event trigger - test inter-event histogram trigger onmax action
38494 +
38495 +do_reset() {
38496 +    reset_trigger
38497 +    echo > set_event
38498 +    clear_trace
38499 +}
38500 +
38501 +fail() { #msg
38502 +    do_reset
38503 +    echo $1
38504 +    exit_fail
38505 +}
38506 +
38507 +if [ ! -f set_event ]; then
38508 +    echo "event tracing is not supported"
38509 +    exit_unsupported
38510 +fi
38511 +
38512 +if [ ! -f synthetic_events ]; then
38513 +    echo "synthetic event is not supported"
38514 +    exit_unsupported
38515 +fi
38516 +
38517 +clear_synthetic_events
38518 +reset_tracer
38519 +do_reset
38520 +
38521 +echo "Test create synthetic event"
38522 +
38523 +echo 'wakeup_latency  u64 lat pid_t pid char comm[16]' > synthetic_events
38524 +if [ ! -d events/synthetic/wakeup_latency ]; then
38525 +    fail "Failed to create wakeup_latency synthetic event"
38526 +fi
38527 +
38528 +echo "Test onmax action"
38529 +
38530 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
38531 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
38532 +ping localhost -c 3
38533 +if ! grep -q "max:" events/sched/sched_switch/hist; then
38534 +    fail "Failed to create onmax action inter-event histogram"
38535 +fi
38536 +
38537 +do_reset
38538 +
38539 +exit 0
38540 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
38541 --- linux-4.14.orig/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc   1970-01-01 01:00:00.000000000 +0100
38542 +++ linux-4.14/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc        2018-09-05 11:05:07.000000000 +0200
38543 @@ -0,0 +1,54 @@
38544 +#!/bin/sh
38545 +# description: event trigger - test synthetic event create remove
38546 +do_reset() {
38547 +    reset_trigger
38548 +    echo > set_event
38549 +    clear_trace
38550 +}
38551 +
38552 +fail() { #msg
38553 +    do_reset
38554 +    echo $1
38555 +    exit_fail
38556 +}
38557 +
38558 +if [ ! -f set_event ]; then
38559 +    echo "event tracing is not supported"
38560 +    exit_unsupported
38561 +fi
38562 +
38563 +if [ ! -f synthetic_events ]; then
38564 +    echo "synthetic event is not supported"
38565 +    exit_unsupported
38566 +fi
38567 +
38568 +clear_synthetic_events
38569 +reset_tracer
38570 +do_reset
38571 +
38572 +echo "Test create synthetic event"
38573 +
38574 +echo 'wakeup_latency  u64 lat pid_t pid char comm[16]' > synthetic_events
38575 +if [ ! -d events/synthetic/wakeup_latency ]; then
38576 +    fail "Failed to create wakeup_latency synthetic event"
38577 +fi
38578 +
38579 +reset_trigger
38580 +
38581 +echo "Test create synthetic event with an error"
38582 +echo 'wakeup_latency  u64 lat pid_t pid char' > synthetic_events > /dev/null
38583 +if [ -d events/synthetic/wakeup_latency ]; then
38584 +    fail "Created wakeup_latency synthetic event with an invalid format"
38585 +fi
38586 +
38587 +reset_trigger
38588 +
38589 +echo "Test remove synthetic event"
38590 +echo '!wakeup_latency  u64 lat pid_t pid char comm[16]' > synthetic_events
38591 +if [ -d events/synthetic/wakeup_latency ]; then
38592 +    fail "Failed to delete wakeup_latency synthetic event"
38593 +fi
38594 +
38595 +do_reset
38596 +
38597 +exit 0
38598 diff -durN -x '*~' -x '*.orig' linux-4.14.orig/virt/kvm/arm/arm.c linux-4.14/virt/kvm/arm/arm.c
38599 --- linux-4.14.orig/virt/kvm/arm/arm.c  2018-09-05 11:03:25.000000000 +0200
38600 +++ linux-4.14/virt/kvm/arm/arm.c       2018-09-05 11:05:07.000000000 +0200
38601 @@ -69,7 +69,6 @@
38602  
38603  static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
38604  {
38605 -       BUG_ON(preemptible());
38606         __this_cpu_write(kvm_arm_running_vcpu, vcpu);
38607  }
38608  
38609 @@ -79,7 +78,6 @@
38610   */
38611  struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
38612  {
38613 -       BUG_ON(preemptible());
38614         return __this_cpu_read(kvm_arm_running_vcpu);
38615  }
38616  
38617 @@ -653,7 +651,7 @@
38618                  * involves poking the GIC, which must be done in a
38619                  * non-preemptible context.
38620                  */
38621 -               preempt_disable();
38622 +               migrate_disable();
38623  
38624                 kvm_pmu_flush_hwstate(vcpu);
38625  
38626 @@ -690,7 +688,7 @@
38627                         kvm_pmu_sync_hwstate(vcpu);
38628                         kvm_timer_sync_hwstate(vcpu);
38629                         kvm_vgic_sync_hwstate(vcpu);
38630 -                       preempt_enable();
38631 +                       migrate_enable();
38632                         continue;
38633                 }
38634  
38635 @@ -745,7 +743,7 @@
38636  
38637                 kvm_vgic_sync_hwstate(vcpu);
38638  
38639 -               preempt_enable();
38640 +               migrate_enable();
38641  
38642                 ret = handle_exit(vcpu, run, ret);
38643         }