1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 -
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
13  
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
20 +
21 +
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
32 +
33 +
34 +This document gives a short explanation of how to enable, configure and
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
39 +
40 +
41 +* Purpose of latency histograms
42 +
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
47 +
48 +
49 +* Potential sources of latencies
50 +
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_LATENCY
60 +  CONFIG_PREEMPT_OFF_LATENCY
61 +
62 +
63 +* Effective latencies
64 +
65 +Effective latencies are those actually occurring during wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases when a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY
82 +  CONFIG_MISSED_TIMER_OFFSETS
83 +
84 +
85 +* Usage
86 +
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
89 +
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
92 +
93 +from shell command line level, or add
94 +
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
97 +
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies, etc. The
104 +histogram data - one per CPU - are available in the files
105 +
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
113 +
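+A minimal usage sketch (assuming the histogram options above were enabled
+at kernel build time; the paths follow the list above): switch on the
+wakeup histograms and read back the per-CPU data, e.g.
+
+echo 1 >/sys/kernel/debug/tracing/latency_hist/enable/wakeup
+cat /sys/kernel/debug/tracing/latency_hist/wakeup/CPU0
+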
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
116 +
117 +#!/bin/sh
118 +
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
121 +
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
130 +
131 +
132 +* Data format
133 +
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive latency count, use, for
139 +example,
140 +
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
142 +
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
174 +
175 +
176 +* Wakeup latency of a selected process
177 +
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
180 +
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
182 +
183 +PIDs are not considered if this variable is set to 0.
184 +
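+For example, to restrict the wakeup histograms to a single process (the
+PID 1234 below is just a placeholder) and to lift the restriction again:
+
+echo 1234 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid
+echo 0 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid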
185 +
186 +* Details of the process with the highest wakeup latency so far
187 +
188 +Selected data of the process that suffered the highest wakeup
189 +latency observed on a particular CPU are available in the file
190 +
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
192 +
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
195 +
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
199 +
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0; in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
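+For instance (numbers chosen purely for illustration): if a timer fired
+12 microseconds late and the subsequent wakeup latency was 5 microseconds,
+the timerandwakeup histogram accounts for their sum, i.e. 17 microseconds,
+while <Timeroffset> reflects the 12 microseconds of timer delay.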
204 +
205 +When retrospectively searching for the origin of a latency while
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) switched to the
208 +late real-time task. In addition to the victim's data, the
209 +data of the possible culprit are therefore also displayed after the
210 +"<-" symbol.
211 +
212 +Finally, the timestamp of when the latency occurred
213 +in <seconds>.<microseconds> after the most recent system boot
214 +is provided.
215 +
216 +These data are also reset when the wakeup histogram is reset.
217 diff --git a/arch/Kconfig b/arch/Kconfig
218 index 659bdd079277..099fc0f5155e 100644
219 --- a/arch/Kconfig
220 +++ b/arch/Kconfig
221 @@ -9,6 +9,7 @@ config OPROFILE
222         tristate "OProfile system profiling"
223         depends on PROFILING
224         depends on HAVE_OPROFILE
225 +       depends on !PREEMPT_RT_FULL
226         select RING_BUFFER
227         select RING_BUFFER_ALLOW_SWAP
228         help
229 @@ -52,6 +53,7 @@ config KPROBES
230  config JUMP_LABEL
231         bool "Optimize very unlikely/likely branches"
232         depends on HAVE_ARCH_JUMP_LABEL
233 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
234         help
235           This option enables a transparent branch optimization that
236          makes certain almost-always-true or almost-always-false branch
237 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
238 index b5d529fdffab..5715844e83e3 100644
239 --- a/arch/arm/Kconfig
240 +++ b/arch/arm/Kconfig
241 @@ -36,7 +36,7 @@ config ARM
242         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
243         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
244         select HAVE_ARCH_HARDENED_USERCOPY
245 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
246 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
247         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
248         select HAVE_ARCH_MMAP_RND_BITS if MMU
249         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
250 @@ -75,6 +75,7 @@ config ARM
251         select HAVE_PERF_EVENTS
252         select HAVE_PERF_REGS
253         select HAVE_PERF_USER_STACK_DUMP
254 +       select HAVE_PREEMPT_LAZY
255         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
256         select HAVE_REGS_AND_STACK_ACCESS_API
257         select HAVE_SYSCALL_TRACEPOINTS
258 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
259 index e53638c8ed8a..6095a1649865 100644
260 --- a/arch/arm/include/asm/irq.h
261 +++ b/arch/arm/include/asm/irq.h
262 @@ -22,6 +22,8 @@
263  #endif
264  
265  #ifndef __ASSEMBLY__
266 +#include <linux/cpumask.h>
267 +
268  struct irqaction;
269  struct pt_regs;
270  extern void migrate_irqs(void);
271 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
272 index 12ebfcc1d539..c962084605bc 100644
273 --- a/arch/arm/include/asm/switch_to.h
274 +++ b/arch/arm/include/asm/switch_to.h
275 @@ -3,6 +3,13 @@
276  
277  #include <linux/thread_info.h>
278  
279 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
280 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
281 +#else
282 +static inline void
283 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
284 +#endif
285 +
286  /*
287   * For v7 SMP cores running a preemptible kernel we may be pre-empted
288   * during a TLB maintenance operation, so execute an inner-shareable dsb
289 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
290  #define switch_to(prev,next,last)                                      \
291  do {                                                                   \
292         __complete_pending_tlbi();                                      \
293 +       switch_kmaps(prev, next);                                       \
294         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
295  } while (0)
296  
297 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
298 index 776757d1604a..1f36a4eccc72 100644
299 --- a/arch/arm/include/asm/thread_info.h
300 +++ b/arch/arm/include/asm/thread_info.h
301 @@ -49,6 +49,7 @@ struct cpu_context_save {
302  struct thread_info {
303         unsigned long           flags;          /* low level flags */
304         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
305 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
306         mm_segment_t            addr_limit;     /* address limit */
307         struct task_struct      *task;          /* main task structure */
308         __u32                   cpu;            /* cpu */
309 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
310  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
311  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
312  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
313 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
314 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
315 +#define TIF_NEED_RESCHED_LAZY  7
316  
317  #define TIF_NOHZ               12      /* in adaptive nohz mode */
318  #define TIF_USING_IWMMXT       17
319 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
320  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
321  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
322  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
323 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
324  #define _TIF_UPROBE            (1 << TIF_UPROBE)
325  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
326  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
327 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
328   * Change these and you break ASM code in entry-common.S
329   */
330  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
331 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
332 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
333 +                                _TIF_NEED_RESCHED_LAZY)
334  
335  #endif /* __KERNEL__ */
336  #endif /* __ASM_ARM_THREAD_INFO_H */
337 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
338 index 608008229c7d..3866da3f7bb7 100644
339 --- a/arch/arm/kernel/asm-offsets.c
340 +++ b/arch/arm/kernel/asm-offsets.c
341 @@ -65,6 +65,7 @@ int main(void)
342    BLANK();
343    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
344    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
345 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
346    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
347    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
348    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
349 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
350 index 9f157e7c51e7..468e224d76aa 100644
351 --- a/arch/arm/kernel/entry-armv.S
352 +++ b/arch/arm/kernel/entry-armv.S
353 @@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
354  
355  #ifdef CONFIG_PREEMPT
356         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
357 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
358         teq     r8, #0                          @ if preempt count != 0
359 +       bne     1f                              @ return from exception
360 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
361 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
362 +       blne    svc_preempt                     @ preempt!
363 +
364 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
365 +       teq     r8, #0                          @ if preempt lazy count != 0
366         movne   r0, #0                          @ force flags to 0
367 -       tst     r0, #_TIF_NEED_RESCHED
368 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
369         blne    svc_preempt
370 +1:
371  #endif
372  
373         svc_exit r5, irq = 1                    @ return from exception
374 @@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
375  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
376         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
377         tst     r0, #_TIF_NEED_RESCHED
378 +       bne     1b
379 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
380         reteq   r8                              @ go again
381 -       b       1b
382 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
383 +       teq     r0, #0                          @ if preempt lazy count != 0
384 +       beq     1b
385 +       ret     r8                              @ go again
386 +
387  #endif
388  
389  __und_fault:
390 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
391 index 10c3283d6c19..8872937862cc 100644
392 --- a/arch/arm/kernel/entry-common.S
393 +++ b/arch/arm/kernel/entry-common.S
394 @@ -36,7 +36,9 @@
395   UNWIND(.cantunwind    )
396         disable_irq_notrace                     @ disable interrupts
397         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
398 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
399 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
400 +       bne     fast_work_pending
401 +       tst     r1, #_TIF_SECCOMP
402         bne     fast_work_pending
403  
404         /* perform architecture specific actions before user return */
405 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
406         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
407         disable_irq_notrace                     @ disable interrupts
408         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
409 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
410 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
411 +       bne     do_slower_path
412 +       tst     r1, #_TIF_SECCOMP
413         beq     no_work_pending
414 +do_slower_path:
415   UNWIND(.fnend         )
416  ENDPROC(ret_fast_syscall)
417  
418 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
419 index 69bda1a5707e..1f665acaa6a9 100644
420 --- a/arch/arm/kernel/patch.c
421 +++ b/arch/arm/kernel/patch.c
422 @@ -15,7 +15,7 @@ struct patch {
423         unsigned int insn;
424  };
425  
426 -static DEFINE_SPINLOCK(patch_lock);
427 +static DEFINE_RAW_SPINLOCK(patch_lock);
428  
429  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
430         __acquires(&patch_lock)
431 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
432                 return addr;
433  
434         if (flags)
435 -               spin_lock_irqsave(&patch_lock, *flags);
436 +               raw_spin_lock_irqsave(&patch_lock, *flags);
437         else
438                 __acquire(&patch_lock);
439  
440 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
441         clear_fixmap(fixmap);
442  
443         if (flags)
444 -               spin_unlock_irqrestore(&patch_lock, *flags);
445 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
446         else
447                 __release(&patch_lock);
448  }
449 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
450 index 91d2d5b01414..750550098b59 100644
451 --- a/arch/arm/kernel/process.c
452 +++ b/arch/arm/kernel/process.c
453 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
454  }
455  
456  #ifdef CONFIG_MMU
457 +/*
458 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
459 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
460 + * fail.
461 + */
462 +static int __init vectors_user_mapping_init_page(void)
463 +{
464 +       struct page *page;
465 +       unsigned long addr = 0xffff0000;
466 +       pgd_t *pgd;
467 +       pud_t *pud;
468 +       pmd_t *pmd;
469 +
470 +       pgd = pgd_offset_k(addr);
471 +       pud = pud_offset(pgd, addr);
472 +       pmd = pmd_offset(pud, addr);
473 +       page = pmd_page(*(pmd));
474 +
475 +       pgtable_page_ctor(page);
476 +
477 +       return 0;
478 +}
479 +late_initcall(vectors_user_mapping_init_page);
480 +
481  #ifdef CONFIG_KUSER_HELPERS
482  /*
483   * The vectors page is always readable from user space for the
484 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
485 index 7b8f2141427b..96541e00b74a 100644
486 --- a/arch/arm/kernel/signal.c
487 +++ b/arch/arm/kernel/signal.c
488 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
489          */
490         trace_hardirqs_off();
491         do {
492 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
493 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
494 +                                          _TIF_NEED_RESCHED_LAZY))) {
495                         schedule();
496                 } else {
497                         if (unlikely(!user_mode(regs)))
498 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
499 index 7dd14e8395e6..4cd7e3d98035 100644
500 --- a/arch/arm/kernel/smp.c
501 +++ b/arch/arm/kernel/smp.c
502 @@ -234,8 +234,6 @@ int __cpu_disable(void)
503         flush_cache_louis();
504         local_flush_tlb_all();
505  
506 -       clear_tasks_mm_cpumask(cpu);
507 -
508         return 0;
509  }
510  
511 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
512                 pr_err("CPU%u: cpu didn't die\n", cpu);
513                 return;
514         }
515 +
516 +       clear_tasks_mm_cpumask(cpu);
517 +
518         pr_notice("CPU%u: shutdown\n", cpu);
519  
520         /*
521 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
522 index 0bee233fef9a..314cfb232a63 100644
523 --- a/arch/arm/kernel/unwind.c
524 +++ b/arch/arm/kernel/unwind.c
525 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
526  static const struct unwind_idx *__origin_unwind_idx;
527  extern const struct unwind_idx __stop_unwind_idx[];
528  
529 -static DEFINE_SPINLOCK(unwind_lock);
530 +static DEFINE_RAW_SPINLOCK(unwind_lock);
531  static LIST_HEAD(unwind_tables);
532  
533  /* Convert a prel31 symbol to an absolute address */
534 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
535                 /* module unwind tables */
536                 struct unwind_table *table;
537  
538 -               spin_lock_irqsave(&unwind_lock, flags);
539 +               raw_spin_lock_irqsave(&unwind_lock, flags);
540                 list_for_each_entry(table, &unwind_tables, list) {
541                         if (addr >= table->begin_addr &&
542                             addr < table->end_addr) {
543 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
544                                 break;
545                         }
546                 }
547 -               spin_unlock_irqrestore(&unwind_lock, flags);
548 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
549         }
550  
551         pr_debug("%s: idx = %p\n", __func__, idx);
552 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
553         tab->begin_addr = text_addr;
554         tab->end_addr = text_addr + text_size;
555  
556 -       spin_lock_irqsave(&unwind_lock, flags);
557 +       raw_spin_lock_irqsave(&unwind_lock, flags);
558         list_add_tail(&tab->list, &unwind_tables);
559 -       spin_unlock_irqrestore(&unwind_lock, flags);
560 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
561  
562         return tab;
563  }
564 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
565         if (!tab)
566                 return;
567  
568 -       spin_lock_irqsave(&unwind_lock, flags);
569 +       raw_spin_lock_irqsave(&unwind_lock, flags);
570         list_del(&tab->list);
571 -       spin_unlock_irqrestore(&unwind_lock, flags);
572 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
573  
574         kfree(tab);
575  }
576 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
577 index 19b5f5c1c0ff..82aa639e6737 100644
578 --- a/arch/arm/kvm/arm.c
579 +++ b/arch/arm/kvm/arm.c
580 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
581                  * involves poking the GIC, which must be done in a
582                  * non-preemptible context.
583                  */
584 -               preempt_disable();
585 +               migrate_disable();
586                 kvm_pmu_flush_hwstate(vcpu);
587                 kvm_timer_flush_hwstate(vcpu);
588                 kvm_vgic_flush_hwstate(vcpu);
589 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
590                         kvm_pmu_sync_hwstate(vcpu);
591                         kvm_timer_sync_hwstate(vcpu);
592                         kvm_vgic_sync_hwstate(vcpu);
593 -                       preempt_enable();
594 +                       migrate_enable();
595                         continue;
596                 }
597  
598 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
599  
600                 kvm_vgic_sync_hwstate(vcpu);
601  
602 -               preempt_enable();
603 +               migrate_enable();
604  
605                 ret = handle_exit(vcpu, run, ret);
606         }
607 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
608 index 98ffe1e62ad5..df9769ddece5 100644
609 --- a/arch/arm/mach-exynos/platsmp.c
610 +++ b/arch/arm/mach-exynos/platsmp.c
611 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
612         return (void __iomem *)(S5P_VA_SCU);
613  }
614  
615 -static DEFINE_SPINLOCK(boot_lock);
616 +static DEFINE_RAW_SPINLOCK(boot_lock);
617  
618  static void exynos_secondary_init(unsigned int cpu)
619  {
620 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
621         /*
622          * Synchronise with the boot thread.
623          */
624 -       spin_lock(&boot_lock);
625 -       spin_unlock(&boot_lock);
626 +       raw_spin_lock(&boot_lock);
627 +       raw_spin_unlock(&boot_lock);
628  }
629  
630  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
631 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
632          * Set synchronisation state between this boot processor
633          * and the secondary one
634          */
635 -       spin_lock(&boot_lock);
636 +       raw_spin_lock(&boot_lock);
637  
638         /*
639          * The secondary processor is waiting to be released from
640 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
641  
642                 if (timeout == 0) {
643                         printk(KERN_ERR "cpu1 power enable failed");
644 -                       spin_unlock(&boot_lock);
645 +                       raw_spin_unlock(&boot_lock);
646                         return -ETIMEDOUT;
647                 }
648         }
649 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
650          * calibrations, then wait for it to finish
651          */
652  fail:
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_unlock(&boot_lock);
655  
656         return pen_release != -1 ? ret : 0;
657  }
658 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
659 index 4b653a8cb75c..b03d5a922cb1 100644
660 --- a/arch/arm/mach-hisi/platmcpm.c
661 +++ b/arch/arm/mach-hisi/platmcpm.c
662 @@ -61,7 +61,7 @@
663  
664  static void __iomem *sysctrl, *fabric;
665  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
666 -static DEFINE_SPINLOCK(boot_lock);
667 +static DEFINE_RAW_SPINLOCK(boot_lock);
668  static u32 fabric_phys_addr;
669  /*
670   * [0]: bootwrapper physical address
671 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
672         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
673                 return -EINVAL;
674  
675 -       spin_lock_irq(&boot_lock);
676 +       raw_spin_lock_irq(&boot_lock);
677  
678         if (hip04_cpu_table[cluster][cpu])
679                 goto out;
680 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
681  
682  out:
683         hip04_cpu_table[cluster][cpu]++;
684 -       spin_unlock_irq(&boot_lock);
685 +       raw_spin_unlock_irq(&boot_lock);
686  
687         return 0;
688  }
689 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
690         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
691         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
692  
693 -       spin_lock(&boot_lock);
694 +       raw_spin_lock(&boot_lock);
695         hip04_cpu_table[cluster][cpu]--;
696         if (hip04_cpu_table[cluster][cpu] == 1) {
697                 /* A power_up request went ahead of us. */
698 -               spin_unlock(&boot_lock);
699 +               raw_spin_unlock(&boot_lock);
700                 return;
701         } else if (hip04_cpu_table[cluster][cpu] > 1) {
702                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
703 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
704         }
705  
706         last_man = hip04_cluster_is_down(cluster);
707 -       spin_unlock(&boot_lock);
708 +       raw_spin_unlock(&boot_lock);
709         if (last_man) {
710                 /* Since it's Cortex A15, disable L2 prefetching. */
711                 asm volatile(
712 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
713                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
714  
715         count = TIMEOUT_MSEC / POLL_MSEC;
716 -       spin_lock_irq(&boot_lock);
717 +       raw_spin_lock_irq(&boot_lock);
718         for (tries = 0; tries < count; tries++) {
719                 if (hip04_cpu_table[cluster][cpu])
720                         goto err;
721 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
722                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
723                 if (data & CORE_WFI_STATUS(cpu))
724                         break;
725 -               spin_unlock_irq(&boot_lock);
726 +               raw_spin_unlock_irq(&boot_lock);
727                 /* Wait for clean L2 when the whole cluster is down. */
728                 msleep(POLL_MSEC);
729 -               spin_lock_irq(&boot_lock);
730 +               raw_spin_lock_irq(&boot_lock);
731         }
732         if (tries >= count)
733                 goto err;
734 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
735                 goto err;
736         if (hip04_cluster_is_down(cluster))
737                 hip04_set_snoop_filter(cluster, 0);
738 -       spin_unlock_irq(&boot_lock);
739 +       raw_spin_unlock_irq(&boot_lock);
740         return 1;
741  err:
742 -       spin_unlock_irq(&boot_lock);
743 +       raw_spin_unlock_irq(&boot_lock);
744         return 0;
745  }
746  #endif
747 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
748 index b4de3da6dffa..b52893319d75 100644
749 --- a/arch/arm/mach-omap2/omap-smp.c
750 +++ b/arch/arm/mach-omap2/omap-smp.c
751 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
752         .startup_addr = omap5_secondary_startup,
753  };
754  
755 -static DEFINE_SPINLOCK(boot_lock);
756 +static DEFINE_RAW_SPINLOCK(boot_lock);
757  
758  void __iomem *omap4_get_scu_base(void)
759  {
760 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
761         /*
762          * Synchronise with the boot thread.
763          */
764 -       spin_lock(&boot_lock);
765 -       spin_unlock(&boot_lock);
766 +       raw_spin_lock(&boot_lock);
767 +       raw_spin_unlock(&boot_lock);
768  }
769  
770  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
771 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
772          * Set synchronisation state between this boot processor
773          * and the secondary one
774          */
775 -       spin_lock(&boot_lock);
776 +       raw_spin_lock(&boot_lock);
777  
778         /*
779          * Update the AuxCoreBoot0 with boot state for secondary core.
780 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
781          * Now the secondary core is starting up let it run its
782          * calibrations, then wait for it to finish
783          */
784 -       spin_unlock(&boot_lock);
785 +       raw_spin_unlock(&boot_lock);
786  
787         return 0;
788  }
789 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
790 index 0875b99add18..18b6d98d2581 100644
791 --- a/arch/arm/mach-prima2/platsmp.c
792 +++ b/arch/arm/mach-prima2/platsmp.c
793 @@ -22,7 +22,7 @@
794  
795  static void __iomem *clk_base;
796  
797 -static DEFINE_SPINLOCK(boot_lock);
798 +static DEFINE_RAW_SPINLOCK(boot_lock);
799  
800  static void sirfsoc_secondary_init(unsigned int cpu)
801  {
802 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
803         /*
804          * Synchronise with the boot thread.
805          */
806 -       spin_lock(&boot_lock);
807 -       spin_unlock(&boot_lock);
808 +       raw_spin_lock(&boot_lock);
809 +       raw_spin_unlock(&boot_lock);
810  }
811  
812  static const struct of_device_id clk_ids[]  = {
813 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
814         /* make sure write buffer is drained */
815         mb();
816  
817 -       spin_lock(&boot_lock);
818 +       raw_spin_lock(&boot_lock);
819  
820         /*
821          * The secondary processor is waiting to be released from
822 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
823          * now the secondary core is starting up let it run its
824          * calibrations, then wait for it to finish
825          */
826 -       spin_unlock(&boot_lock);
827 +       raw_spin_unlock(&boot_lock);
828  
829         return pen_release != -1 ? -ENOSYS : 0;
830  }
831 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
832 index 5494c9e0c909..e8ce157d3548 100644
833 --- a/arch/arm/mach-qcom/platsmp.c
834 +++ b/arch/arm/mach-qcom/platsmp.c
835 @@ -46,7 +46,7 @@
836  
837  extern void secondary_startup_arm(void);
838  
839 -static DEFINE_SPINLOCK(boot_lock);
840 +static DEFINE_RAW_SPINLOCK(boot_lock);
841  
842  #ifdef CONFIG_HOTPLUG_CPU
843  static void qcom_cpu_die(unsigned int cpu)
844 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
845         /*
846          * Synchronise with the boot thread.
847          */
848 -       spin_lock(&boot_lock);
849 -       spin_unlock(&boot_lock);
850 +       raw_spin_lock(&boot_lock);
851 +       raw_spin_unlock(&boot_lock);
852  }
853  
854  static int scss_release_secondary(unsigned int cpu)
855 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
856          * set synchronisation state between this boot processor
857          * and the secondary one
858          */
859 -       spin_lock(&boot_lock);
860 +       raw_spin_lock(&boot_lock);
861  
862         /*
863          * Send the secondary CPU a soft interrupt, thereby causing
864 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
865          * now the secondary core is starting up let it run its
866          * calibrations, then wait for it to finish
867          */
868 -       spin_unlock(&boot_lock);
869 +       raw_spin_unlock(&boot_lock);
870  
871         return ret;
872  }
873 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
874 index 8d1e2d551786..7fa56cc78118 100644
875 --- a/arch/arm/mach-spear/platsmp.c
876 +++ b/arch/arm/mach-spear/platsmp.c
877 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
878         sync_cache_w(&pen_release);
879  }
880  
881 -static DEFINE_SPINLOCK(boot_lock);
882 +static DEFINE_RAW_SPINLOCK(boot_lock);
883  
884  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
885  
886 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
887         /*
888          * Synchronise with the boot thread.
889          */
890 -       spin_lock(&boot_lock);
891 -       spin_unlock(&boot_lock);
892 +       raw_spin_lock(&boot_lock);
893 +       raw_spin_unlock(&boot_lock);
894  }
895  
896  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
897 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
898          * set synchronisation state between this boot processor
899          * and the secondary one
900          */
901 -       spin_lock(&boot_lock);
902 +       raw_spin_lock(&boot_lock);
903  
904         /*
905          * The secondary processor is waiting to be released from
906 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
907          * now the secondary core is starting up let it run its
908          * calibrations, then wait for it to finish
909          */
910 -       spin_unlock(&boot_lock);
911 +       raw_spin_unlock(&boot_lock);
912  
913         return pen_release != -1 ? -ENOSYS : 0;
914  }
915 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
916 index ea5a2277ee46..b988e081ac79 100644
917 --- a/arch/arm/mach-sti/platsmp.c
918 +++ b/arch/arm/mach-sti/platsmp.c
919 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
920         sync_cache_w(&pen_release);
921  }
922  
923 -static DEFINE_SPINLOCK(boot_lock);
924 +static DEFINE_RAW_SPINLOCK(boot_lock);
925  
926  static void sti_secondary_init(unsigned int cpu)
927  {
928 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
929         /*
930          * Synchronise with the boot thread.
931          */
932 -       spin_lock(&boot_lock);
933 -       spin_unlock(&boot_lock);
934 +       raw_spin_lock(&boot_lock);
935 +       raw_spin_unlock(&boot_lock);
936  }
937  
938  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
939 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
940          * set synchronisation state between this boot processor
941          * and the secondary one
942          */
943 -       spin_lock(&boot_lock);
944 +       raw_spin_lock(&boot_lock);
945  
946         /*
947          * The secondary processor is waiting to be released from
948 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
949          * now the secondary core is starting up let it run its
950          * calibrations, then wait for it to finish
951          */
952 -       spin_unlock(&boot_lock);
953 +       raw_spin_unlock(&boot_lock);
954  
955         return pen_release != -1 ? -ENOSYS : 0;
956  }
957 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
958 index 0122ad1a6027..926b1be48043 100644
959 --- a/arch/arm/mm/fault.c
960 +++ b/arch/arm/mm/fault.c
961 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
962         if (addr < TASK_SIZE)
963                 return do_page_fault(addr, fsr, regs);
964  
965 +       if (interrupts_enabled(regs))
966 +               local_irq_enable();
967 +
968         if (user_mode(regs))
969                 goto bad_area;
970  
971 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
972  static int
973  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
974  {
975 +       if (interrupts_enabled(regs))
976 +               local_irq_enable();
977 +
978         do_bad_area(addr, fsr, regs);
979         return 0;
980  }
981 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
982 index d02f8187b1cc..542692dbd40a 100644
983 --- a/arch/arm/mm/highmem.c
984 +++ b/arch/arm/mm/highmem.c
985 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
986         return *ptep;
987  }
988  
989 +static unsigned int fixmap_idx(int type)
990 +{
991 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
992 +}
993 +
994  void *kmap(struct page *page)
995  {
996         might_sleep();
997 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
998  
999  void *kmap_atomic(struct page *page)
1000  {
1001 +       pte_t pte = mk_pte(page, kmap_prot);
1002         unsigned int idx;
1003         unsigned long vaddr;
1004         void *kmap;
1005         int type;
1006  
1007 -       preempt_disable();
1008 +       preempt_disable_nort();
1009         pagefault_disable();
1010         if (!PageHighMem(page))
1011                 return page_address(page);
1012 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1013  
1014         type = kmap_atomic_idx_push();
1015  
1016 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1017 +       idx = fixmap_idx(type);
1018         vaddr = __fix_to_virt(idx);
1019  #ifdef CONFIG_DEBUG_HIGHMEM
1020         /*
1021 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1022          * in place, so the contained TLB flush ensures the TLB is updated
1023          * with the new mapping.
1024          */
1025 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1026 +#ifdef CONFIG_PREEMPT_RT_FULL
1027 +       current->kmap_pte[type] = pte;
1028 +#endif
1029 +       set_fixmap_pte(idx, pte);
1030  
1031         return (void *)vaddr;
1032  }
1033 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1034  
1035         if (kvaddr >= (void *)FIXADDR_START) {
1036                 type = kmap_atomic_idx();
1037 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1038 +               idx = fixmap_idx(type);
1039  
1040                 if (cache_is_vivt())
1041                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1042 +#ifdef CONFIG_PREEMPT_RT_FULL
1043 +               current->kmap_pte[type] = __pte(0);
1044 +#endif
1045  #ifdef CONFIG_DEBUG_HIGHMEM
1046                 BUG_ON(vaddr != __fix_to_virt(idx));
1047 -               set_fixmap_pte(idx, __pte(0));
1048  #else
1049                 (void) idx;  /* to kill a warning */
1050  #endif
1051 +               set_fixmap_pte(idx, __pte(0));
1052                 kmap_atomic_idx_pop();
1053         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1054                 /* this address was obtained through kmap_high_get() */
1055                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1056         }
1057         pagefault_enable();
1058 -       preempt_enable();
1059 +       preempt_enable_nort();
1060  }
1061  EXPORT_SYMBOL(__kunmap_atomic);
1062  
1063  void *kmap_atomic_pfn(unsigned long pfn)
1064  {
1065 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1066         unsigned long vaddr;
1067         int idx, type;
1068         struct page *page = pfn_to_page(pfn);
1069  
1070 -       preempt_disable();
1071 +       preempt_disable_nort();
1072         pagefault_disable();
1073         if (!PageHighMem(page))
1074                 return page_address(page);
1075  
1076         type = kmap_atomic_idx_push();
1077 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1078 +       idx = fixmap_idx(type);
1079         vaddr = __fix_to_virt(idx);
1080  #ifdef CONFIG_DEBUG_HIGHMEM
1081         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1082  #endif
1083 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1084 +#ifdef CONFIG_PREEMPT_RT_FULL
1085 +       current->kmap_pte[type] = pte;
1086 +#endif
1087 +       set_fixmap_pte(idx, pte);
1088  
1089         return (void *)vaddr;
1090  }
1091 +#if defined CONFIG_PREEMPT_RT_FULL
1092 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1093 +{
1094 +       int i;
1095 +
1096 +       /*
1097 +        * Clear @prev's kmap_atomic mappings
1098 +        */
1099 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1100 +               int idx = fixmap_idx(i);
1101 +
1102 +               set_fixmap_pte(idx, __pte(0));
1103 +       }
1104 +       /*
1105 +        * Restore @next_p's kmap_atomic mappings
1106 +        */
1107 +       for (i = 0; i < next_p->kmap_idx; i++) {
1108 +               int idx = fixmap_idx(i);
1109 +
1110 +               if (!pte_none(next_p->kmap_pte[i]))
1111 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1112 +       }
1113 +}
1114 +#endif
1115 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1116 index c2366510187a..6b60f582b738 100644
1117 --- a/arch/arm/plat-versatile/platsmp.c
1118 +++ b/arch/arm/plat-versatile/platsmp.c
1119 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1120         sync_cache_w(&pen_release);
1121  }
1122  
1123 -static DEFINE_SPINLOCK(boot_lock);
1124 +static DEFINE_RAW_SPINLOCK(boot_lock);
1125  
1126  void versatile_secondary_init(unsigned int cpu)
1127  {
1128 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1129         /*
1130          * Synchronise with the boot thread.
1131          */
1132 -       spin_lock(&boot_lock);
1133 -       spin_unlock(&boot_lock);
1134 +       raw_spin_lock(&boot_lock);
1135 +       raw_spin_unlock(&boot_lock);
1136  }
1137  
1138  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1139 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1140          * Set synchronisation state between this boot processor
1141          * and the secondary one
1142          */
1143 -       spin_lock(&boot_lock);
1144 +       raw_spin_lock(&boot_lock);
1145  
1146         /*
1147          * This is really belt and braces; we hold unintended secondary
1148 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1149          * now the secondary core is starting up let it run its
1150          * calibrations, then wait for it to finish
1151          */
1152 -       spin_unlock(&boot_lock);
1153 +       raw_spin_unlock(&boot_lock);
1154  
1155         return pen_release != -1 ? -ENOSYS : 0;
1156  }
1157 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1158 index cf57a7799a0f..78d1b49fbed5 100644
1159 --- a/arch/arm64/Kconfig
1160 +++ b/arch/arm64/Kconfig
1161 @@ -91,6 +91,7 @@ config ARM64
1162         select HAVE_PERF_EVENTS
1163         select HAVE_PERF_REGS
1164         select HAVE_PERF_USER_STACK_DUMP
1165 +       select HAVE_PREEMPT_LAZY
1166         select HAVE_REGS_AND_STACK_ACCESS_API
1167         select HAVE_RCU_TABLE_FREE
1168         select HAVE_SYSCALL_TRACEPOINTS
1169 @@ -704,7 +705,7 @@ config XEN_DOM0
1170  
1171  config XEN
1172         bool "Xen guest support on ARM64"
1173 -       depends on ARM64 && OF
1174 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1175         select SWIOTLB_XEN
1176         select PARAVIRT
1177         help
1178 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1179 index e9ea5a6bd449..6c500ad63c6a 100644
1180 --- a/arch/arm64/include/asm/thread_info.h
1181 +++ b/arch/arm64/include/asm/thread_info.h
1182 @@ -49,6 +49,7 @@ struct thread_info {
1183         mm_segment_t            addr_limit;     /* address limit */
1184         struct task_struct      *task;          /* main task structure */
1185         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1186 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1187         int                     cpu;            /* cpu */
1188  };
1189  
1190 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1191  #define TIF_NEED_RESCHED       1
1192  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1193  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1194 +#define TIF_NEED_RESCHED_LAZY  4
1195  #define TIF_NOHZ               7
1196  #define TIF_SYSCALL_TRACE      8
1197  #define TIF_SYSCALL_AUDIT      9
1198 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1199  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1200  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1201  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1202 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1203  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1204  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1206 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1207  #define _TIF_32BIT             (1 << TIF_32BIT)
1208  
1209  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1210 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1211 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1212 +                                _TIF_NEED_RESCHED_LAZY)
1213 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1214  
1215  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1216                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1217 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1218 index 4a2f0f0fef32..6bf2bc17c400 100644
1219 --- a/arch/arm64/kernel/asm-offsets.c
1220 +++ b/arch/arm64/kernel/asm-offsets.c
1221 @@ -38,6 +38,7 @@ int main(void)
1222    BLANK();
1223    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1224    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1225 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1226    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1227    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1228    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1229 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1230 index 79b0fe24d5b7..f3c959ade308 100644
1231 --- a/arch/arm64/kernel/entry.S
1232 +++ b/arch/arm64/kernel/entry.S
1233 @@ -428,11 +428,16 @@ ENDPROC(el1_sync)
1234  
1235  #ifdef CONFIG_PREEMPT
1236         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1237 -       cbnz    w24, 1f                         // preempt count != 0
1238 +       cbnz    w24, 2f                         // preempt count != 0
1239         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1240 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1241 -       bl      el1_preempt
1242 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1243 +
1244 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1245 +       cbnz    w24, 2f                         // preempt lazy count != 0
1246 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1247  1:
1248 +       bl      el1_preempt
1249 +2:
1250  #endif
1251  #ifdef CONFIG_TRACE_IRQFLAGS
1252         bl      trace_hardirqs_on
1253 @@ -446,6 +451,7 @@ ENDPROC(el1_irq)
1254  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1255         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1256         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1257 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1258         ret     x24
1259  #endif
1260  
1261 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1262 index 404dd67080b9..639dc6d12e72 100644
1263 --- a/arch/arm64/kernel/signal.c
1264 +++ b/arch/arm64/kernel/signal.c
1265 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1266          */
1267         trace_hardirqs_off();
1268         do {
1269 -               if (thread_flags & _TIF_NEED_RESCHED) {
1270 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1271                         schedule();
1272                 } else {
1273                         local_irq_enable();
1274 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1275 index b3c5bde43d34..8122bf058de0 100644
1276 --- a/arch/mips/Kconfig
1277 +++ b/arch/mips/Kconfig
1278 @@ -2514,7 +2514,7 @@ config MIPS_ASID_BITS_VARIABLE
1279  #
1280  config HIGHMEM
1281         bool "High Memory Support"
1282 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1283 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1284  
1285  config CPU_SUPPORTS_HIGHMEM
1286         bool
1287 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1288 index 65fba4c34cd7..4b5ba68910e0 100644
1289 --- a/arch/powerpc/Kconfig
1290 +++ b/arch/powerpc/Kconfig
1291 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1292  
1293  config RWSEM_GENERIC_SPINLOCK
1294         bool
1295 +       default y if PREEMPT_RT_FULL
1296  
1297  config RWSEM_XCHGADD_ALGORITHM
1298         bool
1299 -       default y
1300 +       default y if !PREEMPT_RT_FULL
1301  
1302  config GENERIC_LOCKBREAK
1303         bool
1304 @@ -134,6 +135,7 @@ config PPC
1305         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1306         select GENERIC_STRNCPY_FROM_USER
1307         select GENERIC_STRNLEN_USER
1308 +       select HAVE_PREEMPT_LAZY
1309         select HAVE_MOD_ARCH_SPECIFIC
1310         select MODULES_USE_ELF_RELA
1311         select CLONE_BACKWARDS
1312 @@ -321,7 +323,7 @@ menu "Kernel options"
1313  
1314  config HIGHMEM
1315         bool "High memory support"
1316 -       depends on PPC32
1317 +       depends on PPC32 && !PREEMPT_RT_FULL
1318  
1319  source kernel/Kconfig.hz
1320  source kernel/Kconfig.preempt
1321 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1322 index 87e4b2d8dcd4..981e501a4359 100644
1323 --- a/arch/powerpc/include/asm/thread_info.h
1324 +++ b/arch/powerpc/include/asm/thread_info.h
1325 @@ -43,6 +43,8 @@ struct thread_info {
1326         int             cpu;                    /* cpu we're on */
1327         int             preempt_count;          /* 0 => preemptable,
1328                                                    <0 => BUG */
1329 +       int             preempt_lazy_count;     /* 0 => preemptable,
1330 +                                                  <0 => BUG */
1331         unsigned long   local_flags;            /* private flags for thread */
1332  #ifdef CONFIG_LIVEPATCH
1333         unsigned long *livepatch_sp;
1334 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1335  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1336  #define TIF_SIGPENDING         1       /* signal pending */
1337  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1338 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1339 -                                          TIF_NEED_RESCHED */
1340 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1341  #define TIF_32BIT              4       /* 32 bit binary */
1342  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1343  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1344 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1345  #if defined(CONFIG_PPC64)
1346  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1347  #endif
1348 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1349 +                                          TIF_NEED_RESCHED */
1350  
1351  /* as above, but as bit values */
1352  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1353 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1354  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1355  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1356  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1357 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1358  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1359                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1360                                  _TIF_NOHZ)
1361  
1362  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1363                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1364 -                                _TIF_RESTORE_TM)
1365 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1366  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1367 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1368  
1369  /* Bits in local_flags */
1370  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
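The _TIF_NEED_RESCHED_MASK introduced above lets one bit test cover both the immediate and the lazy reschedule request. A minimal user-space sketch of that idiom, reusing the powerpc bit positions shown above (main() exists only for illustration):

#include <stdio.h>

#define TIF_NEED_RESCHED        2
#define TIF_NEED_RESCHED_LAZY   3
#define _TIF_NEED_RESCHED       (1UL << TIF_NEED_RESCHED)
#define _TIF_NEED_RESCHED_LAZY  (1UL << TIF_NEED_RESCHED_LAZY)
#define _TIF_NEED_RESCHED_MASK  (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)

int main(void)
{
        unsigned long flags = _TIF_NEED_RESCHED_LAZY;   /* only the lazy bit is set */

        /* One masked test replaces two separate flag checks. */
        if (flags & _TIF_NEED_RESCHED_MASK)
                printf("reschedule requested\n");
        return 0;
}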
1371 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1372 index c833d88c423d..96e9fbc3f684 100644
1373 --- a/arch/powerpc/kernel/asm-offsets.c
1374 +++ b/arch/powerpc/kernel/asm-offsets.c
1375 @@ -156,6 +156,7 @@ int main(void)
1376         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1377         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1378         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1379 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1380         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1381         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1382  
1383 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1384 index 3841d749a430..6dbaeff192b9 100644
1385 --- a/arch/powerpc/kernel/entry_32.S
1386 +++ b/arch/powerpc/kernel/entry_32.S
1387 @@ -835,7 +835,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1388         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1389         bne     restore
1390         andi.   r8,r8,_TIF_NEED_RESCHED
1391 +       bne+    1f
1392 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1393 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1394 +       bne     restore
1395 +       lwz     r0,TI_FLAGS(r9)
1396 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1397         beq+    restore
1398 +1:
1399         lwz     r3,_MSR(r1)
1400         andi.   r0,r3,MSR_EE    /* interrupts off? */
1401         beq     restore         /* don't schedule if so */
1402 @@ -846,11 +853,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1403          */
1404         bl      trace_hardirqs_off
1405  #endif
1406 -1:     bl      preempt_schedule_irq
1407 +2:     bl      preempt_schedule_irq
1408         CURRENT_THREAD_INFO(r9, r1)
1409         lwz     r3,TI_FLAGS(r9)
1410 -       andi.   r0,r3,_TIF_NEED_RESCHED
1411 -       bne-    1b
1412 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1413 +       bne-    2b
1414  #ifdef CONFIG_TRACE_IRQFLAGS
1415         /* And now, to properly rebalance the above, we tell lockdep they
1416          * are being turned back on, which will happen when we return
1417 @@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1418  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1419  
1420  do_work:                       /* r10 contains MSR_KERNEL here */
1421 -       andi.   r0,r9,_TIF_NEED_RESCHED
1422 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1423         beq     do_user_signal
1424  
1425  do_resched:                    /* r10 contains MSR_KERNEL here */
1426 @@ -1192,7 +1199,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1427         MTMSRD(r10)             /* disable interrupts */
1428         CURRENT_THREAD_INFO(r9, r1)
1429         lwz     r9,TI_FLAGS(r9)
1430 -       andi.   r0,r9,_TIF_NEED_RESCHED
1431 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1432         bne-    do_resched
1433         andi.   r0,r9,_TIF_USER_WORK_MASK
1434         beq     restore_user
1435 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1436 index 6432d4bf08c8..5509a26f1070 100644
1437 --- a/arch/powerpc/kernel/entry_64.S
1438 +++ b/arch/powerpc/kernel/entry_64.S
1439 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1440         bl      restore_math
1441         b       restore
1442  #endif
1443 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1444 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1445         beq     2f
1446         bl      restore_interrupts
1447         SCHEDULE_USER
1448 @@ -718,10 +718,18 @@ _GLOBAL(ret_from_except_lite)
1449  
1450  #ifdef CONFIG_PREEMPT
1451         /* Check if we need to preempt */
1452 -       andi.   r0,r4,_TIF_NEED_RESCHED
1453 -       beq+    restore
1454 -       /* Check that preempt_count() == 0 and interrupts are enabled */
1455         lwz     r8,TI_PREEMPT(r9)
1456 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1457 +       bne     restore
1458 +       andi.   r0,r4,_TIF_NEED_RESCHED
1459 +       bne+    check_count
1460 +
1461 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1462 +       beq+    restore
1463 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1464 +
1465 +       /* Check that preempt_count() == 0 and interrupts are enabled */
1466 +check_count:
1467         cmpwi   cr1,r8,0
1468         ld      r0,SOFTE(r1)
1469         cmpdi   r0,0
1470 @@ -738,7 +746,7 @@ _GLOBAL(ret_from_except_lite)
1471         /* Re-test flags and eventually loop */
1472         CURRENT_THREAD_INFO(r9, r1)
1473         ld      r4,TI_FLAGS(r9)
1474 -       andi.   r0,r4,_TIF_NEED_RESCHED
1475 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1476         bne     1b
1477  
1478         /*
1479 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1480 index 3c05c311e35e..f83f6ac1274d 100644
1481 --- a/arch/powerpc/kernel/irq.c
1482 +++ b/arch/powerpc/kernel/irq.c
1483 @@ -638,6 +638,7 @@ void irq_ctx_init(void)
1484         }
1485  }
1486  
1487 +#ifndef CONFIG_PREEMPT_RT_FULL
1488  void do_softirq_own_stack(void)
1489  {
1490         struct thread_info *curtp, *irqtp;
1491 @@ -655,6 +656,7 @@ void do_softirq_own_stack(void)
1492         if (irqtp->flags)
1493                 set_bits(irqtp->flags, &curtp->flags);
1494  }
1495 +#endif
1496  
1497  irq_hw_number_t virq_to_hw(unsigned int virq)
1498  {
1499 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1500 index 030d72df5dd5..b471a709e100 100644
1501 --- a/arch/powerpc/kernel/misc_32.S
1502 +++ b/arch/powerpc/kernel/misc_32.S
1503 @@ -41,6 +41,7 @@
1504   * We store the saved ksp_limit in the unused part
1505   * of the STACK_FRAME_OVERHEAD
1506   */
1507 +#ifndef CONFIG_PREEMPT_RT_FULL
1508  _GLOBAL(call_do_softirq)
1509         mflr    r0
1510         stw     r0,4(r1)
1511 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1512         stw     r10,THREAD+KSP_LIMIT(r2)
1513         mtlr    r0
1514         blr
1515 +#endif
1516  
1517  /*
1518   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1519 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1520 index 4f178671f230..39e7d84a3492 100644
1521 --- a/arch/powerpc/kernel/misc_64.S
1522 +++ b/arch/powerpc/kernel/misc_64.S
1523 @@ -31,6 +31,7 @@
1524  
1525         .text
1526  
1527 +#ifndef CONFIG_PREEMPT_RT_FULL
1528  _GLOBAL(call_do_softirq)
1529         mflr    r0
1530         std     r0,16(r1)
1531 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1532         ld      r0,16(r1)
1533         mtlr    r0
1534         blr
1535 +#endif
1536  
1537  _GLOBAL(call_do_irq)
1538         mflr    r0
1539 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1540 index 029be26b5a17..9528089ea142 100644
1541 --- a/arch/powerpc/kvm/Kconfig
1542 +++ b/arch/powerpc/kvm/Kconfig
1543 @@ -175,6 +175,7 @@ config KVM_E500MC
1544  config KVM_MPIC
1545         bool "KVM in-kernel MPIC emulation"
1546         depends on KVM && E500
1547 +       depends on !PREEMPT_RT_FULL
1548         select HAVE_KVM_IRQCHIP
1549         select HAVE_KVM_IRQFD
1550         select HAVE_KVM_IRQ_ROUTING
1551 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1552 index e48462447ff0..2670cee66064 100644
1553 --- a/arch/powerpc/platforms/ps3/device-init.c
1554 +++ b/arch/powerpc/platforms/ps3/device-init.c
1555 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1556         }
1557         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1558  
1559 -       res = wait_event_interruptible(dev->done.wait,
1560 +       res = swait_event_interruptible(dev->done.wait,
1561                                        dev->done.done || kthread_should_stop());
1562         if (kthread_should_stop())
1563                 res = -EINTR;
1564 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1565 index 6c0378c0b8b5..abd58b4dff97 100644
1566 --- a/arch/sh/kernel/irq.c
1567 +++ b/arch/sh/kernel/irq.c
1568 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1569         hardirq_ctx[cpu] = NULL;
1570  }
1571  
1572 +#ifndef CONFIG_PREEMPT_RT_FULL
1573  void do_softirq_own_stack(void)
1574  {
1575         struct thread_info *curctx;
1576 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1577                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1578         );
1579  }
1580 +#endif
1581  #else
1582  static inline void handle_one_irq(unsigned int irq)
1583  {
1584 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1585 index 165ecdd24d22..b68a464a22be 100644
1586 --- a/arch/sparc/Kconfig
1587 +++ b/arch/sparc/Kconfig
1588 @@ -194,12 +194,10 @@ config NR_CPUS
1589  source kernel/Kconfig.hz
1590  
1591  config RWSEM_GENERIC_SPINLOCK
1592 -       bool
1593 -       default y if SPARC32
1594 +       def_bool PREEMPT_RT_FULL
1595  
1596  config RWSEM_XCHGADD_ALGORITHM
1597 -       bool
1598 -       default y if SPARC64
1599 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1600  
1601  config GENERIC_HWEIGHT
1602         bool
1603 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1604 index 34a7930b76ef..773740521008 100644
1605 --- a/arch/sparc/kernel/irq_64.c
1606 +++ b/arch/sparc/kernel/irq_64.c
1607 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1608         set_irq_regs(old_regs);
1609  }
1610  
1611 +#ifndef CONFIG_PREEMPT_RT_FULL
1612  void do_softirq_own_stack(void)
1613  {
1614         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1615 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1616         __asm__ __volatile__("mov %0, %%sp"
1617                              : : "r" (orig_sp));
1618  }
1619 +#endif
1620  
1621  #ifdef CONFIG_HOTPLUG_CPU
1622  void fixup_irqs(void)
1623 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1624 index bada636d1065..f8a995c90c01 100644
1625 --- a/arch/x86/Kconfig
1626 +++ b/arch/x86/Kconfig
1627 @@ -17,6 +17,7 @@ config X86_64
1628  ### Arch settings
1629  config X86
1630         def_bool y
1631 +       select HAVE_PREEMPT_LAZY
1632         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1633         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1634         select ANON_INODES
1635 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1636         def_bool y
1637         depends on ISA_DMA_API
1638  
1639 +config RWSEM_GENERIC_SPINLOCK
1640 +       def_bool PREEMPT_RT_FULL
1641 +
1642  config RWSEM_XCHGADD_ALGORITHM
1643 -       def_bool y
1644 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1645  
1646  config GENERIC_CALIBRATE_DELAY
1647         def_bool y
1648 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1649  config MAXSMP
1650         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1651         depends on X86_64 && SMP && DEBUG_KERNEL
1652 -       select CPUMASK_OFFSTACK
1653 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1654         ---help---
1655           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1656           If unsure, say N.
1657 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1658 index aa8b0672f87a..2429414bfc71 100644
1659 --- a/arch/x86/crypto/aesni-intel_glue.c
1660 +++ b/arch/x86/crypto/aesni-intel_glue.c
1661 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1662         err = blkcipher_walk_virt(desc, &walk);
1663         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1664  
1665 -       kernel_fpu_begin();
1666         while ((nbytes = walk.nbytes)) {
1667 +               kernel_fpu_begin();
1668                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1669 -                             nbytes & AES_BLOCK_MASK);
1670 +                               nbytes & AES_BLOCK_MASK);
1671 +               kernel_fpu_end();
1672                 nbytes &= AES_BLOCK_SIZE - 1;
1673                 err = blkcipher_walk_done(desc, &walk, nbytes);
1674         }
1675 -       kernel_fpu_end();
1676  
1677         return err;
1678  }
1679 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1680         err = blkcipher_walk_virt(desc, &walk);
1681         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1682  
1683 -       kernel_fpu_begin();
1684         while ((nbytes = walk.nbytes)) {
1685 +               kernel_fpu_begin();
1686                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1687                               nbytes & AES_BLOCK_MASK);
1688 +               kernel_fpu_end();
1689                 nbytes &= AES_BLOCK_SIZE - 1;
1690                 err = blkcipher_walk_done(desc, &walk, nbytes);
1691         }
1692 -       kernel_fpu_end();
1693  
1694         return err;
1695  }
1696 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1697         err = blkcipher_walk_virt(desc, &walk);
1698         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1699  
1700 -       kernel_fpu_begin();
1701         while ((nbytes = walk.nbytes)) {
1702 +               kernel_fpu_begin();
1703                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1704                               nbytes & AES_BLOCK_MASK, walk.iv);
1705 +               kernel_fpu_end();
1706                 nbytes &= AES_BLOCK_SIZE - 1;
1707                 err = blkcipher_walk_done(desc, &walk, nbytes);
1708         }
1709 -       kernel_fpu_end();
1710  
1711         return err;
1712  }
1713 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1714         err = blkcipher_walk_virt(desc, &walk);
1715         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1716  
1717 -       kernel_fpu_begin();
1718         while ((nbytes = walk.nbytes)) {
1719 +               kernel_fpu_begin();
1720                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1721                               nbytes & AES_BLOCK_MASK, walk.iv);
1722 +               kernel_fpu_end();
1723                 nbytes &= AES_BLOCK_SIZE - 1;
1724                 err = blkcipher_walk_done(desc, &walk, nbytes);
1725         }
1726 -       kernel_fpu_end();
1727  
1728         return err;
1729  }
1730 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1731         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1732         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1733  
1734 -       kernel_fpu_begin();
1735         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1736 +               kernel_fpu_begin();
1737                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1738                                       nbytes & AES_BLOCK_MASK, walk.iv);
1739 +               kernel_fpu_end();
1740                 nbytes &= AES_BLOCK_SIZE - 1;
1741                 err = blkcipher_walk_done(desc, &walk, nbytes);
1742         }
1743         if (walk.nbytes) {
1744 +               kernel_fpu_begin();
1745                 ctr_crypt_final(ctx, &walk);
1746 +               kernel_fpu_end();
1747                 err = blkcipher_walk_done(desc, &walk, 0);
1748         }
1749 -       kernel_fpu_end();
1750  
1751         return err;
1752  }
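The aesni hunks above move kernel_fpu_begin()/kernel_fpu_end() from around the whole walk into the loop body, so the non-preemptible FPU section covers one block batch at a time instead of the full request. A rough user-space analogue of that restructuring, with a pthread mutex standing in for the FPU/preemption-disabled region and made-up chunk sizes:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fpu_region = PTHREAD_MUTEX_INITIALIZER;

static void process_chunk(int nbytes)
{
        printf("processed %d bytes\n", nbytes);
}

int main(void)
{
        int chunks[] = { 4096, 4096, 512 };
        size_t i;

        for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
                /* Before the change the critical region spanned the whole
                 * loop; now each iteration holds it only briefly, which
                 * bounds the worst-case latency per chunk. */
                pthread_mutex_lock(&fpu_region);
                process_chunk(chunks[i]);
                pthread_mutex_unlock(&fpu_region);
        }
        return 0;
}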
1753 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1754 index 8648158f3916..d7699130ee36 100644
1755 --- a/arch/x86/crypto/cast5_avx_glue.c
1756 +++ b/arch/x86/crypto/cast5_avx_glue.c
1757 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1758  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1759                      bool enc)
1760  {
1761 -       bool fpu_enabled = false;
1762 +       bool fpu_enabled;
1763         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1764         const unsigned int bsize = CAST5_BLOCK_SIZE;
1765         unsigned int nbytes;
1766 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1767                 u8 *wsrc = walk->src.virt.addr;
1768                 u8 *wdst = walk->dst.virt.addr;
1769  
1770 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1771 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1772  
1773                 /* Process multi-block batch */
1774                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1775 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1776                 } while (nbytes >= bsize);
1777  
1778  done:
1779 +               cast5_fpu_end(fpu_enabled);
1780                 err = blkcipher_walk_done(desc, walk, nbytes);
1781         }
1782 -
1783 -       cast5_fpu_end(fpu_enabled);
1784         return err;
1785  }
1786  
1787 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1788  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1789                        struct scatterlist *src, unsigned int nbytes)
1790  {
1791 -       bool fpu_enabled = false;
1792 +       bool fpu_enabled;
1793         struct blkcipher_walk walk;
1794         int err;
1795  
1796 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1797         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1798  
1799         while ((nbytes = walk.nbytes)) {
1800 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1801 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1802                 nbytes = __cbc_decrypt(desc, &walk);
1803 +               cast5_fpu_end(fpu_enabled);
1804                 err = blkcipher_walk_done(desc, &walk, nbytes);
1805         }
1806 -
1807 -       cast5_fpu_end(fpu_enabled);
1808         return err;
1809  }
1810  
1811 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1812  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1813                      struct scatterlist *src, unsigned int nbytes)
1814  {
1815 -       bool fpu_enabled = false;
1816 +       bool fpu_enabled;
1817         struct blkcipher_walk walk;
1818         int err;
1819  
1820 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1821         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1822  
1823         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1824 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1825 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1826                 nbytes = __ctr_crypt(desc, &walk);
1827 +               cast5_fpu_end(fpu_enabled);
1828                 err = blkcipher_walk_done(desc, &walk, nbytes);
1829         }
1830  
1831 -       cast5_fpu_end(fpu_enabled);
1832 -
1833         if (walk.nbytes) {
1834                 ctr_crypt_final(desc, &walk);
1835                 err = blkcipher_walk_done(desc, &walk, 0);
1836 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1837 index 6a85598931b5..3a506ce7ed93 100644
1838 --- a/arch/x86/crypto/glue_helper.c
1839 +++ b/arch/x86/crypto/glue_helper.c
1840 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1841         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1842         const unsigned int bsize = 128 / 8;
1843         unsigned int nbytes, i, func_bytes;
1844 -       bool fpu_enabled = false;
1845 +       bool fpu_enabled;
1846         int err;
1847  
1848         err = blkcipher_walk_virt(desc, walk);
1849 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1850                 u8 *wdst = walk->dst.virt.addr;
1851  
1852                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1853 -                                            desc, fpu_enabled, nbytes);
1854 +                                            desc, false, nbytes);
1855  
1856                 for (i = 0; i < gctx->num_funcs; i++) {
1857                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1858 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1859                 }
1860  
1861  done:
1862 +               glue_fpu_end(fpu_enabled);
1863                 err = blkcipher_walk_done(desc, walk, nbytes);
1864         }
1865  
1866 -       glue_fpu_end(fpu_enabled);
1867         return err;
1868  }
1869  
1870 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1871                             struct scatterlist *src, unsigned int nbytes)
1872  {
1873         const unsigned int bsize = 128 / 8;
1874 -       bool fpu_enabled = false;
1875 +       bool fpu_enabled;
1876         struct blkcipher_walk walk;
1877         int err;
1878  
1879 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1880  
1881         while ((nbytes = walk.nbytes)) {
1882                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1883 -                                            desc, fpu_enabled, nbytes);
1884 +                                            desc, false, nbytes);
1885                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1886 +               glue_fpu_end(fpu_enabled);
1887                 err = blkcipher_walk_done(desc, &walk, nbytes);
1888         }
1889  
1890 -       glue_fpu_end(fpu_enabled);
1891         return err;
1892  }
1893  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1894 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1895                           struct scatterlist *src, unsigned int nbytes)
1896  {
1897         const unsigned int bsize = 128 / 8;
1898 -       bool fpu_enabled = false;
1899 +       bool fpu_enabled;
1900         struct blkcipher_walk walk;
1901         int err;
1902  
1903 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1904  
1905         while ((nbytes = walk.nbytes) >= bsize) {
1906                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1907 -                                            desc, fpu_enabled, nbytes);
1908 +                                            desc, false, nbytes);
1909                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1910 +               glue_fpu_end(fpu_enabled);
1911                 err = blkcipher_walk_done(desc, &walk, nbytes);
1912         }
1913  
1914 -       glue_fpu_end(fpu_enabled);
1915 -
1916         if (walk.nbytes) {
1917                 glue_ctr_crypt_final_128bit(
1918                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1919 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1920                           void *tweak_ctx, void *crypt_ctx)
1921  {
1922         const unsigned int bsize = 128 / 8;
1923 -       bool fpu_enabled = false;
1924 +       bool fpu_enabled;
1925         struct blkcipher_walk walk;
1926         int err;
1927  
1928 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1929  
1930         /* set minimum length to bsize, for tweak_fn */
1931         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1932 -                                    desc, fpu_enabled,
1933 +                                    desc, false,
1934                                      nbytes < bsize ? bsize : nbytes);
1935 -
1936         /* calculate first value of T */
1937         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1938 +       glue_fpu_end(fpu_enabled);
1939  
1940         while (nbytes) {
1941 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1942 +                               desc, false, nbytes);
1943                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1944  
1945 +               glue_fpu_end(fpu_enabled);
1946                 err = blkcipher_walk_done(desc, &walk, nbytes);
1947                 nbytes = walk.nbytes;
1948         }
1949 -
1950 -       glue_fpu_end(fpu_enabled);
1951 -
1952         return err;
1953  }
1954  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1955 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1956 index bdd9cc59d20f..56d01a339ba4 100644
1957 --- a/arch/x86/entry/common.c
1958 +++ b/arch/x86/entry/common.c
1959 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1960  
1961  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1962         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1963 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1964 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1965  
1966  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1967  {
1968 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1969                 /* We have work to do. */
1970                 local_irq_enable();
1971  
1972 -               if (cached_flags & _TIF_NEED_RESCHED)
1973 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
1974                         schedule();
1975  
1976 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
1977 +               if (unlikely(current->forced_info.si_signo)) {
1978 +                       struct task_struct *t = current;
1979 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
1980 +                       t->forced_info.si_signo = 0;
1981 +               }
1982 +#endif
1983                 if (cached_flags & _TIF_UPROBE)
1984                         uprobe_notify_resume(regs);
1985  
1986 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
1987 index edba8606b99a..4a3389535fc6 100644
1988 --- a/arch/x86/entry/entry_32.S
1989 +++ b/arch/x86/entry/entry_32.S
1990 @@ -308,8 +308,25 @@ END(ret_from_exception)
1991  ENTRY(resume_kernel)
1992         DISABLE_INTERRUPTS(CLBR_ANY)
1993  need_resched:
1994 +       # preempt count == 0 + NEED_RS set?
1995         cmpl    $0, PER_CPU_VAR(__preempt_count)
1996 +#ifndef CONFIG_PREEMPT_LAZY
1997         jnz     restore_all
1998 +#else
1999 +       jz test_int_off
2000 +
2001 +       # at least preempt count == 0 ?
2002 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2003 +       jne restore_all
2004 +
2005 +       movl    PER_CPU_VAR(current_task), %ebp
2006 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2007 +       jnz restore_all
2008 +
2009 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2010 +       jz restore_all
2011 +test_int_off:
2012 +#endif
2013         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2014         jz      restore_all
2015         call    preempt_schedule_irq
2016 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2017 index ef766a358b37..28401f826ab1 100644
2018 --- a/arch/x86/entry/entry_64.S
2019 +++ b/arch/x86/entry/entry_64.S
2020 @@ -546,7 +546,23 @@ GLOBAL(retint_user)
2021         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2022         jnc     1f
2023  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2024 +#ifndef CONFIG_PREEMPT_LAZY
2025         jnz     1f
2026 +#else
2027 +       jz      do_preempt_schedule_irq
2028 +
2029 +       # at least preempt count == 0 ?
2030 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2031 +       jnz     1f
2032 +
2033 +       movq    PER_CPU_VAR(current_task), %rcx
2034 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2035 +       jnz     1f
2036 +
2037 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2038 +       jnc     1f
2039 +do_preempt_schedule_irq:
2040 +#endif
2041         call    preempt_schedule_irq
2042         jmp     0b
2043  1:
2044 @@ -894,6 +910,7 @@ EXPORT_SYMBOL(native_load_gs_index)
2045         jmp     2b
2046         .previous
2047  
2048 +#ifndef CONFIG_PREEMPT_RT_FULL
2049  /* Call softirq on interrupt stack. Interrupts are off. */
2050  ENTRY(do_softirq_own_stack)
2051         pushq   %rbp
2052 @@ -906,6 +923,7 @@ ENTRY(do_softirq_own_stack)
2053         decl    PER_CPU_VAR(irq_count)
2054         ret
2055  END(do_softirq_own_stack)
2056 +#endif
2057  
2058  #ifdef CONFIG_XEN
2059  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2060 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2061 index 17f218645701..11bd1b7ee6eb 100644
2062 --- a/arch/x86/include/asm/preempt.h
2063 +++ b/arch/x86/include/asm/preempt.h
2064 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2065   * a decrement which hits zero means we have no preempt_count and should
2066   * reschedule.
2067   */
2068 -static __always_inline bool __preempt_count_dec_and_test(void)
2069 +static __always_inline bool ____preempt_count_dec_and_test(void)
2070  {
2071         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2072  }
2073  
2074 +static __always_inline bool __preempt_count_dec_and_test(void)
2075 +{
2076 +       if (____preempt_count_dec_and_test())
2077 +               return true;
2078 +#ifdef CONFIG_PREEMPT_LAZY
2079 +       if (current_thread_info()->preempt_lazy_count)
2080 +               return false;
2081 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2082 +#else
2083 +       return false;
2084 +#endif
2085 +}
2086 +
2087  /*
2088   * Returns true when we need to resched and can (barring IRQ state).
2089   */
2090  static __always_inline bool should_resched(int preempt_offset)
2091  {
2092 +#ifdef CONFIG_PREEMPT_LAZY
2093 +       u32 tmp;
2094 +
2095 +       tmp = raw_cpu_read_4(__preempt_count);
2096 +       if (tmp == preempt_offset)
2097 +               return true;
2098 +
2099 +       /* preempt count == 0 ? */
2100 +       tmp &= ~PREEMPT_NEED_RESCHED;
2101 +       if (tmp)
2102 +               return false;
2103 +       if (current_thread_info()->preempt_lazy_count)
2104 +               return false;
2105 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2106 +#else
2107         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2108 +#endif
2109  }
2110  
2111  #ifdef CONFIG_PREEMPT
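With preempt_offset taken as zero, the decision order of the patched should_resched() under CONFIG_PREEMPT_LAZY can be distilled into the stand-alone sketch below; the field names are illustrative stand-ins and the inverted PREEMPT_NEED_RESCHED encoding of the real per-CPU count is ignored:

#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
        unsigned int preempt_count;     /* 0 => preemption allowed */
        int preempt_lazy_count;         /* 0 => lazy preemption allowed */
        bool need_resched;              /* TIF_NEED_RESCHED stand-in */
        bool need_resched_lazy;         /* TIF_NEED_RESCHED_LAZY stand-in */
};

static bool should_resched_sketch(const struct cpu_state *s)
{
        if (s->preempt_count != 0)
                return false;                   /* preemption is disabled */
        if (s->need_resched)
                return true;                    /* ordinary reschedule request */
        if (s->preempt_lazy_count != 0)
                return false;                   /* lazy preemption is disabled */
        return s->need_resched_lazy;            /* honour the lazy request */
}

int main(void)
{
        struct cpu_state s = { 0, 0, false, true };     /* only a lazy request pending */

        printf("resched: %d\n", should_resched_sketch(&s));    /* prints 1 */
        return 0;
}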
2112 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2113 index 8af22be0fe61..d1328789b759 100644
2114 --- a/arch/x86/include/asm/signal.h
2115 +++ b/arch/x86/include/asm/signal.h
2116 @@ -27,6 +27,19 @@ typedef struct {
2117  #define SA_IA32_ABI    0x02000000u
2118  #define SA_X32_ABI     0x01000000u
2119  
2120 +/*
2121 + * Because some traps use the IST stack, we must keep preemption
2122 + * disabled while calling do_trap(), but do_trap() may call
2123 + * force_sig_info() which will grab the signal spin_locks for the
2124 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2125 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2126 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2127 + * trap.
2128 + */
2129 +#if defined(CONFIG_PREEMPT_RT_FULL)
2130 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2131 +#endif
2132 +
2133  #ifndef CONFIG_COMPAT
2134  typedef sigset_t compat_sigset_t;
2135  #endif
2136 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2137 index 58505f01962f..02fa39652cd6 100644
2138 --- a/arch/x86/include/asm/stackprotector.h
2139 +++ b/arch/x86/include/asm/stackprotector.h
2140 @@ -59,7 +59,7 @@
2141   */
2142  static __always_inline void boot_init_stack_canary(void)
2143  {
2144 -       u64 canary;
2145 +       u64 uninitialized_var(canary);
2146         u64 tsc;
2147  
2148  #ifdef CONFIG_X86_64
2149 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2150          * of randomness. The TSC only matters for very early init,
2151          * there it already has some randomness on most systems. Later
2152          * on during the bootup the random pool has true entropy too.
2153 +        *
2154 +        * For preempt-rt we need to weaken the randomness a bit, as
2155 +        * we can't call into the random generator from atomic context
2156 +        * due to locking constraints. We just leave canary
2157 +        * uninitialized and use the TSC based randomness on top of it.
2158          */
2159 +#ifndef CONFIG_PREEMPT_RT_FULL
2160         get_random_bytes(&canary, sizeof(canary));
2161 +#endif
2162         tsc = rdtsc();
2163         canary += tsc + (tsc << 32UL);
2164  
2165 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2166 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2167 --- a/arch/x86/include/asm/thread_info.h
2168 +++ b/arch/x86/include/asm/thread_info.h
2169 @@ -54,11 +54,14 @@ struct task_struct;
2170  
2171  struct thread_info {
2172         unsigned long           flags;          /* low level flags */
2173 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2174 +                                                          <0 => BUG */
2175  };
2176  
2177  #define INIT_THREAD_INFO(tsk)                  \
2178  {                                              \
2179         .flags          = 0,                    \
2180 +       .preempt_lazy_count = 0,                \
2181  }
2182  
2183  #define init_stack             (init_thread_union.stack)
2184 @@ -67,6 +70,10 @@ struct thread_info {
2185  
2186  #include <asm/asm-offsets.h>
2187  
2188 +#define GET_THREAD_INFO(reg) \
2189 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2190 +       _ASM_SUB $(THREAD_SIZE),reg ;
2191 +
2192  #endif
2193  
2194  /*
2195 @@ -85,6 +92,7 @@ struct thread_info {
2196  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2197  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2198  #define TIF_SECCOMP            8       /* secure computing */
2199 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2200  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2201  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2202  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2203 @@ -108,6 +116,7 @@ struct thread_info {
2204  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2206  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2207 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2208  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2209  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2210  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2211 @@ -143,6 +152,8 @@ struct thread_info {
2212  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2213  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2214  
2215 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2216 +
2217  #define STACK_WARN             (THREAD_SIZE/8)
2218  
2219  /*
2220 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2221 index 57ab86d94d64..35d25e27180f 100644
2222 --- a/arch/x86/include/asm/uv/uv_bau.h
2223 +++ b/arch/x86/include/asm/uv/uv_bau.h
2224 @@ -624,9 +624,9 @@ struct bau_control {
2225         cycles_t                send_message;
2226         cycles_t                period_end;
2227         cycles_t                period_time;
2228 -       spinlock_t              uvhub_lock;
2229 -       spinlock_t              queue_lock;
2230 -       spinlock_t              disable_lock;
2231 +       raw_spinlock_t          uvhub_lock;
2232 +       raw_spinlock_t          queue_lock;
2233 +       raw_spinlock_t          disable_lock;
2234         /* tunables */
2235         int                     max_concurr;
2236         int                     max_concurr_const;
2237 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2238   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2239   * on equal.
2240   */
2241 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2242 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2243  {
2244 -       spin_lock(lock);
2245 +       raw_spin_lock(lock);
2246         if (atomic_read(v) >= u) {
2247 -               spin_unlock(lock);
2248 +               raw_spin_unlock(lock);
2249                 return 0;
2250         }
2251         atomic_inc(v);
2252 -       spin_unlock(lock);
2253 +       raw_spin_unlock(lock);
2254         return 1;
2255  }
2256  
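On PREEMPT_RT_FULL a spinlock_t becomes a sleeping lock, so the BAU locks above, which are taken from contexts that must not sleep, are switched to raw_spinlock_t. A minimal kernel-style sketch of the same idiom with made-up names (my_lock, my_counter):

#include <linux/spinlock.h>
#include <linux/atomic.h>

static DEFINE_RAW_SPINLOCK(my_lock);
static atomic_t my_counter = ATOMIC_INIT(0);

/* Increment my_counter unless it has already reached 'limit'; the raw lock
 * keeps this usable from non-sleeping context even on PREEMPT_RT_FULL. */
static int my_inc_unless_ge(int limit)
{
        int taken = 0;

        raw_spin_lock(&my_lock);
        if (atomic_read(&my_counter) < limit) {
                atomic_inc(&my_counter);
                taken = 1;
        }
        raw_spin_unlock(&my_lock);
        return taken;
}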
2257 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2258 index 931ced8ca345..167975ac8af7 100644
2259 --- a/arch/x86/kernel/acpi/boot.c
2260 +++ b/arch/x86/kernel/acpi/boot.c
2261 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2262   *             ->ioapic_mutex
2263   *                     ->ioapic_lock
2264   */
2265 +#ifdef CONFIG_X86_IO_APIC
2266  static DEFINE_MUTEX(acpi_ioapic_lock);
2267 +#endif
2268  
2269  /* --------------------------------------------------------------------------
2270                                Boot-time Configuration
2271 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2272 index d1e25564b3c1..67e585fa801f 100644
2273 --- a/arch/x86/kernel/apic/io_apic.c
2274 +++ b/arch/x86/kernel/apic/io_apic.c
2275 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2276  static inline bool ioapic_irqd_mask(struct irq_data *data)
2277  {
2278         /* If we are moving the irq we need to mask it */
2279 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2280 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2281 +                    !irqd_irq_inprogress(data))) {
2282                 mask_ioapic_irq(data);
2283                 return true;
2284         }
2285 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2286 index c62e015b126c..0cc71257fca6 100644
2287 --- a/arch/x86/kernel/asm-offsets.c
2288 +++ b/arch/x86/kernel/asm-offsets.c
2289 @@ -36,6 +36,7 @@ void common(void) {
2290  
2291         BLANK();
2292         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2293 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2294         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2295  
2296         BLANK();
2297 @@ -91,4 +92,5 @@ void common(void) {
2298  
2299         BLANK();
2300         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2301 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2302  }
2303 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2304 index a7fdf453d895..e3a0e969a66e 100644
2305 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2306 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2307 @@ -41,6 +41,8 @@
2308  #include <linux/debugfs.h>
2309  #include <linux/irq_work.h>
2310  #include <linux/export.h>
2311 +#include <linux/jiffies.h>
2312 +#include <linux/swork.h>
2313  #include <linux/jump_label.h>
2314  
2315  #include <asm/processor.h>
2316 @@ -1317,7 +1319,7 @@ void mce_log_therm_throt_event(__u64 status)
2317  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2318  
2319  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2320 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2321 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2322  
2323  static unsigned long mce_adjust_timer_default(unsigned long interval)
2324  {
2325 @@ -1326,32 +1328,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2326  
2327  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2328  
2329 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2330 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2331  {
2332 -       unsigned long when = jiffies + interval;
2333 -       unsigned long flags;
2334 -
2335 -       local_irq_save(flags);
2336 -
2337 -       if (timer_pending(t)) {
2338 -               if (time_before(when, t->expires))
2339 -                       mod_timer(t, when);
2340 -       } else {
2341 -               t->expires = round_jiffies(when);
2342 -               add_timer_on(t, smp_processor_id());
2343 -       }
2344 -
2345 -       local_irq_restore(flags);
2346 +       if (!interval)
2347 +               return HRTIMER_NORESTART;
2348 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2349 +       return HRTIMER_RESTART;
2350  }
2351  
2352 -static void mce_timer_fn(unsigned long data)
2353 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2354  {
2355 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2356 -       int cpu = smp_processor_id();
2357         unsigned long iv;
2358  
2359 -       WARN_ON(cpu != data);
2360 -
2361         iv = __this_cpu_read(mce_next_interval);
2362  
2363         if (mce_available(this_cpu_ptr(&cpu_info))) {
2364 @@ -1374,7 +1362,7 @@ static void mce_timer_fn(unsigned long data)
2365  
2366  done:
2367         __this_cpu_write(mce_next_interval, iv);
2368 -       __restart_timer(t, iv);
2369 +       return __restart_timer(timer, iv);
2370  }
2371  
2372  /*
2373 @@ -1382,7 +1370,7 @@ static void mce_timer_fn(unsigned long data)
2374   */
2375  void mce_timer_kick(unsigned long interval)
2376  {
2377 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2378 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2379         unsigned long iv = __this_cpu_read(mce_next_interval);
2380  
2381         __restart_timer(t, interval);
2382 @@ -1397,7 +1385,7 @@ static void mce_timer_delete_all(void)
2383         int cpu;
2384  
2385         for_each_online_cpu(cpu)
2386 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2387 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2388  }
2389  
2390  static void mce_do_trigger(struct work_struct *work)
2391 @@ -1407,6 +1395,56 @@ static void mce_do_trigger(struct work_struct *work)
2392  
2393  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2394  
2395 +static void __mce_notify_work(struct swork_event *event)
2396 +{
2397 +       /* Not more than two messages every minute */
2398 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2399 +
2400 +       /* wake processes polling /dev/mcelog */
2401 +       wake_up_interruptible(&mce_chrdev_wait);
2402 +
2403 +       /*
2404 +        * There is no risk of missing notifications because
2405 +        * work_pending is always cleared before the function is
2406 +        * executed.
2407 +        */
2408 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2409 +               schedule_work(&mce_trigger_work);
2410 +
2411 +       if (__ratelimit(&ratelimit))
2412 +               pr_info(HW_ERR "Machine check events logged\n");
2413 +}
2414 +
2415 +#ifdef CONFIG_PREEMPT_RT_FULL
2416 +static bool notify_work_ready __read_mostly;
2417 +static struct swork_event notify_work;
2418 +
2419 +static int mce_notify_work_init(void)
2420 +{
2421 +       int err;
2422 +
2423 +       err = swork_get();
2424 +       if (err)
2425 +               return err;
2426 +
2427 +       INIT_SWORK(&notify_work, __mce_notify_work);
2428 +       notify_work_ready = true;
2429 +       return 0;
2430 +}
2431 +
2432 +static void mce_notify_work(void)
2433 +{
2434 +       if (notify_work_ready)
2435 +               swork_queue(&notify_work);
2436 +}
2437 +#else
2438 +static void mce_notify_work(void)
2439 +{
2440 +       __mce_notify_work(NULL);
2441 +}
2442 +static inline int mce_notify_work_init(void) { return 0; }
2443 +#endif
2444 +
2445  /*
2446   * Notify the user(s) about new machine check events.
2447   * Can be called from interrupt context, but not from machine check/NMI
2448 @@ -1414,19 +1452,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2449   */
2450  int mce_notify_irq(void)
2451  {
2452 -       /* Not more than two messages every minute */
2453 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2454 -
2455         if (test_and_clear_bit(0, &mce_need_notify)) {
2456 -               /* wake processes polling /dev/mcelog */
2457 -               wake_up_interruptible(&mce_chrdev_wait);
2458 -
2459 -               if (mce_helper[0])
2460 -                       schedule_work(&mce_trigger_work);
2461 -
2462 -               if (__ratelimit(&ratelimit))
2463 -                       pr_info(HW_ERR "Machine check events logged\n");
2464 -
2465 +               mce_notify_work();
2466                 return 1;
2467         }
2468         return 0;
2469 @@ -1732,7 +1759,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2470         }
2471  }
2472  
2473 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2474 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2475  {
2476         unsigned long iv = check_interval * HZ;
2477  
2478 @@ -1741,16 +1768,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2479  
2480         per_cpu(mce_next_interval, cpu) = iv;
2481  
2482 -       t->expires = round_jiffies(jiffies + iv);
2483 -       add_timer_on(t, cpu);
2484 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2485 +                       0, HRTIMER_MODE_REL_PINNED);
2486  }
2487  
2488  static void __mcheck_cpu_init_timer(void)
2489  {
2490 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2491 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2492         unsigned int cpu = smp_processor_id();
2493  
2494 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2495 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2496 +       t->function = mce_timer_fn;
2497         mce_start_timer(cpu, t);
2498  }
2499  
2500 @@ -2475,6 +2503,8 @@ static void mce_disable_cpu(void *h)
2501         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2502                 return;
2503  
2504 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2505 +
2506         if (!(action & CPU_TASKS_FROZEN))
2507                 cmci_clear();
2508  
2509 @@ -2497,6 +2527,7 @@ static void mce_reenable_cpu(void *h)
2510                 if (b->init)
2511                         wrmsrl(msr_ops.ctl(i), b->ctl);
2512         }
2513 +       __mcheck_cpu_init_timer();
2514  }
2515  
2516  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2517 @@ -2504,7 +2535,6 @@ static int
2518  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2519  {
2520         unsigned int cpu = (unsigned long)hcpu;
2521 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2522  
2523         switch (action & ~CPU_TASKS_FROZEN) {
2524         case CPU_ONLINE:
2525 @@ -2524,11 +2554,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2526                 break;
2527         case CPU_DOWN_PREPARE:
2528                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2529 -               del_timer_sync(t);
2530                 break;
2531         case CPU_DOWN_FAILED:
2532                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2533 -               mce_start_timer(cpu, t);
2534                 break;
2535         }
2536  
2537 @@ -2567,6 +2595,10 @@ static __init int mcheck_init_device(void)
2538                 goto err_out;
2539         }
2540  
2541 +       err = mce_notify_work_init();
2542 +       if (err)
2543 +               goto err_out;
2544 +
2545         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2546                 err = -ENOMEM;
2547                 goto err_out;
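The mce hunks above replace the per-CPU timer_list with an hrtimer whose callback re-arms itself through hrtimer_forward_now(). A small module-style sketch of that self-rearming pattern, with made-up names and a made-up 500 ms period:

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer poll_timer;

static enum hrtimer_restart poll_timer_fn(struct hrtimer *timer)
{
        pr_info("poll tick\n");
        /* Re-arm relative to now, like the patched mce_timer_fn() does. */
        hrtimer_forward_now(timer, ms_to_ktime(500));
        return HRTIMER_RESTART;
}

static int __init poll_init(void)
{
        hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        poll_timer.function = poll_timer_fn;
        hrtimer_start(&poll_timer, ms_to_ktime(500), HRTIMER_MODE_REL);
        return 0;
}

static void __exit poll_exit(void)
{
        hrtimer_cancel(&poll_timer);
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");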
2548 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2549 index 1f38d9a4d9de..053bf3b2ef39 100644
2550 --- a/arch/x86/kernel/irq_32.c
2551 +++ b/arch/x86/kernel/irq_32.c
2552 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2553                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2554  }
2555  
2556 +#ifndef CONFIG_PREEMPT_RT_FULL
2557  void do_softirq_own_stack(void)
2558  {
2559         struct irq_stack *irqstk;
2560 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2561  
2562         call_on_stack(__do_softirq, isp);
2563  }
2564 +#endif
2565  
2566  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2567  {
2568 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2569 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2570 --- a/arch/x86/kernel/process_32.c
2571 +++ b/arch/x86/kernel/process_32.c
2572 @@ -35,6 +35,7 @@
2573  #include <linux/uaccess.h>
2574  #include <linux/io.h>
2575  #include <linux/kdebug.h>
2576 +#include <linux/highmem.h>
2577  
2578  #include <asm/pgtable.h>
2579  #include <asm/ldt.h>
2580 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2581  }
2582  EXPORT_SYMBOL_GPL(start_thread);
2583  
2584 +#ifdef CONFIG_PREEMPT_RT_FULL
2585 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2586 +{
2587 +       int i;
2588 +
2589 +       /*
2590 +        * Clear @prev's kmap_atomic mappings
2591 +        */
2592 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2593 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2594 +               pte_t *ptep = kmap_pte - idx;
2595 +
2596 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2597 +       }
2598 +       /*
2599 +        * Restore @next_p's kmap_atomic mappings
2600 +        */
2601 +       for (i = 0; i < next_p->kmap_idx; i++) {
2602 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2603 +
2604 +               if (!pte_none(next_p->kmap_pte[i]))
2605 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2606 +       }
2607 +}
2608 +#else
2609 +static inline void
2610 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2611 +#endif
2612 +
2613  
2614  /*
2615   *     switch_to(x,y) should switch tasks from x to y.
2616 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2617                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2618                 __switch_to_xtra(prev_p, next_p, tss);
2619  
2620 +       switch_kmaps(prev_p, next_p);
2621 +
2622         /*
2623          * Leave lazy mode, flushing any hypercalls made here.
2624          * This must be done before restoring TLS segments so
2625 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2626 index 3f05c044720b..fe68afd37162 100644
2627 --- a/arch/x86/kvm/lapic.c
2628 +++ b/arch/x86/kvm/lapic.c
2629 @@ -1939,6 +1939,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2630         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2631                      HRTIMER_MODE_ABS_PINNED);
2632         apic->lapic_timer.timer.function = apic_timer_fn;
2633 +       apic->lapic_timer.timer.irqsafe = 1;
2634  
2635         /*
2636          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2637 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2638 index e5bc139d1ba7..fa0aa5931a4b 100644
2639 --- a/arch/x86/kvm/x86.c
2640 +++ b/arch/x86/kvm/x86.c
2641 @@ -5933,6 +5933,13 @@ int kvm_arch_init(void *opaque)
2642                 goto out;
2643         }
2644  
2645 +#ifdef CONFIG_PREEMPT_RT_FULL
2646 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2647 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2648 +               return -EOPNOTSUPP;
2649 +       }
2650 +#endif
2651 +
2652         r = kvm_mmu_module_init();
2653         if (r)
2654                 goto out_free_percpu;
2655 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2656 index 6d18b70ed5a9..f752724c22e8 100644
2657 --- a/arch/x86/mm/highmem_32.c
2658 +++ b/arch/x86/mm/highmem_32.c
2659 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2660   */
2661  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2662  {
2663 +       pte_t pte = mk_pte(page, prot);
2664         unsigned long vaddr;
2665         int idx, type;
2666  
2667 -       preempt_disable();
2668 +       preempt_disable_nort();
2669         pagefault_disable();
2670  
2671         if (!PageHighMem(page))
2672 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2673         idx = type + KM_TYPE_NR*smp_processor_id();
2674         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2675         BUG_ON(!pte_none(*(kmap_pte-idx)));
2676 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2677 +#ifdef CONFIG_PREEMPT_RT_FULL
2678 +       current->kmap_pte[type] = pte;
2679 +#endif
2680 +       set_pte(kmap_pte-idx, pte);
2681         arch_flush_lazy_mmu_mode();
2682  
2683         return (void *)vaddr;
2684 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2685                  * is a bad idea also, in case the page changes cacheability
2686                  * attributes or becomes a protected page in a hypervisor.
2687                  */
2688 +#ifdef CONFIG_PREEMPT_RT_FULL
2689 +               current->kmap_pte[type] = __pte(0);
2690 +#endif
2691                 kpte_clear_flush(kmap_pte-idx, vaddr);
2692                 kmap_atomic_idx_pop();
2693                 arch_flush_lazy_mmu_mode();
2694 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2695  #endif
2696  
2697         pagefault_enable();
2698 -       preempt_enable();
2699 +       preempt_enable_nort();
2700  }
2701  EXPORT_SYMBOL(__kunmap_atomic);
2702  
2703 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2704 index ada98b39b8ad..585f6829653b 100644
2705 --- a/arch/x86/mm/iomap_32.c
2706 +++ b/arch/x86/mm/iomap_32.c
2707 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2708  
2709  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2710  {
2711 +       pte_t pte = pfn_pte(pfn, prot);
2712         unsigned long vaddr;
2713         int idx, type;
2714  
2715 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2716         type = kmap_atomic_idx_push();
2717         idx = type + KM_TYPE_NR * smp_processor_id();
2718         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2719 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2720 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2721 +
2722 +#ifdef CONFIG_PREEMPT_RT_FULL
2723 +       current->kmap_pte[type] = pte;
2724 +#endif
2725 +       set_pte(kmap_pte - idx, pte);
2726         arch_flush_lazy_mmu_mode();
2727  
2728         return (void *)vaddr;
2729 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2730                  * is a bad idea also, in case the page changes cacheability
2731                  * attributes or becomes a protected page in a hypervisor.
2732                  */
2733 +#ifdef CONFIG_PREEMPT_RT_FULL
2734 +               current->kmap_pte[type] = __pte(0);
2735 +#endif
2736                 kpte_clear_flush(kmap_pte-idx, vaddr);
2737                 kmap_atomic_idx_pop();
2738         }
2739 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2740 index e3353c97d086..01664968555c 100644
2741 --- a/arch/x86/mm/pageattr.c
2742 +++ b/arch/x86/mm/pageattr.c
2743 @@ -214,7 +214,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
2744                             int in_flags, struct page **pages)
2745  {
2746         unsigned int i, level;
2747 +#ifdef CONFIG_PREEMPT
2748 +       /*
2749 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2750 +        * regardless of any CPU isolation that may be in effect.
2751 +        */
2752 +       unsigned long do_wbinvd = 0;
2753 +#else
2754         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2755 +#endif
2756  
2757         BUG_ON(irqs_disabled());
2758  
2759 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2760 index 9e42842e924a..5398f97172f9 100644
2761 --- a/arch/x86/platform/uv/tlb_uv.c
2762 +++ b/arch/x86/platform/uv/tlb_uv.c
2763 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2764  
2765                 quiesce_local_uvhub(hmaster);
2766  
2767 -               spin_lock(&hmaster->queue_lock);
2768 +               raw_spin_lock(&hmaster->queue_lock);
2769                 reset_with_ipi(&bau_desc->distribution, bcp);
2770 -               spin_unlock(&hmaster->queue_lock);
2771 +               raw_spin_unlock(&hmaster->queue_lock);
2772  
2773                 end_uvhub_quiesce(hmaster);
2774  
2775 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2776  
2777                 quiesce_local_uvhub(hmaster);
2778  
2779 -               spin_lock(&hmaster->queue_lock);
2780 +               raw_spin_lock(&hmaster->queue_lock);
2781                 reset_with_ipi(&bau_desc->distribution, bcp);
2782 -               spin_unlock(&hmaster->queue_lock);
2783 +               raw_spin_unlock(&hmaster->queue_lock);
2784  
2785                 end_uvhub_quiesce(hmaster);
2786  
2787 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2788         cycles_t tm1;
2789  
2790         hmaster = bcp->uvhub_master;
2791 -       spin_lock(&hmaster->disable_lock);
2792 +       raw_spin_lock(&hmaster->disable_lock);
2793         if (!bcp->baudisabled) {
2794                 stat->s_bau_disabled++;
2795                 tm1 = get_cycles();
2796 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2797                         }
2798                 }
2799         }
2800 -       spin_unlock(&hmaster->disable_lock);
2801 +       raw_spin_unlock(&hmaster->disable_lock);
2802  }
2803  
2804  static void count_max_concurr(int stat, struct bau_control *bcp,
2805 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2806   */
2807  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2808  {
2809 -       spinlock_t *lock = &hmaster->uvhub_lock;
2810 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2811         atomic_t *v;
2812  
2813         v = &hmaster->active_descriptor_count;
2814 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2815         struct bau_control *hmaster;
2816  
2817         hmaster = bcp->uvhub_master;
2818 -       spin_lock(&hmaster->disable_lock);
2819 +       raw_spin_lock(&hmaster->disable_lock);
2820         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2821                 stat->s_bau_reenabled++;
2822                 for_each_present_cpu(tcpu) {
2823 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2824                                 tbcp->period_giveups = 0;
2825                         }
2826                 }
2827 -               spin_unlock(&hmaster->disable_lock);
2828 +               raw_spin_unlock(&hmaster->disable_lock);
2829                 return 0;
2830         }
2831 -       spin_unlock(&hmaster->disable_lock);
2832 +       raw_spin_unlock(&hmaster->disable_lock);
2833         return -1;
2834  }
2835  
2836 @@ -1940,9 +1940,9 @@ static void __init init_per_cpu_tunables(void)
2837                 bcp->cong_reps                  = congested_reps;
2838                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2839                 bcp->giveup_limit               = giveup_limit;
2840 -               spin_lock_init(&bcp->queue_lock);
2841 -               spin_lock_init(&bcp->uvhub_lock);
2842 -               spin_lock_init(&bcp->disable_lock);
2843 +               raw_spin_lock_init(&bcp->queue_lock);
2844 +               raw_spin_lock_init(&bcp->uvhub_lock);
2845 +               raw_spin_lock_init(&bcp->disable_lock);
2846         }
2847  }
2848  
2849 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2850 index b333fc45f9ec..8b85916e6986 100644
2851 --- a/arch/x86/platform/uv/uv_time.c
2852 +++ b/arch/x86/platform/uv/uv_time.c
2853 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2854  
2855  /* There is one of these allocated per node */
2856  struct uv_rtc_timer_head {
2857 -       spinlock_t      lock;
2858 +       raw_spinlock_t  lock;
2859         /* next cpu waiting for timer, local node relative: */
2860         int             next_cpu;
2861         /* number of cpus on this node: */
2862 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2863                                 uv_rtc_deallocate_timers();
2864                                 return -ENOMEM;
2865                         }
2866 -                       spin_lock_init(&head->lock);
2867 +                       raw_spin_lock_init(&head->lock);
2868                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2869                         head->next_cpu = -1;
2870                         blade_info[bid] = head;
2871 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2872         unsigned long flags;
2873         int next_cpu;
2874  
2875 -       spin_lock_irqsave(&head->lock, flags);
2876 +       raw_spin_lock_irqsave(&head->lock, flags);
2877  
2878         next_cpu = head->next_cpu;
2879         *t = expires;
2880 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2881                 if (uv_setup_intr(cpu, expires)) {
2882                         *t = ULLONG_MAX;
2883                         uv_rtc_find_next_timer(head, pnode);
2884 -                       spin_unlock_irqrestore(&head->lock, flags);
2885 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2886                         return -ETIME;
2887                 }
2888         }
2889  
2890 -       spin_unlock_irqrestore(&head->lock, flags);
2891 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2892         return 0;
2893  }
2894  
2895 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2896         unsigned long flags;
2897         int rc = 0;
2898  
2899 -       spin_lock_irqsave(&head->lock, flags);
2900 +       raw_spin_lock_irqsave(&head->lock, flags);
2901  
2902         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2903                 rc = 1;
2904 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2905                         uv_rtc_find_next_timer(head, pnode);
2906         }
2907  
2908 -       spin_unlock_irqrestore(&head->lock, flags);
2909 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2910  
2911         return rc;
2912  }
2913 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2914  static cycle_t uv_read_rtc(struct clocksource *cs)
2915  {
2916         unsigned long offset;
2917 +       cycle_t cycles;
2918  
2919 +       preempt_disable();
2920         if (uv_get_min_hub_revision_id() == 1)
2921                 offset = 0;
2922         else
2923                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2924  
2925 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2926 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2927 +       preempt_enable();
2928 +
2929 +       return cycles;
2930  }
2931  
2932  /*
2933 diff --git a/block/blk-core.c b/block/blk-core.c
2934 index 14d7c0740dc0..dfd905bea77c 100644
2935 --- a/block/blk-core.c
2936 +++ b/block/blk-core.c
2937 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2938  
2939         INIT_LIST_HEAD(&rq->queuelist);
2940         INIT_LIST_HEAD(&rq->timeout_list);
2941 +#ifdef CONFIG_PREEMPT_RT_FULL
2942 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2943 +#endif
2944         rq->cpu = -1;
2945         rq->q = q;
2946         rq->__sector = (sector_t) -1;
2947 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2948   **/
2949  void blk_start_queue(struct request_queue *q)
2950  {
2951 -       WARN_ON(!irqs_disabled());
2952 +       WARN_ON_NONRT(!irqs_disabled());
2953  
2954         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2955         __blk_run_queue(q);
2956 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2957                 if (nowait)
2958                         return -EBUSY;
2959  
2960 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2961 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2962                                 !atomic_read(&q->mq_freeze_depth) ||
2963                                 blk_queue_dying(q));
2964                 if (blk_queue_dying(q))
2965 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2966         struct request_queue *q =
2967                 container_of(ref, struct request_queue, q_usage_counter);
2968  
2969 -       wake_up_all(&q->mq_freeze_wq);
2970 +       swake_up_all(&q->mq_freeze_wq);
2971  }
2972  
2973  static void blk_rq_timed_out_timer(unsigned long data)
2974 @@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
2975         q->bypass_depth = 1;
2976         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2977  
2978 -       init_waitqueue_head(&q->mq_freeze_wq);
2979 +       init_swait_queue_head(&q->mq_freeze_wq);
2980  
2981         /*
2982          * Init percpu_ref in atomic mode so that it's faster to shutdown.
2983 @@ -3177,7 +3180,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2984                 blk_run_queue_async(q);
2985         else
2986                 __blk_run_queue(q);
2987 -       spin_unlock(q->queue_lock);
2988 +       spin_unlock_irq(q->queue_lock);
2989  }
2990  
2991  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2992 @@ -3225,7 +3228,6 @@ EXPORT_SYMBOL(blk_check_plugged);
2993  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2994  {
2995         struct request_queue *q;
2996 -       unsigned long flags;
2997         struct request *rq;
2998         LIST_HEAD(list);
2999         unsigned int depth;
3000 @@ -3245,11 +3247,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3001         q = NULL;
3002         depth = 0;
3003  
3004 -       /*
3005 -        * Save and disable interrupts here, to avoid doing it for every
3006 -        * queue lock we have to take.
3007 -        */
3008 -       local_irq_save(flags);
3009         while (!list_empty(&list)) {
3010                 rq = list_entry_rq(list.next);
3011                 list_del_init(&rq->queuelist);
3012 @@ -3262,7 +3259,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3013                                 queue_unplugged(q, depth, from_schedule);
3014                         q = rq->q;
3015                         depth = 0;
3016 -                       spin_lock(q->queue_lock);
3017 +                       spin_lock_irq(q->queue_lock);
3018                 }
3019  
3020                 /*
3021 @@ -3289,8 +3286,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3022          */
3023         if (q)
3024                 queue_unplugged(q, depth, from_schedule);
3025 -
3026 -       local_irq_restore(flags);
3027  }
3028  
3029  void blk_finish_plug(struct blk_plug *plug)
3030 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3031 index 381cb50a673c..dc8785233d94 100644
3032 --- a/block/blk-ioc.c
3033 +++ b/block/blk-ioc.c
3034 @@ -7,6 +7,7 @@
3035  #include <linux/bio.h>
3036  #include <linux/blkdev.h>
3037  #include <linux/slab.h>
3038 +#include <linux/delay.h>
3039  
3040  #include "blk.h"
3041  
3042 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3043                         spin_unlock(q->queue_lock);
3044                 } else {
3045                         spin_unlock_irqrestore(&ioc->lock, flags);
3046 -                       cpu_relax();
3047 +                       cpu_chill();
3048                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3049                 }
3050         }
3051 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3052                         spin_unlock(icq->q->queue_lock);
3053                 } else {
3054                         spin_unlock_irqrestore(&ioc->lock, flags);
3055 -                       cpu_relax();
3056 +                       cpu_chill();
3057                         goto retry;
3058                 }
3059         }
3060 diff --git a/block/blk-mq.c b/block/blk-mq.c
3061 index ee54ad01f7ac..1a428fe7bbe1 100644
3062 --- a/block/blk-mq.c
3063 +++ b/block/blk-mq.c
3064 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3065  
3066  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3067  {
3068 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3069 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3070  }
3071  
3072  /*
3073 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3074         WARN_ON_ONCE(freeze_depth < 0);
3075         if (!freeze_depth) {
3076                 percpu_ref_reinit(&q->q_usage_counter);
3077 -               wake_up_all(&q->mq_freeze_wq);
3078 +               swake_up_all(&q->mq_freeze_wq);
3079         }
3080  }
3081  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3082 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3083          * dying, we need to ensure that processes currently waiting on
3084          * the queue are notified as well.
3085          */
3086 -       wake_up_all(&q->mq_freeze_wq);
3087 +       swake_up_all(&q->mq_freeze_wq);
3088  }
3089  
3090  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3091 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3092         rq->resid_len = 0;
3093         rq->sense = NULL;
3094  
3095 +#ifdef CONFIG_PREEMPT_RT_FULL
3096 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3097 +#endif
3098         INIT_LIST_HEAD(&rq->timeout_list);
3099         rq->timeout = 0;
3100  
3101 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3102  }
3103  EXPORT_SYMBOL(blk_mq_end_request);
3104  
3105 +#ifdef CONFIG_PREEMPT_RT_FULL
3106 +
3107 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3108 +{
3109 +       struct request *rq = container_of(work, struct request, work);
3110 +
3111 +       rq->q->softirq_done_fn(rq);
3112 +}
3113 +
3114 +#else
3115 +
3116  static void __blk_mq_complete_request_remote(void *data)
3117  {
3118         struct request *rq = data;
3119 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3120         rq->q->softirq_done_fn(rq);
3121  }
3122  
3123 +#endif
3124 +
3125  static void blk_mq_ipi_complete_request(struct request *rq)
3126  {
3127         struct blk_mq_ctx *ctx = rq->mq_ctx;
3128 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3129                 return;
3130         }
3131  
3132 -       cpu = get_cpu();
3133 +       cpu = get_cpu_light();
3134         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3135                 shared = cpus_share_cache(cpu, ctx->cpu);
3136  
3137         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3138 +#ifdef CONFIG_PREEMPT_RT_FULL
3139 +               schedule_work_on(ctx->cpu, &rq->work);
3140 +#else
3141                 rq->csd.func = __blk_mq_complete_request_remote;
3142                 rq->csd.info = rq;
3143                 rq->csd.flags = 0;
3144                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3145 +#endif
3146         } else {
3147                 rq->q->softirq_done_fn(rq);
3148         }
3149 -       put_cpu();
3150 +       put_cpu_light();
3151  }
3152  
3153  static void __blk_mq_complete_request(struct request *rq)
3154 @@ -906,14 +926,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3155                 return;
3156  
3157         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3158 -               int cpu = get_cpu();
3159 +               int cpu = get_cpu_light();
3160                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3161                         __blk_mq_run_hw_queue(hctx);
3162 -                       put_cpu();
3163 +                       put_cpu_light();
3164                         return;
3165                 }
3166  
3167 -               put_cpu();
3168 +               put_cpu_light();
3169         }
3170  
3171         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
3172 diff --git a/block/blk-mq.h b/block/blk-mq.h
3173 index e5d25249028c..1e846b842eab 100644
3174 --- a/block/blk-mq.h
3175 +++ b/block/blk-mq.h
3176 @@ -72,12 +72,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3177   */
3178  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3179  {
3180 -       return __blk_mq_get_ctx(q, get_cpu());
3181 +       return __blk_mq_get_ctx(q, get_cpu_light());
3182  }
3183  
3184  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3185  {
3186 -       put_cpu();
3187 +       put_cpu_light();
3188  }
3189  
3190  struct blk_mq_alloc_data {
3191 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3192 index 06cf9807f49a..c40342643ca0 100644
3193 --- a/block/blk-softirq.c
3194 +++ b/block/blk-softirq.c
3195 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3196                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3197  
3198         local_irq_restore(flags);
3199 +       preempt_check_resched_rt();
3200  }
3201  
3202  /*
3203 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3204                          this_cpu_ptr(&blk_cpu_done));
3205         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3206         local_irq_enable();
3207 +       preempt_check_resched_rt();
3208  
3209         return 0;
3210  }
3211 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3212                 goto do_local;
3213  
3214         local_irq_restore(flags);
3215 +       preempt_check_resched_rt();
3216  }
3217  
3218  /**
3219 diff --git a/block/bounce.c b/block/bounce.c
3220 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3221 --- a/block/bounce.c
3222 +++ b/block/bounce.c
3223 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3224         unsigned long flags;
3225         unsigned char *vto;
3226  
3227 -       local_irq_save(flags);
3228 +       local_irq_save_nort(flags);
3229         vto = kmap_atomic(to->bv_page);
3230         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3231         kunmap_atomic(vto);
3232 -       local_irq_restore(flags);
3233 +       local_irq_restore_nort(flags);
3234  }
3235  
3236  #else /* CONFIG_HIGHMEM */
3237 diff --git a/crypto/algapi.c b/crypto/algapi.c
3238 index 1fad2a6b3bbb..ecb7315426a9 100644
3239 --- a/crypto/algapi.c
3240 +++ b/crypto/algapi.c
3241 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3242  
3243  int crypto_register_notifier(struct notifier_block *nb)
3244  {
3245 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3246 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3247  }
3248  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3249  
3250  int crypto_unregister_notifier(struct notifier_block *nb)
3251  {
3252 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3253 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3254  }
3255  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3256  
3257 diff --git a/crypto/api.c b/crypto/api.c
3258 index bbc147cb5dec..bc1a848f02ec 100644
3259 --- a/crypto/api.c
3260 +++ b/crypto/api.c
3261 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3262  DECLARE_RWSEM(crypto_alg_sem);
3263  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3264  
3265 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3266 +SRCU_NOTIFIER_HEAD(crypto_chain);
3267  EXPORT_SYMBOL_GPL(crypto_chain);
3268  
3269  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3270 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3271  {
3272         int ok;
3273  
3274 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3275 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3276         if (ok == NOTIFY_DONE) {
3277                 request_module("cryptomgr");
3278 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3279 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3280         }
3281  
3282         return ok;
3283 diff --git a/crypto/internal.h b/crypto/internal.h
3284 index 7eefcdb00227..0ecc7f5a2f40 100644
3285 --- a/crypto/internal.h
3286 +++ b/crypto/internal.h
3287 @@ -47,7 +47,7 @@ struct crypto_larval {
3288  
3289  extern struct list_head crypto_alg_list;
3290  extern struct rw_semaphore crypto_alg_sem;
3291 -extern struct blocking_notifier_head crypto_chain;
3292 +extern struct srcu_notifier_head crypto_chain;
3293  
3294  #ifdef CONFIG_PROC_FS
3295  void __init crypto_init_proc(void);
3296 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3297  
3298  static inline void crypto_notify(unsigned long val, void *v)
3299  {
3300 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3301 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3302  }
3303  
3304  #endif /* _CRYPTO_INTERNAL_H */
3305 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3306 index 750fa824d42c..441edf51484a 100644
3307 --- a/drivers/acpi/acpica/acglobal.h
3308 +++ b/drivers/acpi/acpica/acglobal.h
3309 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3310   * interrupt level
3311   */
3312  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3313 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3314 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3315  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3316  
3317  /* Mutex for _OSI support */
3318 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3319 index 3b7fb99362b6..696bf8e62afb 100644
3320 --- a/drivers/acpi/acpica/hwregs.c
3321 +++ b/drivers/acpi/acpica/hwregs.c
3322 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3323                           ACPI_BITMASK_ALL_FIXED_STATUS,
3324                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3325  
3326 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3327 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3328  
3329         /* Clear the fixed events in PM1 A/B */
3330  
3331         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3332                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3333  
3334 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3335 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3336  
3337         if (ACPI_FAILURE(status)) {
3338                 goto exit;
3339 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3340 index 98c26ff39409..6e236f2ea791 100644
3341 --- a/drivers/acpi/acpica/hwxface.c
3342 +++ b/drivers/acpi/acpica/hwxface.c
3343 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3344                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3345         }
3346  
3347 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3348 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3349  
3350         /*
3351          * At this point, we know that the parent register is one of the
3352 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3353  
3354  unlock_and_exit:
3355  
3356 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3357 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3358         return_ACPI_STATUS(status);
3359  }
3360  
3361 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3362 index 15073375bd00..357e7ca5a587 100644
3363 --- a/drivers/acpi/acpica/utmutex.c
3364 +++ b/drivers/acpi/acpica/utmutex.c
3365 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3366                 return_ACPI_STATUS (status);
3367         }
3368  
3369 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3370 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3371         if (ACPI_FAILURE (status)) {
3372                 return_ACPI_STATUS (status);
3373         }
3374 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3375         /* Delete the spinlocks */
3376  
3377         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3378 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3379 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3380         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3381  
3382         /* Delete the reader/writer lock */
3383 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3384 index 051b6158d1b7..7ad293bef6ed 100644
3385 --- a/drivers/ata/libata-sff.c
3386 +++ b/drivers/ata/libata-sff.c
3387 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3388         unsigned long flags;
3389         unsigned int consumed;
3390  
3391 -       local_irq_save(flags);
3392 +       local_irq_save_nort(flags);
3393         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3394 -       local_irq_restore(flags);
3395 +       local_irq_restore_nort(flags);
3396  
3397         return consumed;
3398  }
3399 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3400                 unsigned long flags;
3401  
3402                 /* FIXME: use a bounce buffer */
3403 -               local_irq_save(flags);
3404 +               local_irq_save_nort(flags);
3405                 buf = kmap_atomic(page);
3406  
3407                 /* do the actual data transfer */
3408 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3409                                        do_write);
3410  
3411                 kunmap_atomic(buf);
3412 -               local_irq_restore(flags);
3413 +               local_irq_restore_nort(flags);
3414         } else {
3415                 buf = page_address(page);
3416                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3417 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3418                 unsigned long flags;
3419  
3420                 /* FIXME: use bounce buffer */
3421 -               local_irq_save(flags);
3422 +               local_irq_save_nort(flags);
3423                 buf = kmap_atomic(page);
3424  
3425                 /* do the actual data transfer */
3426 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3427                                                                 count, rw);
3428  
3429                 kunmap_atomic(buf);
3430 -               local_irq_restore(flags);
3431 +               local_irq_restore_nort(flags);
3432         } else {
3433                 buf = page_address(page);
3434                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3435 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3436 index 4b5cd3a7b2b6..fa8329ad79fd 100644
3437 --- a/drivers/block/zram/zcomp.c
3438 +++ b/drivers/block/zram/zcomp.c
3439 @@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3440  
3441  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3442  {
3443 -       return *get_cpu_ptr(comp->stream);
3444 +       struct zcomp_strm *zstrm;
3445 +
3446 +       zstrm = *this_cpu_ptr(comp->stream);
3447 +       spin_lock(&zstrm->zcomp_lock);
3448 +       return zstrm;
3449  }
3450  
3451  void zcomp_stream_put(struct zcomp *comp)
3452  {
3453 -       put_cpu_ptr(comp->stream);
3454 +       struct zcomp_strm *zstrm;
3455 +
3456 +       zstrm = *this_cpu_ptr(comp->stream);
3457 +       spin_unlock(&zstrm->zcomp_lock);
3458  }
3459  
3460  int zcomp_compress(struct zcomp_strm *zstrm,
3461 @@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3462                         pr_err("Can't allocate a compression stream\n");
3463                         return NOTIFY_BAD;
3464                 }
3465 +               spin_lock_init(&zstrm->zcomp_lock);
3466                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3467                 break;
3468         case CPU_DEAD:
3469 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3470 index 478cac2ed465..f7a6efdc3285 100644
3471 --- a/drivers/block/zram/zcomp.h
3472 +++ b/drivers/block/zram/zcomp.h
3473 @@ -14,6 +14,7 @@ struct zcomp_strm {
3474         /* compression/decompression buffer */
3475         void *buffer;
3476         struct crypto_comp *tfm;
3477 +       spinlock_t zcomp_lock;
3478  };
3479  
3480  /* dynamic per-device compression frontend */
3481 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3482 index d2ef51ca9cf4..05e749736560 100644
3483 --- a/drivers/block/zram/zram_drv.c
3484 +++ b/drivers/block/zram/zram_drv.c
3485 @@ -528,6 +528,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3486                 goto out_error;
3487         }
3488  
3489 +       zram_meta_init_table_locks(meta, disksize);
3490 +
3491         return meta;
3492  
3493  out_error:
3494 @@ -575,28 +577,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3495         struct zram_meta *meta = zram->meta;
3496         unsigned long handle;
3497         unsigned int size;
3498 +       struct zcomp_strm *zstrm;
3499  
3500 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3501 +       zram_lock_table(&meta->table[index]);
3502         handle = meta->table[index].handle;
3503         size = zram_get_obj_size(meta, index);
3504  
3505         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3506 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3507 +               zram_unlock_table(&meta->table[index]);
3508                 clear_page(mem);
3509                 return 0;
3510         }
3511  
3512 +       zstrm = zcomp_stream_get(zram->comp);
3513         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3514         if (size == PAGE_SIZE) {
3515                 copy_page(mem, cmem);
3516         } else {
3517 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3518 -
3519                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3520 -               zcomp_stream_put(zram->comp);
3521         }
3522         zs_unmap_object(meta->mem_pool, handle);
3523 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3524 +       zcomp_stream_put(zram->comp);
3525 +       zram_unlock_table(&meta->table[index]);
3526  
3527         /* Should NEVER happen. Return bio error if it does. */
3528         if (unlikely(ret)) {
3529 @@ -616,14 +618,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3530         struct zram_meta *meta = zram->meta;
3531         page = bvec->bv_page;
3532  
3533 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3534 +       zram_lock_table(&meta->table[index]);
3535         if (unlikely(!meta->table[index].handle) ||
3536                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3537 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3538 +               zram_unlock_table(&meta->table[index]);
3539                 handle_zero_page(bvec);
3540                 return 0;
3541         }
3542 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3543 +       zram_unlock_table(&meta->table[index]);
3544  
3545         if (is_partial_io(bvec))
3546                 /* Use  a temporary buffer to decompress the page */
3547 @@ -700,10 +702,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3548                 if (user_mem)
3549                         kunmap_atomic(user_mem);
3550                 /* Free memory associated with this sector now. */
3551 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3552 +               zram_lock_table(&meta->table[index]);
3553                 zram_free_page(zram, index);
3554                 zram_set_flag(meta, index, ZRAM_ZERO);
3555 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3556 +               zram_unlock_table(&meta->table[index]);
3557  
3558                 atomic64_inc(&zram->stats.zero_pages);
3559                 ret = 0;
3560 @@ -794,12 +796,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3561          * Free memory associated with this sector
3562          * before overwriting unused sectors.
3563          */
3564 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3565 +       zram_lock_table(&meta->table[index]);
3566         zram_free_page(zram, index);
3567  
3568         meta->table[index].handle = handle;
3569         zram_set_obj_size(meta, index, clen);
3570 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3571 +       zram_unlock_table(&meta->table[index]);
3572  
3573         /* Update stats */
3574         atomic64_add(clen, &zram->stats.compr_data_size);
3575 @@ -842,9 +844,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3576         }
3577  
3578         while (n >= PAGE_SIZE) {
3579 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3580 +               zram_lock_table(&meta->table[index]);
3581                 zram_free_page(zram, index);
3582 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3583 +               zram_unlock_table(&meta->table[index]);
3584                 atomic64_inc(&zram->stats.notify_free);
3585                 index++;
3586                 n -= PAGE_SIZE;
3587 @@ -973,9 +975,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3588         zram = bdev->bd_disk->private_data;
3589         meta = zram->meta;
3590  
3591 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3592 +       zram_lock_table(&meta->table[index]);
3593         zram_free_page(zram, index);
3594 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3595 +       zram_unlock_table(&meta->table[index]);
3596         atomic64_inc(&zram->stats.notify_free);
3597  }
3598  
3599 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3600 index 74fcf10da374..fd4020c99b9e 100644
3601 --- a/drivers/block/zram/zram_drv.h
3602 +++ b/drivers/block/zram/zram_drv.h
3603 @@ -73,6 +73,9 @@ enum zram_pageflags {
3604  struct zram_table_entry {
3605         unsigned long handle;
3606         unsigned long value;
3607 +#ifdef CONFIG_PREEMPT_RT_BASE
3608 +       spinlock_t lock;
3609 +#endif
3610  };
3611  
3612  struct zram_stats {
3613 @@ -120,4 +123,42 @@ struct zram {
3614          */
3615         bool claim; /* Protected by bdev->bd_mutex */
3616  };
3617 +
3618 +#ifndef CONFIG_PREEMPT_RT_BASE
3619 +static inline void zram_lock_table(struct zram_table_entry *table)
3620 +{
3621 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3622 +}
3623 +
3624 +static inline void zram_unlock_table(struct zram_table_entry *table)
3625 +{
3626 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3627 +}
3628 +
3629 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3630 +#else /* CONFIG_PREEMPT_RT_BASE */
3631 +static inline void zram_lock_table(struct zram_table_entry *table)
3632 +{
3633 +       spin_lock(&table->lock);
3634 +       __set_bit(ZRAM_ACCESS, &table->value);
3635 +}
3636 +
3637 +static inline void zram_unlock_table(struct zram_table_entry *table)
3638 +{
3639 +       __clear_bit(ZRAM_ACCESS, &table->value);
3640 +       spin_unlock(&table->lock);
3641 +}
3642 +
3643 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3644 +{
3645 +        size_t num_pages = disksize >> PAGE_SHIFT;
3646 +        size_t index;
3647 +
3648 +        for (index = 0; index < num_pages; index++) {
3649 +               spinlock_t *lock = &meta->table[index].lock;
3650 +               spin_lock_init(lock);
3651 +        }
3652 +}
3653 +#endif /* CONFIG_PREEMPT_RT_BASE */
3654 +
3655  #endif
3656 diff --git a/drivers/char/random.c b/drivers/char/random.c
3657 index d6876d506220..0c60b1e54579 100644
3658 --- a/drivers/char/random.c
3659 +++ b/drivers/char/random.c
3660 @@ -1028,8 +1028,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3661         } sample;
3662         long delta, delta2, delta3;
3663  
3664 -       preempt_disable();
3665 -
3666         sample.jiffies = jiffies;
3667         sample.cycles = random_get_entropy();
3668         sample.num = num;
3669 @@ -1070,7 +1068,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3670                  */
3671                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3672         }
3673 -       preempt_enable();
3674  }
3675  
3676  void add_input_randomness(unsigned int type, unsigned int code,
3677 @@ -1123,28 +1120,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3678         return *(ptr + f->reg_idx++);
3679  }
3680  
3681 -void add_interrupt_randomness(int irq, int irq_flags)
3682 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3683  {
3684         struct entropy_store    *r;
3685         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3686 -       struct pt_regs          *regs = get_irq_regs();
3687         unsigned long           now = jiffies;
3688         cycles_t                cycles = random_get_entropy();
3689         __u32                   c_high, j_high;
3690 -       __u64                   ip;
3691         unsigned long           seed;
3692         int                     credit = 0;
3693  
3694         if (cycles == 0)
3695 -               cycles = get_reg(fast_pool, regs);
3696 +               cycles = get_reg(fast_pool, NULL);
3697         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3698         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3699         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3700         fast_pool->pool[1] ^= now ^ c_high;
3701 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3702 +       if (!ip)
3703 +               ip = _RET_IP_;
3704         fast_pool->pool[2] ^= ip;
3705         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3706 -               get_reg(fast_pool, regs);
3707 +               get_reg(fast_pool, NULL);
3708  
3709         fast_mix(fast_pool);
3710         add_interrupt_bench(cycles);
3711 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3712 index 4da2af9694a2..5b6f57f500b8 100644
3713 --- a/drivers/clocksource/tcb_clksrc.c
3714 +++ b/drivers/clocksource/tcb_clksrc.c
3715 @@ -23,8 +23,7 @@
3716   *     this 32 bit free-running counter. the second channel is not used.
3717   *
3718   *   - The third channel may be used to provide a 16-bit clockevent
3719 - *     source, used in either periodic or oneshot mode.  This runs
3720 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3721 + *     source, used in either periodic or oneshot mode.
3722   *
3723   * A boot clocksource and clockevent source are also currently needed,
3724   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3725 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3726  struct tc_clkevt_device {
3727         struct clock_event_device       clkevt;
3728         struct clk                      *clk;
3729 +       bool                            clk_enabled;
3730 +       u32                             freq;
3731         void __iomem                    *regs;
3732  };
3733  
3734 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3735         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3736  }
3737  
3738 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3739 - * because using one of the divided clocks would usually mean the
3740 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3741 - *
3742 - * A divided clock could be good for high resolution timers, since
3743 - * 30.5 usec resolution can seem "low".
3744 - */
3745  static u32 timer_clock;
3746  
3747 +static void tc_clk_disable(struct clock_event_device *d)
3748 +{
3749 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3750 +
3751 +       clk_disable(tcd->clk);
3752 +       tcd->clk_enabled = false;
3753 +}
3754 +
3755 +static void tc_clk_enable(struct clock_event_device *d)
3756 +{
3757 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3758 +
3759 +       if (tcd->clk_enabled)
3760 +               return;
3761 +       clk_enable(tcd->clk);
3762 +       tcd->clk_enabled = true;
3763 +}
3764 +
3765  static int tc_shutdown(struct clock_event_device *d)
3766  {
3767         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3768 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3769  
3770         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3771         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3772 +       return 0;
3773 +}
3774 +
3775 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3776 +{
3777 +       tc_shutdown(d);
3778         if (!clockevent_state_detached(d))
3779 -               clk_disable(tcd->clk);
3780 +               tc_clk_disable(d);
3781  
3782         return 0;
3783  }
3784 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3785         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3786                 tc_shutdown(d);
3787  
3788 -       clk_enable(tcd->clk);
3789 +       tc_clk_enable(d);
3790  
3791 -       /* slow clock, count up to RC, then irq and stop */
3792 +       /* count up to RC, then irq and stop */
3793         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3794                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3795         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3796 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3797         /* By not making the gentime core emulate periodic mode on top
3798          * of oneshot, we get lower overhead and improved accuracy.
3799          */
3800 -       clk_enable(tcd->clk);
3801 +       tc_clk_enable(d);
3802  
3803 -       /* slow clock, count up to RC, then irq and restart */
3804 +       /* count up to RC, then irq and restart */
3805         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3806                      regs + ATMEL_TC_REG(2, CMR));
3807 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3808 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3809  
3810         /* Enable clock and interrupts on RC compare */
3811         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3812 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3813                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3814                                           CLOCK_EVT_FEAT_ONESHOT,
3815                 /* Should be lower than at91rm9200's system timer */
3816 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3817                 .rating                 = 125,
3818 +#else
3819 +               .rating                 = 200,
3820 +#endif
3821                 .set_next_event         = tc_next_event,
3822 -               .set_state_shutdown     = tc_shutdown,
3823 +               .set_state_shutdown     = tc_shutdown_clk_off,
3824                 .set_state_periodic     = tc_set_periodic,
3825                 .set_state_oneshot      = tc_set_oneshot,
3826         },
3827 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3828         return IRQ_NONE;
3829  }
3830  
3831 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3832 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3833  {
3834 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3835         int ret;
3836         struct clk *t2_clk = tc->clk[2];
3837         int irq = tc->irq[2];
3838 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3839         clkevt.regs = tc->regs;
3840         clkevt.clk = t2_clk;
3841  
3842 -       timer_clock = clk32k_divisor_idx;
3843 +       timer_clock = divisor_idx;
3844 +       if (!divisor)
3845 +               clkevt.freq = 32768;
3846 +       else
3847 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3848  
3849         clkevt.clkevt.cpumask = cpumask_of(0);
3850  
3851 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3852                 return ret;
3853         }
3854  
3855 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3856 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3857  
3858         return ret;
3859  }
3860 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3861                 goto err_disable_t1;
3862  
3863         /* channel 2:  periodic and oneshot timer support */
3864 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3865         ret = setup_clkevents(tc, clk32k_divisor_idx);
3866 +#else
3867 +       ret = setup_clkevents(tc, best_divisor_idx);
3868 +#endif
3869         if (ret)
3870                 goto err_unregister_clksrc;
3871  
3872 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
3873 index 6555821bbdae..93288849b2bd 100644
3874 --- a/drivers/clocksource/timer-atmel-pit.c
3875 +++ b/drivers/clocksource/timer-atmel-pit.c
3876 @@ -46,6 +46,7 @@ struct pit_data {
3877         u32             cycle;
3878         u32             cnt;
3879         unsigned int    irq;
3880 +       bool            irq_requested;
3881         struct clk      *mck;
3882  };
3883  
3884 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
3885  
3886         /* disable irq, leaving the clocksource active */
3887         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3888 +       if (data->irq_requested) {
3889 +               free_irq(data->irq, data);
3890 +               data->irq_requested = false;
3891 +       }
3892         return 0;
3893  }
3894  
3895 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3896  /*
3897   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
3898   */
3899  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3900  {
3901         struct pit_data *data = clkevt_to_pit_data(dev);
3902 +       int ret;
3903 +
3904 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3905 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3906 +                         "at91_tick", data);
3907 +       if (ret)
3908 +               panic(pr_fmt("Unable to setup IRQ\n"));
3909 +
3910 +       data->irq_requested = true;
3911  
3912         /* update clocksource counter */
3913         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3914 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
3915                 return ret;
3916         }
3917  
3918 -       /* Set up irq handler */
3919 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3920 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3921 -                         "at91_tick", data);
3922 -       if (ret) {
3923 -               pr_err("Unable to setup IRQ\n");
3924 -               return ret;
3925 -       }
3926 -
3927         /* Set up and register clockevents */
3928         data->clkevt.name = "pit";
3929         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3930 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
3931 index e90ab5b63a90..9e124087c55f 100644
3932 --- a/drivers/clocksource/timer-atmel-st.c
3933 +++ b/drivers/clocksource/timer-atmel-st.c
3934 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
3935         last_crtr = read_CRTR();
3936  }
3937  
3938 +static int atmel_st_irq;
3939 +
3940  static int clkevt32k_shutdown(struct clock_event_device *evt)
3941  {
3942         clkdev32k_disable_and_flush_irq();
3943         irqmask = 0;
3944         regmap_write(regmap_st, AT91_ST_IER, irqmask);
3945 +       free_irq(atmel_st_irq, regmap_st);
3946         return 0;
3947  }
3948  
3949  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3950  {
3951 +       int ret;
3952 +
3953         clkdev32k_disable_and_flush_irq();
3954  
3955 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3956 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3957 +                         "at91_tick", regmap_st);
3958 +       if (ret)
3959 +               panic(pr_fmt("Unable to setup IRQ\n"));
3960 +
3961         /*
3962          * ALM for oneshot irqs, set by next_event()
3963          * before 32 seconds have passed.
3964 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3965  
3966  static int clkevt32k_set_periodic(struct clock_event_device *dev)
3967  {
3968 +       int ret;
3969 +
3970         clkdev32k_disable_and_flush_irq();
3971  
3972 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3973 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3974 +                         "at91_tick", regmap_st);
3975 +       if (ret)
3976 +               panic(pr_fmt("Unable to setup IRQ\n"));
3977 +
3978         /* PIT for periodic irqs; fixed rate of 1/HZ */
3979         irqmask = AT91_ST_PITS;
3980         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3981 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
3982  {
3983         struct clk *sclk;
3984         unsigned int sclk_rate, val;
3985 -       int irq, ret;
3986 +       int ret;
3987  
3988         regmap_st = syscon_node_to_regmap(node);
3989         if (IS_ERR(regmap_st)) {
3990 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
3991         regmap_read(regmap_st, AT91_ST_SR, &val);
3992  
3993         /* Get the interrupts property */
3994 -       irq  = irq_of_parse_and_map(node, 0);
3995 -       if (!irq) {
3996 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
3997 +       if (!atmel_st_irq) {
3998                 pr_err("Unable to get IRQ from DT\n");
3999                 return -EINVAL;
4000         }
4001  
4002 -       /* Make IRQs happen for the system timer */
4003 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4004 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4005 -                         "at91_tick", regmap_st);
4006 -       if (ret) {
4007 -               pr_err("Unable to setup IRQ\n");
4008 -               return ret;
4009 -       }
4010 -
4011         sclk = of_clk_get(node, 0);
4012         if (IS_ERR(sclk)) {
4013                 pr_err("Unable to get slow clock\n");
4014 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4015 index a782ce87715c..19d265948526 100644
4016 --- a/drivers/connector/cn_proc.c
4017 +++ b/drivers/connector/cn_proc.c
4018 @@ -32,6 +32,7 @@
4019  #include <linux/pid_namespace.h>
4020  
4021  #include <linux/cn_proc.h>
4022 +#include <linux/locallock.h>
4023  
4024  /*
4025   * Size of a cn_msg followed by a proc_event structure.  Since the
4026 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4027  
4028  /* proc_event_counts is used as the sequence number of the netlink message */
4029  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4030 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4031  
4032  static inline void send_msg(struct cn_msg *msg)
4033  {
4034 -       preempt_disable();
4035 +       local_lock(send_msg_lock);
4036  
4037         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4038         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4039 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4040          */
4041         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4042  
4043 -       preempt_enable();
4044 +       local_unlock(send_msg_lock);
4045  }
4046  
4047  void proc_fork_connector(struct task_struct *task)
4048 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4049 index adbd1de1cea5..1fac5074f2cf 100644
4050 --- a/drivers/cpufreq/Kconfig.x86
4051 +++ b/drivers/cpufreq/Kconfig.x86
4052 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4053  
4054  config X86_POWERNOW_K8
4055         tristate "AMD Opteron/Athlon64 PowerNow!"
4056 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4057 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4058         help
4059           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4060           Support for K10 and newer processors is now in acpi-cpufreq.
4061 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4062 index 0c400f852a76..97d5f6193751 100644
4063 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4064 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4065 @@ -1537,7 +1537,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4066         if (ret)
4067                 return ret;
4068  
4069 +#ifndef CONFIG_PREEMPT_RT_BASE
4070         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4071 +#endif
4072  
4073         i915_gem_execbuffer_move_to_active(vmas, params->request);
4074  
4075 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4076 index 1c237d02f30b..9e9b4404c0d7 100644
4077 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4078 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4079 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4080         if (!mutex_is_locked(mutex))
4081                 return false;
4082  
4083 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4084 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4085         return mutex->owner == task;
4086  #else
4087         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4088 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4089 index 3fc286cd1157..252a1117b103 100644
4090 --- a/drivers/gpu/drm/i915/i915_irq.c
4091 +++ b/drivers/gpu/drm/i915/i915_irq.c
4092 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4093         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4094  
4095         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4096 +       preempt_disable_rt();
4097  
4098         /* Get optional system timestamp before query. */
4099         if (stime)
4100 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4101                 *etime = ktime_get();
4102  
4103         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4104 +       preempt_enable_rt();
4105  
4106         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4107  
4108 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4109 index b9be8a6141d8..3162feddabe8 100644
4110 --- a/drivers/gpu/drm/i915/intel_display.c
4111 +++ b/drivers/gpu/drm/i915/intel_display.c
4112 @@ -12141,7 +12141,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4113         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4114         struct intel_flip_work *work;
4115  
4116 -       WARN_ON(!in_interrupt());
4117 +       WARN_ON_NONRT(!in_interrupt());
4118  
4119         if (crtc == NULL)
4120                 return;
4121 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4122 index dbed12c484c9..5c540b78e8b5 100644
4123 --- a/drivers/gpu/drm/i915/intel_sprite.c
4124 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4125 @@ -35,6 +35,7 @@
4126  #include <drm/drm_rect.h>
4127  #include <drm/drm_atomic.h>
4128  #include <drm/drm_plane_helper.h>
4129 +#include <linux/locallock.h>
4130  #include "intel_drv.h"
4131  #include "intel_frontbuffer.h"
4132  #include <drm/i915_drm.h>
4133 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4134                             1000 * adjusted_mode->crtc_htotal);
4135  }
4136  
4137 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4138 +
4139  /**
4140   * intel_pipe_update_start() - start update of a set of display registers
4141   * @crtc: the crtc of which the registers are going to be updated
4142 @@ -95,7 +98,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4143         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4144         max = vblank_start - 1;
4145  
4146 -       local_irq_disable();
4147 +       local_lock_irq(pipe_update_lock);
4148  
4149         if (min <= 0 || max <= 0)
4150                 return;
4151 @@ -125,11 +128,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4152                         break;
4153                 }
4154  
4155 -               local_irq_enable();
4156 +               local_unlock_irq(pipe_update_lock);
4157  
4158                 timeout = schedule_timeout(timeout);
4159  
4160 -               local_irq_disable();
4161 +               local_lock_irq(pipe_update_lock);
4162         }
4163  
4164         finish_wait(wq, &wait);
4165 @@ -181,7 +184,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4166                 crtc->base.state->event = NULL;
4167         }
4168  
4169 -       local_irq_enable();
4170 +       local_unlock_irq(pipe_update_lock);
4171  
4172         if (crtc->debug.start_vbl_count &&
4173             crtc->debug.start_vbl_count != end_vbl_count) {
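Same idea as the connector change, but with the _irq variant: the vblank-evasion window keeps its irqs-off behaviour on !RT, while RT substitutes a per-CPU sleeping lock. A rough usage sketch with made-up names, assuming the locallock semantics introduced elsewhere in this patch:

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(example_pipe_lock);

static void example_update_registers(void)
{
	/* !RT: local_lock_irq() acts like local_irq_disable(), so the
	 * evasion window is unchanged.  RT: it takes a per-CPU sleeping
	 * lock and leaves hard interrupts enabled, which is why the wait
	 * loop above may still call schedule_timeout() between the
	 * unlock and the re-lock. */
	local_lock_irq(example_pipe_lock);
	/* ... program the display registers for this pipe ... */
	local_unlock_irq(example_pipe_lock);
}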
4174 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4175 index 192b2d3a79cb..d5372a207326 100644
4176 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4177 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4178 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4179         if (!mutex_is_locked(mutex))
4180                 return false;
4181  
4182 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4183 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4184         return mutex->owner == task;
4185  #else
4186         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4187 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4188 index cdb8cb568c15..b6d7fd964cbc 100644
4189 --- a/drivers/gpu/drm/radeon/radeon_display.c
4190 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4191 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4192         struct radeon_device *rdev = dev->dev_private;
4193  
4194         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4195 +       preempt_disable_rt();
4196  
4197         /* Get optional system timestamp before query. */
4198         if (stime)
4199 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4200                 *etime = ktime_get();
4201  
4202         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4203 +       preempt_enable_rt();
4204  
4205         /* Decode into vertical and horizontal scanout position. */
4206         *vpos = position & 0x1fff;
4207 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4208 index 0276d2ef06ee..8868045eabde 100644
4209 --- a/drivers/hv/vmbus_drv.c
4210 +++ b/drivers/hv/vmbus_drv.c
4211 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4212         void *page_addr;
4213         struct hv_message *msg;
4214         union hv_synic_event_flags *event;
4215 +       struct pt_regs *regs = get_irq_regs();
4216 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4217         bool handled = false;
4218  
4219         page_addr = hv_context.synic_event_page[cpu];
4220 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4221                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4222         }
4223  
4224 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4225 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4226  }
4227  
4228  
4229 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4230 index 36f76e28a0bf..394f142f90c7 100644
4231 --- a/drivers/ide/alim15x3.c
4232 +++ b/drivers/ide/alim15x3.c
4233 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4234  
4235         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4236  
4237 -       local_irq_save(flags);
4238 +       local_irq_save_nort(flags);
4239  
4240         if (m5229_revision < 0xC2) {
4241                 /*
4242 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4243         }
4244         pci_dev_put(north);
4245         pci_dev_put(isa_dev);
4246 -       local_irq_restore(flags);
4247 +       local_irq_restore_nort(flags);
4248         return 0;
4249  }
4250  
4251 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4252 index 0ceae5cbd89a..c212e85d7f3e 100644
4253 --- a/drivers/ide/hpt366.c
4254 +++ b/drivers/ide/hpt366.c
4255 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4256  
4257         dma_old = inb(base + 2);
4258  
4259 -       local_irq_save(flags);
4260 +       local_irq_save_nort(flags);
4261  
4262         dma_new = dma_old;
4263         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4264 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4265         if (dma_new != dma_old)
4266                 outb(dma_new, base + 2);
4267  
4268 -       local_irq_restore(flags);
4269 +       local_irq_restore_nort(flags);
4270  
4271         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4272                          hwif->name, base, base + 7);
4273 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4274 index 19763977568c..4169433faab5 100644
4275 --- a/drivers/ide/ide-io-std.c
4276 +++ b/drivers/ide/ide-io-std.c
4277 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4278                 unsigned long uninitialized_var(flags);
4279  
4280                 if ((io_32bit & 2) && !mmio) {
4281 -                       local_irq_save(flags);
4282 +                       local_irq_save_nort(flags);
4283                         ata_vlb_sync(io_ports->nsect_addr);
4284                 }
4285  
4286 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4287                         insl(data_addr, buf, words);
4288  
4289                 if ((io_32bit & 2) && !mmio)
4290 -                       local_irq_restore(flags);
4291 +                       local_irq_restore_nort(flags);
4292  
4293                 if (((len + 1) & 3) < 2)
4294                         return;
4295 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4296                 unsigned long uninitialized_var(flags);
4297  
4298                 if ((io_32bit & 2) && !mmio) {
4299 -                       local_irq_save(flags);
4300 +                       local_irq_save_nort(flags);
4301                         ata_vlb_sync(io_ports->nsect_addr);
4302                 }
4303  
4304 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4305                         outsl(data_addr, buf, words);
4306  
4307                 if ((io_32bit & 2) && !mmio)
4308 -                       local_irq_restore(flags);
4309 +                       local_irq_restore_nort(flags);
4310  
4311                 if (((len + 1) & 3) < 2)
4312                         return;
4313 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4314 index 669ea1e45795..e12e43e62245 100644
4315 --- a/drivers/ide/ide-io.c
4316 +++ b/drivers/ide/ide-io.c
4317 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4318                 /* disable_irq_nosync ?? */
4319                 disable_irq(hwif->irq);
4320                 /* local CPU only, as if we were handling an interrupt */
4321 -               local_irq_disable();
4322 +               local_irq_disable_nort();
4323                 if (hwif->polling) {
4324                         startstop = handler(drive);
4325                 } else if (drive_is_ready(drive)) {
4326 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4327 index 376f2dc410c5..f014dd1b73dc 100644
4328 --- a/drivers/ide/ide-iops.c
4329 +++ b/drivers/ide/ide-iops.c
4330 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4331                                 if ((stat & ATA_BUSY) == 0)
4332                                         break;
4333  
4334 -                               local_irq_restore(flags);
4335 +                               local_irq_restore_nort(flags);
4336                                 *rstat = stat;
4337                                 return -EBUSY;
4338                         }
4339                 }
4340 -               local_irq_restore(flags);
4341 +               local_irq_restore_nort(flags);
4342         }
4343         /*
4344          * Allow status to settle, then read it again.
4345 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4346 index 0b63facd1d87..4ceba37afc0c 100644
4347 --- a/drivers/ide/ide-probe.c
4348 +++ b/drivers/ide/ide-probe.c
4349 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4350         int bswap = 1;
4351  
4352         /* local CPU only; some systems need this */
4353 -       local_irq_save(flags);
4354 +       local_irq_save_nort(flags);
4355         /* read 512 bytes of id info */
4356         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4357 -       local_irq_restore(flags);
4358 +       local_irq_restore_nort(flags);
4359  
4360         drive->dev_flags |= IDE_DFLAG_ID_READ;
4361  #ifdef DEBUG
4362 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4363 index a716693417a3..be0568c722d6 100644
4364 --- a/drivers/ide/ide-taskfile.c
4365 +++ b/drivers/ide/ide-taskfile.c
4366 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4367  
4368                 page_is_high = PageHighMem(page);
4369                 if (page_is_high)
4370 -                       local_irq_save(flags);
4371 +                       local_irq_save_nort(flags);
4372  
4373                 buf = kmap_atomic(page) + offset;
4374  
4375 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4376                 kunmap_atomic(buf);
4377  
4378                 if (page_is_high)
4379 -                       local_irq_restore(flags);
4380 +                       local_irq_restore_nort(flags);
4381  
4382                 len -= nr_bytes;
4383         }
4384 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4385         }
4386  
4387         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4388 -               local_irq_disable();
4389 +               local_irq_disable_nort();
4390  
4391         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4392  
4393 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4394 index fddff403d5d2..cca1bb4fbfe3 100644
4395 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4396 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4397 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4398  
4399         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4400  
4401 -       local_irq_save(flags);
4402 +       local_irq_save_nort(flags);
4403         netif_addr_lock(dev);
4404         spin_lock(&priv->lock);
4405  
4406 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4407  
4408         spin_unlock(&priv->lock);
4409         netif_addr_unlock(dev);
4410 -       local_irq_restore(flags);
4411 +       local_irq_restore_nort(flags);
4412  
4413         /*
4414          * make sure the in-flight joins have finished before we attempt
4415 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4416 index 4a2a9e370be7..e970d9afd179 100644
4417 --- a/drivers/input/gameport/gameport.c
4418 +++ b/drivers/input/gameport/gameport.c
4419 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4420         tx = ~0;
4421  
4422         for (i = 0; i < 50; i++) {
4423 -               local_irq_save(flags);
4424 +               local_irq_save_nort(flags);
4425                 t1 = ktime_get_ns();
4426                 for (t = 0; t < 50; t++)
4427                         gameport_read(gameport);
4428                 t2 = ktime_get_ns();
4429                 t3 = ktime_get_ns();
4430 -               local_irq_restore(flags);
4431 +               local_irq_restore_nort(flags);
4432                 udelay(i * 10);
4433                 t = (t2 - t1) - (t3 - t2);
4434                 if (t < tx)
4435 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4436         tx = 1 << 30;
4437  
4438         for(i = 0; i < 50; i++) {
4439 -               local_irq_save(flags);
4440 +               local_irq_save_nort(flags);
4441                 GET_TIME(t1);
4442                 for (t = 0; t < 50; t++) gameport_read(gameport);
4443                 GET_TIME(t2);
4444                 GET_TIME(t3);
4445 -               local_irq_restore(flags);
4446 +               local_irq_restore_nort(flags);
4447                 udelay(i * 10);
4448                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4449         }
4450 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4451         tx = 1 << 30;
4452  
4453         for(i = 0; i < 50; i++) {
4454 -               local_irq_save(flags);
4455 +               local_irq_save_nort(flags);
4456                 t1 = rdtsc();
4457                 for (t = 0; t < 50; t++) gameport_read(gameport);
4458                 t2 = rdtsc();
4459 -               local_irq_restore(flags);
4460 +               local_irq_restore_nort(flags);
4461                 udelay(i * 10);
4462                 if (t2 - t1 < tx) tx = t2 - t1;
4463         }
4464 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4465 index 11a13b5be73a..baaed0ac274b 100644
4466 --- a/drivers/iommu/amd_iommu.c
4467 +++ b/drivers/iommu/amd_iommu.c
4468 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4469         int ret;
4470  
4471         /*
4472 -        * Must be called with IRQs disabled. Warn here to detect early
4473 -        * when its not.
4474 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4475 +        * detect early when it's not.
4476          */
4477 -       WARN_ON(!irqs_disabled());
4478 +       WARN_ON_NONRT(!irqs_disabled());
4479  
4480         /* lock domain */
4481         spin_lock(&domain->lock);
4482 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4483         struct protection_domain *domain;
4484  
4485         /*
4486 -        * Must be called with IRQs disabled. Warn here to detect early
4487 -        * when its not.
4488 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4489 +        * detect early when it's not.
4490          */
4491 -       WARN_ON(!irqs_disabled());
4492 +       WARN_ON_NONRT(!irqs_disabled());
4493  
4494         if (WARN_ON(!dev_data->domain))
4495                 return;
4496 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4497 index b9e50c10213b..fd3b4657723f 100644
4498 --- a/drivers/iommu/intel-iommu.c
4499 +++ b/drivers/iommu/intel-iommu.c
4500 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4501         struct deferred_flush_table *tables;
4502  };
4503  
4504 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4505 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4506  
4507  /* bitmap for indexing intel_iommus */
4508  static int g_num_of_iommus;
4509 @@ -3716,10 +3716,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4510         struct intel_iommu *iommu;
4511         struct deferred_flush_entry *entry;
4512         struct deferred_flush_data *flush_data;
4513 -       unsigned int cpuid;
4514  
4515 -       cpuid = get_cpu();
4516 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4517 +       flush_data = raw_cpu_ptr(&deferred_flush);
4518  
4519         /* Flush all CPUs' entries to avoid deferring too much.  If
4520          * this becomes a bottleneck, can just flush us, and rely on
4521 @@ -3752,8 +3750,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4522         }
4523         flush_data->size++;
4524         spin_unlock_irqrestore(&flush_data->lock, flags);
4525 -
4526 -       put_cpu();
4527  }
4528  
4529  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4530 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4531 index e23001bfcfee..359d5d169ec0 100644
4532 --- a/drivers/iommu/iova.c
4533 +++ b/drivers/iommu/iova.c
4534 @@ -22,6 +22,7 @@
4535  #include <linux/slab.h>
4536  #include <linux/smp.h>
4537  #include <linux/bitops.h>
4538 +#include <linux/cpu.h>
4539  
4540  static bool iova_rcache_insert(struct iova_domain *iovad,
4541                                unsigned long pfn,
4542 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4543  
4544                 /* Try replenishing IOVAs by flushing rcache. */
4545                 flushed_rcache = true;
4546 -               preempt_disable();
4547                 for_each_online_cpu(cpu)
4548                         free_cpu_cached_iovas(cpu, iovad);
4549 -               preempt_enable();
4550                 goto retry;
4551         }
4552  
4553 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4554         bool can_insert = false;
4555         unsigned long flags;
4556  
4557 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4558 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4559         spin_lock_irqsave(&cpu_rcache->lock, flags);
4560  
4561         if (!iova_magazine_full(cpu_rcache->loaded)) {
4562 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4563                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4564  
4565         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4566 -       put_cpu_ptr(rcache->cpu_rcaches);
4567  
4568         if (mag_to_free) {
4569                 iova_magazine_free_pfns(mag_to_free, iovad);
4570 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4571         bool has_pfn = false;
4572         unsigned long flags;
4573  
4574 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4575 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4576         spin_lock_irqsave(&cpu_rcache->lock, flags);
4577  
4578         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4579 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4580                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4581  
4582         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4583 -       put_cpu_ptr(rcache->cpu_rcaches);
4584  
4585         return iova_pfn;
4586  }
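A condensed restatement of __iova_rcache_insert() as patched above, to show why the get_cpu_ptr()/put_cpu_ptr() pair could be dropped; the struct and magazine helpers are taken from the surrounding driver, the function name is illustrative:

static bool example_rcache_insert(struct iova_rcache *rcache,
				  unsigned long iova_pfn)
{
	struct iova_cpu_rcache *cpu_rcache;
	unsigned long flags;
	bool inserted = false;

	/* The per-CPU cache has its own lock, taken irqsave, so pinning
	 * the task with get_cpu_ptr() is unnecessary: migrating after
	 * raw_cpu_ptr() only means we occasionally work on another
	 * CPU's cache, which the lock makes safe.  On RT this also
	 * avoids taking a sleeping spinlock with preemption disabled. */
	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
	spin_lock_irqsave(&cpu_rcache->lock, flags);
	if (!iova_magazine_full(cpu_rcache->loaded)) {
		iova_magazine_push(cpu_rcache->loaded, iova_pfn);
		inserted = true;
	}
	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
	return inserted;
}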
4587 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4588 index 3f9ddb9fafa7..09da5b6b44a1 100644
4589 --- a/drivers/leds/trigger/Kconfig
4590 +++ b/drivers/leds/trigger/Kconfig
4591 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4592  
4593  config LEDS_TRIGGER_CPU
4594         bool "LED CPU Trigger"
4595 -       depends on LEDS_TRIGGERS
4596 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4597         help
4598           This allows LEDs to be controlled by active CPUs. This shows
4599           the active CPUs across an array of LEDs so you can see which
4600 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4601 index 4d200883c505..98b64ed5cb81 100644
4602 --- a/drivers/md/bcache/Kconfig
4603 +++ b/drivers/md/bcache/Kconfig
4604 @@ -1,6 +1,7 @@
4605  
4606  config BCACHE
4607         tristate "Block device as cache"
4608 +       depends on !PREEMPT_RT_FULL
4609         ---help---
4610         Allows a block device to be used as cache for other devices; uses
4611         a btree for indexing and the layout is optimized for SSDs.
4612 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4613 index 2c965424d383..2c8877f50626 100644
4614 --- a/drivers/md/dm-rq.c
4615 +++ b/drivers/md/dm-rq.c
4616 @@ -842,7 +842,7 @@ static void dm_old_request_fn(struct request_queue *q)
4617                 /* Establish tio->ti before queuing work (map_tio_request) */
4618                 tio->ti = ti;
4619                 kthread_queue_work(&md->kworker, &tio->work);
4620 -               BUG_ON(!irqs_disabled());
4621 +               BUG_ON_NONRT(!irqs_disabled());
4622         }
4623  }
4624  
4625 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4626 index cce6057b9aca..fa2c4de32a64 100644
4627 --- a/drivers/md/raid5.c
4628 +++ b/drivers/md/raid5.c
4629 @@ -1928,8 +1928,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4630         struct raid5_percpu *percpu;
4631         unsigned long cpu;
4632  
4633 -       cpu = get_cpu();
4634 +       cpu = get_cpu_light();
4635         percpu = per_cpu_ptr(conf->percpu, cpu);
4636 +       spin_lock(&percpu->lock);
4637         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4638                 ops_run_biofill(sh);
4639                 overlap_clear++;
4640 @@ -1985,7 +1986,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4641                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4642                                 wake_up(&sh->raid_conf->wait_for_overlap);
4643                 }
4644 -       put_cpu();
4645 +       spin_unlock(&percpu->lock);
4646 +       put_cpu_light();
4647  }
4648  
4649  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4650 @@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4651                        __func__, cpu);
4652                 return -ENOMEM;
4653         }
4654 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4655         return 0;
4656  }
4657  
4658 @@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4659         conf->percpu = alloc_percpu(struct raid5_percpu);
4660         if (!conf->percpu)
4661                 return -ENOMEM;
4662 -
4663         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4664         if (!err) {
4665                 conf->scribble_disks = max(conf->raid_disks,
4666 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4667 index 57ec49f0839e..0739604990b7 100644
4668 --- a/drivers/md/raid5.h
4669 +++ b/drivers/md/raid5.h
4670 @@ -504,6 +504,7 @@ struct r5conf {
4671         int                     recovery_disabled;
4672         /* per cpu variables */
4673         struct raid5_percpu {
4674 +               spinlock_t      lock;           /* Protection for -RT */
4675                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4676                 struct flex_array *scribble;   /* space for constructing buffer
4677                                               * lists and performing address
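Sketch of the per-CPU pattern the raid5 hunks switch to; get_cpu_light()/put_cpu_light() are RT helpers added elsewhere in this patch, and the function name is illustrative:

static void example_run_ops(struct r5conf *conf)
{
	struct raid5_percpu *percpu;
	unsigned long cpu;

	/* get_cpu_light() pins the task to this CPU but, unlike
	 * get_cpu(), does not disable preemption on RT.  The new
	 * raid5_percpu.lock therefore supplies the serialization that
	 * the implicit preempt_disable() used to give, while the long
	 * xor/recovery work stays preemptible. */
	cpu = get_cpu_light();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	spin_lock(&percpu->lock);
	/* ... use percpu->spare_page and percpu->scribble ... */
	spin_unlock(&percpu->lock);
	put_cpu_light();
}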
4678 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4679 index 64971baf11fa..215e91e36198 100644
4680 --- a/drivers/misc/Kconfig
4681 +++ b/drivers/misc/Kconfig
4682 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4683  config ATMEL_TCLIB
4684         bool "Atmel AT32/AT91 Timer/Counter Library"
4685         depends on (AVR32 || ARCH_AT91)
4686 +       default y if PREEMPT_RT_FULL
4687         help
4688           Select this if you want a library to allocate the Timer/Counter
4689           blocks found on many Atmel processors.  This facilitates using
4690 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4691           are combined to make a single 32-bit timer.
4692  
4693           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4694 -         may be used as a clock event device supporting oneshot mode
4695 -         (delays of up to two seconds) based on the 32 KiHz clock.
4696 +         may be used as a clock event device supporting oneshot mode.
4697  
4698  config ATMEL_TCB_CLKSRC_BLOCK
4699         int
4700 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4701           TC can be used for other purposes, such as PWM generation and
4702           interval timing.
4703  
4704 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4705 +       bool "TC Block use 32 KiHz clock"
4706 +       depends on ATMEL_TCB_CLKSRC
4707 +       default y if !PREEMPT_RT_FULL
4708 +       help
4709 +         Select this to use the 32 KiHz base clock rate as the TC
4710 +         block clock source for clock events.
4711 +
4712 +
4713  config DUMMY_IRQ
4714         tristate "Dummy IRQ handler"
4715         default n
4716 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4717 index df990bb8c873..1a162709a85e 100644
4718 --- a/drivers/mmc/host/mmci.c
4719 +++ b/drivers/mmc/host/mmci.c
4720 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4721         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4722         struct variant_data *variant = host->variant;
4723         void __iomem *base = host->base;
4724 -       unsigned long flags;
4725         u32 status;
4726  
4727         status = readl(base + MMCISTATUS);
4728  
4729         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4730  
4731 -       local_irq_save(flags);
4732 -
4733         do {
4734                 unsigned int remain, len;
4735                 char *buffer;
4736 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4737  
4738         sg_miter_stop(sg_miter);
4739  
4740 -       local_irq_restore(flags);
4741 -
4742         /*
4743          * If we have less than the fifo 'half-full' threshold to transfer,
4744          * trigger a PIO interrupt as soon as any data is available.
4745 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4746 index 9133e7926da5..63afb921ed40 100644
4747 --- a/drivers/net/ethernet/3com/3c59x.c
4748 +++ b/drivers/net/ethernet/3com/3c59x.c
4749 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4750  {
4751         struct vortex_private *vp = netdev_priv(dev);
4752         unsigned long flags;
4753 -       local_irq_save(flags);
4754 +       local_irq_save_nort(flags);
4755         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4756 -       local_irq_restore(flags);
4757 +       local_irq_restore_nort(flags);
4758  }
4759  #endif
4760  
4761 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4762                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4763                          */
4764                         unsigned long flags;
4765 -                       local_irq_save(flags);
4766 +                       local_irq_save_nort(flags);
4767                         if (vp->full_bus_master_tx)
4768                                 boomerang_interrupt(dev->irq, dev);
4769                         else
4770                                 vortex_interrupt(dev->irq, dev);
4771 -                       local_irq_restore(flags);
4772 +                       local_irq_restore_nort(flags);
4773                 }
4774         }
4775  
4776 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4777 index da4c2d8a4173..1420dfb56bac 100644
4778 --- a/drivers/net/ethernet/realtek/8139too.c
4779 +++ b/drivers/net/ethernet/realtek/8139too.c
4780 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4781         struct rtl8139_private *tp = netdev_priv(dev);
4782         const int irq = tp->pci_dev->irq;
4783  
4784 -       disable_irq(irq);
4785 +       disable_irq_nosync(irq);
4786         rtl8139_interrupt(irq, dev);
4787         enable_irq(irq);
4788  }
4789 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4790 index bca6935a94db..d7a35ee34d03 100644
4791 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4792 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4793 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4794                         while (!ctx->done.done && msecs--)
4795                                 udelay(1000);
4796                 } else {
4797 -                       wait_event_interruptible(ctx->done.wait,
4798 +                       swait_event_interruptible(ctx->done.wait,
4799                                                  ctx->done.done);
4800                 }
4801                 break;
4802 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
4803 index d11cdbb8fba3..223bbb9acb03 100644
4804 --- a/drivers/pci/access.c
4805 +++ b/drivers/pci/access.c
4806 @@ -672,7 +672,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
4807         WARN_ON(!dev->block_cfg_access);
4808  
4809         dev->block_cfg_access = 0;
4810 -       wake_up_all(&pci_cfg_wait);
4811 +       wake_up_all_locked(&pci_cfg_wait);
4812         raw_spin_unlock_irqrestore(&pci_lock, flags);
4813  }
4814  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4815 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
4816 index bedce3453dd3..faf038978650 100644
4817 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
4818 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
4819 @@ -61,7 +61,7 @@ struct msm_pinctrl {
4820         struct notifier_block restart_nb;
4821         int irq;
4822  
4823 -       spinlock_t lock;
4824 +       raw_spinlock_t lock;
4825  
4826         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4827         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4828 @@ -153,14 +153,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
4829         if (WARN_ON(i == g->nfuncs))
4830                 return -EINVAL;
4831  
4832 -       spin_lock_irqsave(&pctrl->lock, flags);
4833 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4834  
4835         val = readl(pctrl->regs + g->ctl_reg);
4836         val &= ~mask;
4837         val |= i << g->mux_bit;
4838         writel(val, pctrl->regs + g->ctl_reg);
4839  
4840 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4841 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4842  
4843         return 0;
4844  }
4845 @@ -323,14 +323,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4846                         break;
4847                 case PIN_CONFIG_OUTPUT:
4848                         /* set output value */
4849 -                       spin_lock_irqsave(&pctrl->lock, flags);
4850 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
4851                         val = readl(pctrl->regs + g->io_reg);
4852                         if (arg)
4853                                 val |= BIT(g->out_bit);
4854                         else
4855                                 val &= ~BIT(g->out_bit);
4856                         writel(val, pctrl->regs + g->io_reg);
4857 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
4858 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4859  
4860                         /* enable output */
4861                         arg = 1;
4862 @@ -351,12 +351,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4863                         return -EINVAL;
4864                 }
4865  
4866 -               spin_lock_irqsave(&pctrl->lock, flags);
4867 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
4868                 val = readl(pctrl->regs + g->ctl_reg);
4869                 val &= ~(mask << bit);
4870                 val |= arg << bit;
4871                 writel(val, pctrl->regs + g->ctl_reg);
4872 -               spin_unlock_irqrestore(&pctrl->lock, flags);
4873 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4874         }
4875  
4876         return 0;
4877 @@ -384,13 +384,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
4878  
4879         g = &pctrl->soc->groups[offset];
4880  
4881 -       spin_lock_irqsave(&pctrl->lock, flags);
4882 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4883  
4884         val = readl(pctrl->regs + g->ctl_reg);
4885         val &= ~BIT(g->oe_bit);
4886         writel(val, pctrl->regs + g->ctl_reg);
4887  
4888 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4889 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4890  
4891         return 0;
4892  }
4893 @@ -404,7 +404,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4894  
4895         g = &pctrl->soc->groups[offset];
4896  
4897 -       spin_lock_irqsave(&pctrl->lock, flags);
4898 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4899  
4900         val = readl(pctrl->regs + g->io_reg);
4901         if (value)
4902 @@ -417,7 +417,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4903         val |= BIT(g->oe_bit);
4904         writel(val, pctrl->regs + g->ctl_reg);
4905  
4906 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4907 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4908  
4909         return 0;
4910  }
4911 @@ -443,7 +443,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
4912  
4913         g = &pctrl->soc->groups[offset];
4914  
4915 -       spin_lock_irqsave(&pctrl->lock, flags);
4916 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4917  
4918         val = readl(pctrl->regs + g->io_reg);
4919         if (value)
4920 @@ -452,7 +452,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
4921                 val &= ~BIT(g->out_bit);
4922         writel(val, pctrl->regs + g->io_reg);
4923  
4924 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4925 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4926  }
4927  
4928  #ifdef CONFIG_DEBUG_FS
4929 @@ -571,7 +571,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
4930  
4931         g = &pctrl->soc->groups[d->hwirq];
4932  
4933 -       spin_lock_irqsave(&pctrl->lock, flags);
4934 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4935  
4936         val = readl(pctrl->regs + g->intr_cfg_reg);
4937         val &= ~BIT(g->intr_enable_bit);
4938 @@ -579,7 +579,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
4939  
4940         clear_bit(d->hwirq, pctrl->enabled_irqs);
4941  
4942 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4943 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4944  }
4945  
4946  static void msm_gpio_irq_unmask(struct irq_data *d)
4947 @@ -592,7 +592,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
4948  
4949         g = &pctrl->soc->groups[d->hwirq];
4950  
4951 -       spin_lock_irqsave(&pctrl->lock, flags);
4952 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4953  
4954         val = readl(pctrl->regs + g->intr_cfg_reg);
4955         val |= BIT(g->intr_enable_bit);
4956 @@ -600,7 +600,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
4957  
4958         set_bit(d->hwirq, pctrl->enabled_irqs);
4959  
4960 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4961 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4962  }
4963  
4964  static void msm_gpio_irq_ack(struct irq_data *d)
4965 @@ -613,7 +613,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
4966  
4967         g = &pctrl->soc->groups[d->hwirq];
4968  
4969 -       spin_lock_irqsave(&pctrl->lock, flags);
4970 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4971  
4972         val = readl(pctrl->regs + g->intr_status_reg);
4973         if (g->intr_ack_high)
4974 @@ -625,7 +625,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
4975         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4976                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4977  
4978 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4979 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4980  }
4981  
4982  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4983 @@ -638,7 +638,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4984  
4985         g = &pctrl->soc->groups[d->hwirq];
4986  
4987 -       spin_lock_irqsave(&pctrl->lock, flags);
4988 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4989  
4990         /*
4991          * For hw without possibility of detecting both edges
4992 @@ -712,7 +712,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4993         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4994                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4995  
4996 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4997 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4998  
4999         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
5000                 irq_set_handler_locked(d, handle_level_irq);
5001 @@ -728,11 +728,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
5002         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
5003         unsigned long flags;
5004  
5005 -       spin_lock_irqsave(&pctrl->lock, flags);
5006 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5007  
5008         irq_set_irq_wake(pctrl->irq, on);
5009  
5010 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5011 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5012  
5013         return 0;
5014  }
5015 @@ -878,7 +878,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
5016         pctrl->soc = soc_data;
5017         pctrl->chip = msm_gpio_template;
5018  
5019 -       spin_lock_init(&pctrl->lock);
5020 +       raw_spin_lock_init(&pctrl->lock);
5021  
5022         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
5023         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
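Sketch of the raw-spinlock pattern the msm pinctrl conversion above follows, with made-up register names; the point is that irq-chip callbacks run in hard interrupt context even on RT, where an ordinary spinlock_t would be a sleeping lock:

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_gpio_lock);

static void example_irq_mask(void __iomem *cfg_reg, unsigned int enable_bit)
{
	unsigned long flags;
	u32 val;

	/* mask/unmask/ack/set_type are called with hard interrupts off
	 * on every kernel flavour, so the lock guarding the interrupt
	 * configuration registers must stay a raw_spinlock_t. */
	raw_spin_lock_irqsave(&example_gpio_lock, flags);
	val = readl(cfg_reg);
	val &= ~BIT(enable_bit);
	writel(val, cfg_reg);
	raw_spin_unlock_irqrestore(&example_gpio_lock, flags);
}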
5024 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
5025 index 9bd41a35a78a..8e2d436c2e3f 100644
5026 --- a/drivers/scsi/fcoe/fcoe.c
5027 +++ b/drivers/scsi/fcoe/fcoe.c
5028 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
5029  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
5030  {
5031         struct fcoe_percpu_s *fps;
5032 -       int rc;
5033 +       int rc, cpu = get_cpu_light();
5034  
5035 -       fps = &get_cpu_var(fcoe_percpu);
5036 +       fps = &per_cpu(fcoe_percpu, cpu);
5037         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
5038 -       put_cpu_var(fcoe_percpu);
5039 +       put_cpu_light();
5040  
5041         return rc;
5042  }
5043 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
5044                 return 0;
5045         }
5046  
5047 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5048 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5049         stats->InvalidCRCCount++;
5050         if (stats->InvalidCRCCount < 5)
5051                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
5052 -       put_cpu();
5053 +       put_cpu_light();
5054         return -EINVAL;
5055  }
5056  
5057 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5058          */
5059         hp = (struct fcoe_hdr *) skb_network_header(skb);
5060  
5061 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5062 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5063         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
5064                 if (stats->ErrorFrames < 5)
5065                         printk(KERN_WARNING "fcoe: FCoE version "
5066 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5067                 goto drop;
5068  
5069         if (!fcoe_filter_frames(lport, fp)) {
5070 -               put_cpu();
5071 +               put_cpu_light();
5072                 fc_exch_recv(lport, fp);
5073                 return;
5074         }
5075  drop:
5076         stats->ErrorFrames++;
5077 -       put_cpu();
5078 +       put_cpu_light();
5079         kfree_skb(skb);
5080  }
5081  
5082 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
5083 index dcf36537a767..1a1f2e46452c 100644
5084 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
5085 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
5086 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5087  
5088         INIT_LIST_HEAD(&del_list);
5089  
5090 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
5091 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
5092  
5093         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
5094                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
5095 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5096                                 sel_time = fcf->time;
5097                 }
5098         }
5099 -       put_cpu();
5100 +       put_cpu_light();
5101  
5102         list_for_each_entry_safe(fcf, next, &del_list, list) {
5103                 /* Removes fcf from current list */
5104 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
5105 index 16ca31ad5ec0..c3987347e762 100644
5106 --- a/drivers/scsi/libfc/fc_exch.c
5107 +++ b/drivers/scsi/libfc/fc_exch.c
5108 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
5109         }
5110         memset(ep, 0, sizeof(*ep));
5111  
5112 -       cpu = get_cpu();
5113 +       cpu = get_cpu_light();
5114         pool = per_cpu_ptr(mp->pool, cpu);
5115         spin_lock_bh(&pool->lock);
5116 -       put_cpu();
5117 +       put_cpu_light();
5118  
5119         /* peek cache of free slot */
5120         if (pool->left != FC_XID_UNKNOWN) {
5121 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
5122 index 763f012fdeca..d0f61b595470 100644
5123 --- a/drivers/scsi/libsas/sas_ata.c
5124 +++ b/drivers/scsi/libsas/sas_ata.c
5125 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5126         /* TODO: audit callers to ensure they are ready for qc_issue to
5127          * unconditionally re-enable interrupts
5128          */
5129 -       local_irq_save(flags);
5130 +       local_irq_save_nort(flags);
5131         spin_unlock(ap->lock);
5132  
5133         /* If the device fell off, no sense in issuing commands */
5134 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5135  
5136   out:
5137         spin_lock(ap->lock);
5138 -       local_irq_restore(flags);
5139 +       local_irq_restore_nort(flags);
5140         return ret;
5141  }
5142  
5143 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
5144 index edc48f3b8230..ee5c6f9dfb6f 100644
5145 --- a/drivers/scsi/qla2xxx/qla_inline.h
5146 +++ b/drivers/scsi/qla2xxx/qla_inline.h
5147 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
5148  {
5149         unsigned long flags;
5150         struct qla_hw_data *ha = rsp->hw;
5151 -       local_irq_save(flags);
5152 +       local_irq_save_nort(flags);
5153         if (IS_P3P_TYPE(ha))
5154                 qla82xx_poll(0, rsp);
5155         else
5156                 ha->isp_ops->intr_handler(0, rsp);
5157 -       local_irq_restore(flags);
5158 +       local_irq_restore_nort(flags);
5159  }
5160  
5161  static inline uint8_t *
5162 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
5163 index 068c4e47fac9..a2090f640397 100644
5164 --- a/drivers/scsi/qla2xxx/qla_isr.c
5165 +++ b/drivers/scsi/qla2xxx/qla_isr.c
5166 @@ -3125,7 +3125,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
5167                 * kref_put().
5168                 */
5169                 kref_get(&qentry->irq_notify.kref);
5170 +#ifdef CONFIG_PREEMPT_RT_BASE
5171 +               swork_queue(&qentry->irq_notify.swork);
5172 +#else
5173                 schedule_work(&qentry->irq_notify.work);
5174 +#endif
5175         }
5176  
5177         /*
5178 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
5179 index 95f4c1bcdb4c..0be934799bff 100644
5180 --- a/drivers/thermal/x86_pkg_temp_thermal.c
5181 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
5182 @@ -29,6 +29,7 @@
5183  #include <linux/pm.h>
5184  #include <linux/thermal.h>
5185  #include <linux/debugfs.h>
5186 +#include <linux/swork.h>
5187  #include <asm/cpu_device_id.h>
5188  #include <asm/mce.h>
5189  
5190 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
5191         }
5192  }
5193  
5194 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5195 +static void platform_thermal_notify_work(struct swork_event *event)
5196  {
5197         unsigned long flags;
5198         int cpu = smp_processor_id();
5199 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5200                         pkg_work_scheduled[phy_id]) {
5201                 disable_pkg_thres_interrupt();
5202                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5203 -               return -EINVAL;
5204 +               return;
5205         }
5206         pkg_work_scheduled[phy_id] = 1;
5207         spin_unlock_irqrestore(&pkg_work_lock, flags);
5208 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5209         schedule_delayed_work_on(cpu,
5210                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5211                                 msecs_to_jiffies(notify_delay_ms));
5212 +}
5213 +
5214 +#ifdef CONFIG_PREEMPT_RT_FULL
5215 +static struct swork_event notify_work;
5216 +
5217 +static int thermal_notify_work_init(void)
5218 +{
5219 +       int err;
5220 +
5221 +       err = swork_get();
5222 +       if (err)
5223 +               return err;
5224 +
5225 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5226         return 0;
5227  }
5228  
5229 +static void thermal_notify_work_cleanup(void)
5230 +{
5231 +       swork_put();
5232 +}
5233 +
5234 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5235 +{
5236 +       swork_queue(&notify_work);
5237 +       return 0;
5238 +}
5239 +
5240 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5241 +
5242 +static int thermal_notify_work_init(void) { return 0; }
5243 +
5244 +static void thermal_notify_work_cleanup(void) {  }
5245 +
5246 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5247 +{
5248 +       platform_thermal_notify_work(NULL);
5249 +
5250 +       return 0;
5251 +}
5252 +#endif /* CONFIG_PREEMPT_RT_FULL */
5253 +
5254  static int find_siblings_cpu(int cpu)
5255  {
5256         int i;
5257 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5258         if (!x86_match_cpu(pkg_temp_thermal_ids))
5259                 return -ENODEV;
5260  
5261 +       if (!thermal_notify_work_init())
5262 +               return -ENODEV;
5263 +
5264         spin_lock_init(&pkg_work_lock);
5265         platform_thermal_package_notify =
5266                         pkg_temp_thermal_platform_thermal_notify;
5267 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5268         kfree(pkg_work_scheduled);
5269         platform_thermal_package_notify = NULL;
5270         platform_thermal_package_rate_control = NULL;
5271 -
5272 +       thermal_notify_work_cleanup();
5273         return -ENODEV;
5274  }
5275  
5276 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5277         mutex_unlock(&phy_dev_list_mutex);
5278         platform_thermal_package_notify = NULL;
5279         platform_thermal_package_rate_control = NULL;
5280 +       thermal_notify_work_cleanup();
5281         for_each_online_cpu(i)
5282                 cancel_delayed_work_sync(
5283                         &per_cpu(pkg_temp_thermal_threshold_work, i));
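Rough sketch of the swork lifecycle used above (and again in the qla2xxx and aio hunks of this patch); everything except the swork API itself is illustrative:

#include <linux/init.h>
#include <linux/swork.h>

static struct swork_event example_event;

static void example_event_fn(struct swork_event *sev)
{
	/* Runs in the "simple work" kernel thread, i.e. full task
	 * context, so sleeping locks and schedule_delayed_work_on()
	 * are legal here even on RT. */
}

static int __init example_init(void)
{
	int err = swork_get();		/* make sure the sworker thread exists */

	if (err)
		return err;
	INIT_SWORK(&example_event, example_event_fn);
	return 0;
}

static void example_notify(void)
{
	swork_queue(&example_event);	/* safe from hard-irq context */
}

static void __exit example_exit(void)
{
	swork_put();			/* drop our sworker reference */
}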
5284 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5285 index e8819aa20415..dd7f9bf45d6c 100644
5286 --- a/drivers/tty/serial/8250/8250_core.c
5287 +++ b/drivers/tty/serial/8250/8250_core.c
5288 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5289  
5290  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5291  
5292 -#define PASS_LIMIT     512
5293 +/*
5294 + * On -rt we can legitimately see more delays, so
5295 + * don't drop work spuriously and spam the
5296 + * syslog:
5297 + */
5298 +#ifdef CONFIG_PREEMPT_RT_FULL
5299 +# define PASS_LIMIT    1000000
5300 +#else
5301 +# define PASS_LIMIT    512
5302 +#endif
5303  
5304  #include <asm/serial.h>
5305  /*
5306 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5307 index 080d5a59d0a7..eecc4f111473 100644
5308 --- a/drivers/tty/serial/8250/8250_port.c
5309 +++ b/drivers/tty/serial/8250/8250_port.c
5310 @@ -35,6 +35,7 @@
5311  #include <linux/nmi.h>
5312  #include <linux/mutex.h>
5313  #include <linux/slab.h>
5314 +#include <linux/kdb.h>
5315  #include <linux/uaccess.h>
5316  #include <linux/pm_runtime.h>
5317  #include <linux/timer.h>
5318 @@ -3144,9 +3145,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5319  
5320         serial8250_rpm_get(up);
5321  
5322 -       if (port->sysrq)
5323 +       if (port->sysrq || oops_in_progress)
5324                 locked = 0;
5325 -       else if (oops_in_progress)
5326 +       else if (in_kdb_printk())
5327                 locked = spin_trylock_irqsave(&port->lock, flags);
5328         else
5329                 spin_lock_irqsave(&port->lock, flags);
5330 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5331 index e2c33b9528d8..53af53c43e8c 100644
5332 --- a/drivers/tty/serial/amba-pl011.c
5333 +++ b/drivers/tty/serial/amba-pl011.c
5334 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5335  
5336         clk_enable(uap->clk);
5337  
5338 -       local_irq_save(flags);
5339 +       /*
5340 +        * local_irq_save(flags);
5341 +        *
5342 +        * This local_irq_save() is nonsense. If we come in via sysrq
5343 +        * handling then interrupts are already disabled. Aside from
5344 +        * that, the port.sysrq check is racy on SMP regardless.
5345 +        */
5346         if (uap->port.sysrq)
5347                 locked = 0;
5348         else if (oops_in_progress)
5349 -               locked = spin_trylock(&uap->port.lock);
5350 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5351         else
5352 -               spin_lock(&uap->port.lock);
5353 +               spin_lock_irqsave(&uap->port.lock, flags);
5354  
5355         /*
5356          *      First save the CR then disable the interrupts
5357 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5358                 pl011_write(old_cr, uap, REG_CR);
5359  
5360         if (locked)
5361 -               spin_unlock(&uap->port.lock);
5362 -       local_irq_restore(flags);
5363 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5364  
5365         clk_disable(uap->clk);
5366  }
5367 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5368 index a2a529994ba5..0ee7c4c518df 100644
5369 --- a/drivers/tty/serial/omap-serial.c
5370 +++ b/drivers/tty/serial/omap-serial.c
5371 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5372  
5373         pm_runtime_get_sync(up->dev);
5374  
5375 -       local_irq_save(flags);
5376 -       if (up->port.sysrq)
5377 -               locked = 0;
5378 -       else if (oops_in_progress)
5379 -               locked = spin_trylock(&up->port.lock);
5380 +       if (up->port.sysrq || oops_in_progress)
5381 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5382         else
5383 -               spin_lock(&up->port.lock);
5384 +               spin_lock_irqsave(&up->port.lock, flags);
5385  
5386         /*
5387          * First save the IER then disable the interrupts
5388 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5389         pm_runtime_mark_last_busy(up->dev);
5390         pm_runtime_put_autosuspend(up->dev);
5391         if (locked)
5392 -               spin_unlock(&up->port.lock);
5393 -       local_irq_restore(flags);
5394 +               spin_unlock_irqrestore(&up->port.lock, flags);
5395  }
5396  
5397  static int __init
5398 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5399 index 479e223f9cff..3418a54b4131 100644
5400 --- a/drivers/usb/core/hcd.c
5401 +++ b/drivers/usb/core/hcd.c
5402 @@ -1761,9 +1761,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5403          * and no one may trigger the above deadlock situation when
5404          * running complete() in tasklet.
5405          */
5406 -       local_irq_save(flags);
5407 +       local_irq_save_nort(flags);
5408         urb->complete(urb);
5409 -       local_irq_restore(flags);
5410 +       local_irq_restore_nort(flags);
5411  
5412         usb_anchor_resume_wakeups(anchor);
5413         atomic_dec(&urb->use_count);
5414 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5415 index 89081b834615..90b231b7ad0a 100644
5416 --- a/drivers/usb/gadget/function/f_fs.c
5417 +++ b/drivers/usb/gadget/function/f_fs.c
5418 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5419                 pr_info("%s(): freeing\n", __func__);
5420                 ffs_data_clear(ffs);
5421                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5422 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5423 +                      swait_active(&ffs->ep0req_completion.wait));
5424                 kfree(ffs->dev_name);
5425                 kfree(ffs);
5426         }
5427 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5428 index 1468d8f085a3..6aae3ae25c18 100644
5429 --- a/drivers/usb/gadget/legacy/inode.c
5430 +++ b/drivers/usb/gadget/legacy/inode.c
5431 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5432         spin_unlock_irq (&epdata->dev->lock);
5433  
5434         if (likely (value == 0)) {
5435 -               value = wait_event_interruptible (done.wait, done.done);
5436 +               value = swait_event_interruptible (done.wait, done.done);
5437                 if (value != 0) {
5438                         spin_lock_irq (&epdata->dev->lock);
5439                         if (likely (epdata->ep != NULL)) {
5440 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5441                                 usb_ep_dequeue (epdata->ep, epdata->req);
5442                                 spin_unlock_irq (&epdata->dev->lock);
5443  
5444 -                               wait_event (done.wait, done.done);
5445 +                               swait_event (done.wait, done.done);
5446                                 if (epdata->status == -ECONNRESET)
5447                                         epdata->status = -EINTR;
5448                         } else {
5449 diff --git a/fs/aio.c b/fs/aio.c
5450 index 428484f2f841..2b02e2eb2158 100644
5451 --- a/fs/aio.c
5452 +++ b/fs/aio.c
5453 @@ -40,6 +40,7 @@
5454  #include <linux/ramfs.h>
5455  #include <linux/percpu-refcount.h>
5456  #include <linux/mount.h>
5457 +#include <linux/swork.h>
5458  
5459  #include <asm/kmap_types.h>
5460  #include <asm/uaccess.h>
5461 @@ -115,7 +116,7 @@ struct kioctx {
5462         struct page             **ring_pages;
5463         long                    nr_pages;
5464  
5465 -       struct work_struct      free_work;
5466 +       struct swork_event      free_work;
5467  
5468         /*
5469          * signals when all in-flight requests are done
5470 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5471                 .mount          = aio_mount,
5472                 .kill_sb        = kill_anon_super,
5473         };
5474 +       BUG_ON(swork_get());
5475         aio_mnt = kern_mount(&aio_fs);
5476         if (IS_ERR(aio_mnt))
5477                 panic("Failed to create aio fs mount.");
5478 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5479         return cancel(&kiocb->common);
5480  }
5481  
5482 -static void free_ioctx(struct work_struct *work)
5483 +static void free_ioctx(struct swork_event *sev)
5484  {
5485 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5486 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5487  
5488         pr_debug("freeing %p\n", ctx);
5489  
5490 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5491         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5492                 complete(&ctx->rq_wait->comp);
5493  
5494 -       INIT_WORK(&ctx->free_work, free_ioctx);
5495 -       schedule_work(&ctx->free_work);
5496 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5497 +       swork_queue(&ctx->free_work);
5498  }
5499  
5500  /*
5501 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5502   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5503   * now it's safe to cancel any that need to be.
5504   */
5505 -static void free_ioctx_users(struct percpu_ref *ref)
5506 +static void free_ioctx_users_work(struct swork_event *sev)
5507  {
5508 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5509 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5510         struct aio_kiocb *req;
5511  
5512         spin_lock_irq(&ctx->ctx_lock);
5513 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5514         percpu_ref_put(&ctx->reqs);
5515  }
5516  
5517 +static void free_ioctx_users(struct percpu_ref *ref)
5518 +{
5519 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5520 +
5521 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5522 +       swork_queue(&ctx->free_work);
5523 +}
5524 +
5525  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5526  {
5527         unsigned i, new_nr;
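The fs/aio.c hunk above moves the ioctx teardown off a regular workqueue and onto the swork ("simple work") infrastructure this patch introduces, so the deferred free runs from a schedulable kernel thread rather than directly from the percpu_ref release callbacks. A minimal sketch of that deferral pattern, assuming the <linux/swork.h> API added elsewhere in this patch and a hypothetical my_obj structure:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/swork.h>	/* provided elsewhere in this patch */

struct my_obj {
	struct swork_event free_work;
	/* ... payload ... */
};

/* Runs in the swork kernel thread, where sleeping is allowed. */
static void my_obj_free(struct swork_event *sev)
{
	struct my_obj *obj = container_of(sev, struct my_obj, free_work);

	kfree(obj);
}

/* Called from a context that must not sleep, mirroring free_ioctx_reqs(). */
static void my_obj_release(struct my_obj *obj)
{
	INIT_SWORK(&obj->free_work, my_obj_free);
	swork_queue(&obj->free_work);
}

/* Users of swork take a reference on the worker once, at init time,
 * just as the aio hunk does in aio_setup(). */
static int __init my_obj_init(void)
{
	BUG_ON(swork_get());
	return 0;
}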
5528 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5529 index a1fba4285277..3796769b4cd1 100644
5530 --- a/fs/autofs4/autofs_i.h
5531 +++ b/fs/autofs4/autofs_i.h
5532 @@ -31,6 +31,7 @@
5533  #include <linux/sched.h>
5534  #include <linux/mount.h>
5535  #include <linux/namei.h>
5536 +#include <linux/delay.h>
5537  #include <asm/current.h>
5538  #include <linux/uaccess.h>
5539  
5540 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5541 index d8e6d421c27f..2e689ab1306b 100644
5542 --- a/fs/autofs4/expire.c
5543 +++ b/fs/autofs4/expire.c
5544 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5545                         parent = p->d_parent;
5546                         if (!spin_trylock(&parent->d_lock)) {
5547                                 spin_unlock(&p->d_lock);
5548 -                               cpu_relax();
5549 +                               cpu_chill();
5550                                 goto relock;
5551                         }
5552                         spin_unlock(&p->d_lock);
5553 diff --git a/fs/buffer.c b/fs/buffer.c
5554 index b205a629001d..5646afc022ba 100644
5555 --- a/fs/buffer.c
5556 +++ b/fs/buffer.c
5557 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5558          * decide that the page is now completely done.
5559          */
5560         first = page_buffers(page);
5561 -       local_irq_save(flags);
5562 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5563 +       flags = bh_uptodate_lock_irqsave(first);
5564         clear_buffer_async_read(bh);
5565         unlock_buffer(bh);
5566         tmp = bh;
5567 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5568                 }
5569                 tmp = tmp->b_this_page;
5570         } while (tmp != bh);
5571 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5572 -       local_irq_restore(flags);
5573 +       bh_uptodate_unlock_irqrestore(first, flags);
5574  
5575         /*
5576          * If none of the buffers had errors and they are all
5577 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5578         return;
5579  
5580  still_busy:
5581 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5582 -       local_irq_restore(flags);
5583 -       return;
5584 +       bh_uptodate_unlock_irqrestore(first, flags);
5585  }
5586  
5587  /*
5588 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5589         }
5590  
5591         first = page_buffers(page);
5592 -       local_irq_save(flags);
5593 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5594 +       flags = bh_uptodate_lock_irqsave(first);
5595  
5596         clear_buffer_async_write(bh);
5597         unlock_buffer(bh);
5598 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5599                 }
5600                 tmp = tmp->b_this_page;
5601         }
5602 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5603 -       local_irq_restore(flags);
5604 +       bh_uptodate_unlock_irqrestore(first, flags);
5605         end_page_writeback(page);
5606         return;
5607  
5608  still_busy:
5609 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5610 -       local_irq_restore(flags);
5611 -       return;
5612 +       bh_uptodate_unlock_irqrestore(first, flags);
5613  }
5614  EXPORT_SYMBOL(end_buffer_async_write);
5615  
5616 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5617         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5618         if (ret) {
5619                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5620 +               buffer_head_init_locks(ret);
5621                 preempt_disable();
5622                 __this_cpu_inc(bh_accounting.nr);
5623                 recalc_bh_state();
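The fs/buffer.c changes above replace the open-coded local_irq_save() + bit_spin_lock(BH_Uptodate_Lock, ...) sequence with bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore(), helpers this patch adds to <linux/buffer_head.h> (see that hunk further down): on !RT they behave as before, while on PREEMPT_RT they take a per-buffer_head spinlock so the section stays preemptible. A minimal sketch of the pattern in a hypothetical end_io-style helper:

#include <linux/buffer_head.h>

/* Hypothetical completion helper: clear the async flag under the
 * uptodate lock of the page's first buffer, as end_buffer_async_read()
 * does in the hunk above. */
static void my_end_buffer_async(struct buffer_head *bh)
{
	struct buffer_head *first = page_buffers(bh->b_page);
	unsigned long flags;

	flags = bh_uptodate_lock_irqsave(first);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	bh_uptodate_unlock_irqrestore(first, flags);
}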
5624 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5625 index a27fc8791551..791aecb7c1ac 100644
5626 --- a/fs/cifs/readdir.c
5627 +++ b/fs/cifs/readdir.c
5628 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5629         struct inode *inode;
5630         struct super_block *sb = parent->d_sb;
5631         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5632 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5633 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5634  
5635         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5636  
5637 diff --git a/fs/dcache.c b/fs/dcache.c
5638 index 4485a48f4091..691039a6a872 100644
5639 --- a/fs/dcache.c
5640 +++ b/fs/dcache.c
5641 @@ -19,6 +19,7 @@
5642  #include <linux/mm.h>
5643  #include <linux/fs.h>
5644  #include <linux/fsnotify.h>
5645 +#include <linux/delay.h>
5646  #include <linux/slab.h>
5647  #include <linux/init.h>
5648  #include <linux/hash.h>
5649 @@ -750,6 +751,8 @@ static inline bool fast_dput(struct dentry *dentry)
5650   */
5651  void dput(struct dentry *dentry)
5652  {
5653 +       struct dentry *parent;
5654 +
5655         if (unlikely(!dentry))
5656                 return;
5657  
5658 @@ -788,9 +791,18 @@ void dput(struct dentry *dentry)
5659         return;
5660  
5661  kill_it:
5662 -       dentry = dentry_kill(dentry);
5663 -       if (dentry) {
5664 -               cond_resched();
5665 +       parent = dentry_kill(dentry);
5666 +       if (parent) {
5667 +               int r;
5668 +
5669 +               if (parent == dentry) {
5670 +                       /* the task with the highest priority won't schedule */
5671 +                       r = cond_resched();
5672 +                       if (!r)
5673 +                               cpu_chill();
5674 +               } else {
5675 +                       dentry = parent;
5676 +               }
5677                 goto repeat;
5678         }
5679  }
5680 @@ -2324,7 +2336,7 @@ void d_delete(struct dentry * dentry)
5681         if (dentry->d_lockref.count == 1) {
5682                 if (!spin_trylock(&inode->i_lock)) {
5683                         spin_unlock(&dentry->d_lock);
5684 -                       cpu_relax();
5685 +                       cpu_chill();
5686                         goto again;
5687                 }
5688                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5689 @@ -2384,21 +2396,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
5690  
5691  static void d_wait_lookup(struct dentry *dentry)
5692  {
5693 -       if (d_in_lookup(dentry)) {
5694 -               DECLARE_WAITQUEUE(wait, current);
5695 -               add_wait_queue(dentry->d_wait, &wait);
5696 -               do {
5697 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5698 -                       spin_unlock(&dentry->d_lock);
5699 -                       schedule();
5700 -                       spin_lock(&dentry->d_lock);
5701 -               } while (d_in_lookup(dentry));
5702 -       }
5703 +       struct swait_queue __wait;
5704 +
5705 +       if (!d_in_lookup(dentry))
5706 +               return;
5707 +
5708 +       INIT_LIST_HEAD(&__wait.task_list);
5709 +       do {
5710 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5711 +               spin_unlock(&dentry->d_lock);
5712 +               schedule();
5713 +               spin_lock(&dentry->d_lock);
5714 +       } while (d_in_lookup(dentry));
5715 +       finish_swait(dentry->d_wait, &__wait);
5716  }
5717  
5718  struct dentry *d_alloc_parallel(struct dentry *parent,
5719                                 const struct qstr *name,
5720 -                               wait_queue_head_t *wq)
5721 +                               struct swait_queue_head *wq)
5722  {
5723         unsigned int hash = name->hash;
5724         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5725 @@ -2507,7 +2522,7 @@ void __d_lookup_done(struct dentry *dentry)
5726         hlist_bl_lock(b);
5727         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5728         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5729 -       wake_up_all(dentry->d_wait);
5730 +       swake_up_all(dentry->d_wait);
5731         dentry->d_wait = NULL;
5732         hlist_bl_unlock(b);
5733         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5734 @@ -3604,6 +3619,11 @@ EXPORT_SYMBOL(d_genocide);
5735  
5736  void __init vfs_caches_init_early(void)
5737  {
5738 +       int i;
5739 +
5740 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5741 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5742 +
5743         dcache_init_early();
5744         inode_init_early();
5745  }
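The dcache hunks above (like the autofs4 and namespace ones earlier) replace cpu_relax() with cpu_chill() in trylock retry loops. On PREEMPT_RT a plain busy-wait can starve the lower-priority task that holds the contended lock, so cpu_chill() (added to <linux/delay.h> later in this patch) sleeps briefly instead; on !RT it still maps to cpu_relax(). A minimal sketch of the retry pattern with two hypothetical locks:

#include <linux/spinlock.h>
#include <linux/delay.h>	/* cpu_chill(): short sleep on RT, cpu_relax() otherwise */

static DEFINE_SPINLOCK(lock_a);
static DEFINE_SPINLOCK(lock_b);

static void lock_both(void)
{
again:
	spin_lock(&lock_a);
	if (!spin_trylock(&lock_b)) {
		/* Drop lock_a and back off so the holder of lock_b can run. */
		spin_unlock(&lock_a);
		cpu_chill();
		goto again;
	}

	/* ... both locks held ... */

	spin_unlock(&lock_b);
	spin_unlock(&lock_a);
}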
5746 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5747 index 10db91218933..42af0a06f657 100644
5748 --- a/fs/eventpoll.c
5749 +++ b/fs/eventpoll.c
5750 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5751   */
5752  static void ep_poll_safewake(wait_queue_head_t *wq)
5753  {
5754 -       int this_cpu = get_cpu();
5755 +       int this_cpu = get_cpu_light();
5756  
5757         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5758                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5759  
5760 -       put_cpu();
5761 +       put_cpu_light();
5762  }
5763  
5764  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5765 diff --git a/fs/exec.c b/fs/exec.c
5766 index 67e86571685a..fe14cdd84016 100644
5767 --- a/fs/exec.c
5768 +++ b/fs/exec.c
5769 @@ -1017,12 +1017,14 @@ static int exec_mmap(struct mm_struct *mm)
5770                 }
5771         }
5772         task_lock(tsk);
5773 +       preempt_disable_rt();
5774         active_mm = tsk->active_mm;
5775         tsk->mm = mm;
5776         tsk->active_mm = mm;
5777         activate_mm(active_mm, mm);
5778         tsk->mm->vmacache_seqnum = 0;
5779         vmacache_flush(tsk);
5780 +       preempt_enable_rt();
5781         task_unlock(tsk);
5782         if (old_mm) {
5783                 up_read(&old_mm->mmap_sem);
5784 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
5785 index 642c57b8de7b..8494b9308333 100644
5786 --- a/fs/fuse/dir.c
5787 +++ b/fs/fuse/dir.c
5788 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
5789         struct inode *dir = d_inode(parent);
5790         struct fuse_conn *fc;
5791         struct inode *inode;
5792 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5793 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5794  
5795         if (!o->nodeid) {
5796                 /*
5797 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
5798 index 684996c8a3a4..6e18a06aaabe 100644
5799 --- a/fs/jbd2/checkpoint.c
5800 +++ b/fs/jbd2/checkpoint.c
5801 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
5802         nblocks = jbd2_space_needed(journal);
5803         while (jbd2_log_space_left(journal) < nblocks) {
5804                 write_unlock(&journal->j_state_lock);
5805 +               if (current->plug)
5806 +                       io_schedule();
5807                 mutex_lock(&journal->j_checkpoint_mutex);
5808  
5809                 /*
5810 diff --git a/fs/locks.c b/fs/locks.c
5811 index 22c5b4aa4961..269c6a44449a 100644
5812 --- a/fs/locks.c
5813 +++ b/fs/locks.c
5814 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5815                         return -ENOMEM;
5816         }
5817  
5818 -       percpu_down_read_preempt_disable(&file_rwsem);
5819 +       percpu_down_read(&file_rwsem);
5820         spin_lock(&ctx->flc_lock);
5821         if (request->fl_flags & FL_ACCESS)
5822                 goto find_conflict;
5823 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5824  
5825  out:
5826         spin_unlock(&ctx->flc_lock);
5827 -       percpu_up_read_preempt_enable(&file_rwsem);
5828 +       percpu_up_read(&file_rwsem);
5829         if (new_fl)
5830                 locks_free_lock(new_fl);
5831         locks_dispose_list(&dispose);
5832 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5833                 new_fl2 = locks_alloc_lock();
5834         }
5835  
5836 -       percpu_down_read_preempt_disable(&file_rwsem);
5837 +       percpu_down_read(&file_rwsem);
5838         spin_lock(&ctx->flc_lock);
5839         /*
5840          * New lock request. Walk all POSIX locks and look for conflicts. If
5841 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5842         }
5843   out:
5844         spin_unlock(&ctx->flc_lock);
5845 -       percpu_up_read_preempt_enable(&file_rwsem);
5846 +       percpu_up_read(&file_rwsem);
5847         /*
5848          * Free any unused locks.
5849          */
5850 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5851                 return error;
5852         }
5853  
5854 -       percpu_down_read_preempt_disable(&file_rwsem);
5855 +       percpu_down_read(&file_rwsem);
5856         spin_lock(&ctx->flc_lock);
5857  
5858         time_out_leases(inode, &dispose);
5859 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5860         locks_insert_block(fl, new_fl);
5861         trace_break_lease_block(inode, new_fl);
5862         spin_unlock(&ctx->flc_lock);
5863 -       percpu_up_read_preempt_enable(&file_rwsem);
5864 +       percpu_up_read(&file_rwsem);
5865  
5866         locks_dispose_list(&dispose);
5867         error = wait_event_interruptible_timeout(new_fl->fl_wait,
5868                                                 !new_fl->fl_next, break_time);
5869  
5870 -       percpu_down_read_preempt_disable(&file_rwsem);
5871 +       percpu_down_read(&file_rwsem);
5872         spin_lock(&ctx->flc_lock);
5873         trace_break_lease_unblock(inode, new_fl);
5874         locks_delete_block(new_fl);
5875 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5876         }
5877  out:
5878         spin_unlock(&ctx->flc_lock);
5879 -       percpu_up_read_preempt_enable(&file_rwsem);
5880 +       percpu_up_read(&file_rwsem);
5881         locks_dispose_list(&dispose);
5882         locks_free_lock(new_fl);
5883         return error;
5884 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
5885  
5886         ctx = smp_load_acquire(&inode->i_flctx);
5887         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5888 -               percpu_down_read_preempt_disable(&file_rwsem);
5889 +               percpu_down_read(&file_rwsem);
5890                 spin_lock(&ctx->flc_lock);
5891                 time_out_leases(inode, &dispose);
5892                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5893 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
5894                         break;
5895                 }
5896                 spin_unlock(&ctx->flc_lock);
5897 -               percpu_up_read_preempt_enable(&file_rwsem);
5898 +               percpu_up_read(&file_rwsem);
5899  
5900                 locks_dispose_list(&dispose);
5901         }
5902 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5903                 return -EINVAL;
5904         }
5905  
5906 -       percpu_down_read_preempt_disable(&file_rwsem);
5907 +       percpu_down_read(&file_rwsem);
5908         spin_lock(&ctx->flc_lock);
5909         time_out_leases(inode, &dispose);
5910         error = check_conflicting_open(dentry, arg, lease->fl_flags);
5911 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5912                 lease->fl_lmops->lm_setup(lease, priv);
5913  out:
5914         spin_unlock(&ctx->flc_lock);
5915 -       percpu_up_read_preempt_enable(&file_rwsem);
5916 +       percpu_up_read(&file_rwsem);
5917         locks_dispose_list(&dispose);
5918         if (is_deleg)
5919                 inode_unlock(inode);
5920 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5921                 return error;
5922         }
5923  
5924 -       percpu_down_read_preempt_disable(&file_rwsem);
5925 +       percpu_down_read(&file_rwsem);
5926         spin_lock(&ctx->flc_lock);
5927         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5928                 if (fl->fl_file == filp &&
5929 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5930         if (victim)
5931                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5932         spin_unlock(&ctx->flc_lock);
5933 -       percpu_up_read_preempt_enable(&file_rwsem);
5934 +       percpu_up_read(&file_rwsem);
5935         locks_dispose_list(&dispose);
5936         return error;
5937  }
5938 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
5939         if (list_empty(&ctx->flc_lease))
5940                 return;
5941  
5942 -       percpu_down_read_preempt_disable(&file_rwsem);
5943 +       percpu_down_read(&file_rwsem);
5944         spin_lock(&ctx->flc_lock);
5945         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5946                 if (filp == fl->fl_file)
5947                         lease_modify(fl, F_UNLCK, &dispose);
5948         spin_unlock(&ctx->flc_lock);
5949 -       percpu_up_read_preempt_enable(&file_rwsem);
5950 +       percpu_up_read(&file_rwsem);
5951  
5952         locks_dispose_list(&dispose);
5953  }
5954 diff --git a/fs/namei.c b/fs/namei.c
5955 index d5e5140c1045..150fbdd8e04c 100644
5956 --- a/fs/namei.c
5957 +++ b/fs/namei.c
5958 @@ -1626,7 +1626,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
5959  {
5960         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5961         struct inode *inode = dir->d_inode;
5962 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5963 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5964  
5965         inode_lock_shared(inode);
5966         /* Don't go there if it's already dead */
5967 @@ -3083,7 +3083,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
5968         struct dentry *dentry;
5969         int error, create_error = 0;
5970         umode_t mode = op->mode;
5971 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5972 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5973  
5974         if (unlikely(IS_DEADDIR(dir_inode)))
5975                 return -ENOENT;
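With this patch d_alloc_parallel() takes a struct swait_queue_head * (see the fs/dcache.c and include/linux/dcache.h hunks), so the on-stack wait queues in namei, cifs, fuse, NFS and proc callers become simple wait queues. A minimal sketch of the adjusted calling convention, with parent and name supplied by a hypothetical caller:

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/swait.h>

/* Hypothetical slow-path lookup, following the shape of lookup_slow(). */
static struct dentry *my_lookup_slow(struct dentry *parent,
				     const struct qstr *name)
{
	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
	struct dentry *dentry;

	dentry = d_alloc_parallel(parent, name, &wq);
	if (IS_ERR(dentry))
		return dentry;

	/* ... ->lookup()/d_splice_alias() handling as in lookup_slow() ... */
	return dentry;
}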
5976 diff --git a/fs/namespace.c b/fs/namespace.c
5977 index 5e35057f07ac..843d274ba167 100644
5978 --- a/fs/namespace.c
5979 +++ b/fs/namespace.c
5980 @@ -14,6 +14,7 @@
5981  #include <linux/mnt_namespace.h>
5982  #include <linux/user_namespace.h>
5983  #include <linux/namei.h>
5984 +#include <linux/delay.h>
5985  #include <linux/security.h>
5986  #include <linux/idr.h>
5987  #include <linux/init.h>                /* init_rootfs */
5988 @@ -356,8 +357,11 @@ int __mnt_want_write(struct vfsmount *m)
5989          * incremented count after it has set MNT_WRITE_HOLD.
5990          */
5991         smp_mb();
5992 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5993 -               cpu_relax();
5994 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5995 +               preempt_enable();
5996 +               cpu_chill();
5997 +               preempt_disable();
5998 +       }
5999         /*
6000          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
6001          * be set to match its requirements. So we must not load that until
6002 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
6003 index dff600ae0d74..d726d2e09353 100644
6004 --- a/fs/nfs/delegation.c
6005 +++ b/fs/nfs/delegation.c
6006 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
6007                 sp = state->owner;
6008                 /* Block nfs4_proc_unlck */
6009                 mutex_lock(&sp->so_delegreturn_mutex);
6010 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6011 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
6012                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
6013                 if (!err)
6014                         err = nfs_delegation_claim_locks(ctx, state, stateid);
6015 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6016 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
6017                         err = -EAGAIN;
6018                 mutex_unlock(&sp->so_delegreturn_mutex);
6019                 put_nfs_open_context(ctx);
6020 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
6021 index 53e02b8bd9bd..a66e7d77cfbb 100644
6022 --- a/fs/nfs/dir.c
6023 +++ b/fs/nfs/dir.c
6024 @@ -485,7 +485,7 @@ static
6025  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
6026  {
6027         struct qstr filename = QSTR_INIT(entry->name, entry->len);
6028 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6029 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6030         struct dentry *dentry;
6031         struct dentry *alias;
6032         struct inode *dir = d_inode(parent);
6033 @@ -1487,7 +1487,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
6034                     struct file *file, unsigned open_flags,
6035                     umode_t mode, int *opened)
6036  {
6037 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6038 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6039         struct nfs_open_context *ctx;
6040         struct dentry *res;
6041         struct iattr attr = { .ia_valid = ATTR_OPEN };
6042 @@ -1802,7 +1802,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6043  
6044         trace_nfs_rmdir_enter(dir, dentry);
6045         if (d_really_is_positive(dentry)) {
6046 +#ifdef CONFIG_PREEMPT_RT_BASE
6047 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
6048 +#else
6049                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6050 +#endif
6051                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6052                 /* Ensure the VFS deletes this inode */
6053                 switch (error) {
6054 @@ -1812,7 +1816,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6055                 case -ENOENT:
6056                         nfs_dentry_handle_enoent(dentry);
6057                 }
6058 +#ifdef CONFIG_PREEMPT_RT_BASE
6059 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
6060 +#else
6061                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6062 +#endif
6063         } else
6064                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6065         trace_nfs_rmdir_exit(dir, dentry, error);
6066 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
6067 index bf4ec5ecc97e..36cd5fc9192c 100644
6068 --- a/fs/nfs/inode.c
6069 +++ b/fs/nfs/inode.c
6070 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
6071         nfsi->nrequests = 0;
6072         nfsi->commit_info.ncommit = 0;
6073         atomic_set(&nfsi->commit_info.rpcs_out, 0);
6074 +#ifdef CONFIG_PREEMPT_RT_BASE
6075 +       sema_init(&nfsi->rmdir_sem, 1);
6076 +#else
6077         init_rwsem(&nfsi->rmdir_sem);
6078 +#endif
6079         nfs4_init_once(nfsi);
6080  }
6081  
6082 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
6083 index 1452177c822d..f43b01d54c59 100644
6084 --- a/fs/nfs/nfs4_fs.h
6085 +++ b/fs/nfs/nfs4_fs.h
6086 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
6087         unsigned long        so_flags;
6088         struct list_head     so_states;
6089         struct nfs_seqid_counter so_seqid;
6090 -       seqcount_t           so_reclaim_seqcount;
6091 +       seqlock_t            so_reclaim_seqlock;
6092         struct mutex         so_delegreturn_mutex;
6093  };
6094  
6095 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
6096 index 1536aeb0abab..0a8bc7eab083 100644
6097 --- a/fs/nfs/nfs4proc.c
6098 +++ b/fs/nfs/nfs4proc.c
6099 @@ -2698,7 +2698,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6100         unsigned int seq;
6101         int ret;
6102  
6103 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6104 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6105  
6106         ret = _nfs4_proc_open(opendata);
6107         if (ret != 0)
6108 @@ -2736,7 +2736,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6109  
6110         if (d_inode(dentry) == state->inode) {
6111                 nfs_inode_attach_open_context(ctx);
6112 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6113 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
6114                         nfs4_schedule_stateid_recovery(server, state);
6115         }
6116  out:
6117 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
6118 index 0959c9661662..dabd834d7686 100644
6119 --- a/fs/nfs/nfs4state.c
6120 +++ b/fs/nfs/nfs4state.c
6121 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
6122         nfs4_init_seqid_counter(&sp->so_seqid);
6123         atomic_set(&sp->so_count, 1);
6124         INIT_LIST_HEAD(&sp->so_lru);
6125 -       seqcount_init(&sp->so_reclaim_seqcount);
6126 +       seqlock_init(&sp->so_reclaim_seqlock);
6127         mutex_init(&sp->so_delegreturn_mutex);
6128         return sp;
6129  }
6130 @@ -1497,8 +1497,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6131          * recovering after a network partition or a reboot from a
6132          * server that doesn't support a grace period.
6133          */
6134 +#ifdef CONFIG_PREEMPT_RT_FULL
6135 +       write_seqlock(&sp->so_reclaim_seqlock);
6136 +#else
6137 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6138 +#endif
6139         spin_lock(&sp->so_lock);
6140 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
6141  restart:
6142         list_for_each_entry(state, &sp->so_states, open_states) {
6143                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
6144 @@ -1567,14 +1571,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6145                 spin_lock(&sp->so_lock);
6146                 goto restart;
6147         }
6148 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6149         spin_unlock(&sp->so_lock);
6150 +#ifdef CONFIG_PREEMPT_RT_FULL
6151 +       write_sequnlock(&sp->so_reclaim_seqlock);
6152 +#else
6153 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6154 +#endif
6155         return 0;
6156  out_err:
6157         nfs4_put_open_state(state);
6158 -       spin_lock(&sp->so_lock);
6159 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6160 -       spin_unlock(&sp->so_lock);
6161 +#ifdef CONFIG_PREEMPT_RT_FULL
6162 +       write_sequnlock(&sp->so_reclaim_seqlock);
6163 +#else
6164 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6165 +#endif
6166         return status;
6167  }
6168  
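The NFSv4 hunks above widen so_reclaim_seqcount into a full seqlock_t: readers switch to read_seqbegin()/read_seqretry(), and on PREEMPT_RT the writer takes the seqlock proper, while the !RT branch keeps writing only the embedded seqcount, preserving the previous behaviour. A minimal sketch of the general reader/writer pairing with a hypothetical seqlock-protected value:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static unsigned long my_value;

static unsigned long my_value_read(void)
{
	unsigned long v;
	unsigned int seq;

	do {
		seq = read_seqbegin(&my_seqlock);
		v = my_value;
	} while (read_seqretry(&my_seqlock, seq));

	return v;
}

static void my_value_write(unsigned long v)
{
	write_seqlock(&my_seqlock);	/* takes the seqlock's internal spinlock */
	my_value = v;
	write_sequnlock(&my_seqlock);
}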
6169 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
6170 index 191aa577dd1f..58990c8f52e0 100644
6171 --- a/fs/nfs/unlink.c
6172 +++ b/fs/nfs/unlink.c
6173 @@ -12,7 +12,7 @@
6174  #include <linux/sunrpc/clnt.h>
6175  #include <linux/nfs_fs.h>
6176  #include <linux/sched.h>
6177 -#include <linux/wait.h>
6178 +#include <linux/swait.h>
6179  #include <linux/namei.h>
6180  #include <linux/fsnotify.h>
6181  
6182 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
6183                 rpc_restart_call_prepare(task);
6184  }
6185  
6186 +#ifdef CONFIG_PREEMPT_RT_BASE
6187 +static void nfs_down_anon(struct semaphore *sema)
6188 +{
6189 +       down(sema);
6190 +}
6191 +
6192 +static void nfs_up_anon(struct semaphore *sema)
6193 +{
6194 +       up(sema);
6195 +}
6196 +
6197 +#else
6198 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6199 +{
6200 +       down_read_non_owner(rwsem);
6201 +}
6202 +
6203 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6204 +{
6205 +       up_read_non_owner(rwsem);
6206 +}
6207 +#endif
6208 +
6209  /**
6210   * nfs_async_unlink_release - Release the sillydelete data.
6211   * @task: rpc_task of the sillydelete
6212 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6213         struct dentry *dentry = data->dentry;
6214         struct super_block *sb = dentry->d_sb;
6215  
6216 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6217 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6218         d_lookup_done(dentry);
6219         nfs_free_unlinkdata(data);
6220         dput(dentry);
6221 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6222         struct inode *dir = d_inode(dentry->d_parent);
6223         struct dentry *alias;
6224  
6225 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6226 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6227         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6228         if (IS_ERR(alias)) {
6229 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6230 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6231                 return 0;
6232         }
6233         if (!d_in_lookup(alias)) {
6234 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6235                         ret = 0;
6236                 spin_unlock(&alias->d_lock);
6237                 dput(alias);
6238 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6239 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6240                 /*
6241                  * If we'd displaced old cached devname, free it.  At that
6242                  * point dentry is definitely not a root, so we won't need
6243 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6244                 goto out_free_name;
6245         }
6246         data->res.dir_attr = &data->dir_attr;
6247 -       init_waitqueue_head(&data->wq);
6248 +       init_swait_queue_head(&data->wq);
6249  
6250         status = -EBUSY;
6251         spin_lock(&dentry->d_lock);
6252 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6253 index fe251f187ff8..e89da4fb14c2 100644
6254 --- a/fs/ntfs/aops.c
6255 +++ b/fs/ntfs/aops.c
6256 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6257                         ofs = 0;
6258                         if (file_ofs < init_size)
6259                                 ofs = init_size - file_ofs;
6260 -                       local_irq_save(flags);
6261 +                       local_irq_save_nort(flags);
6262                         kaddr = kmap_atomic(page);
6263                         memset(kaddr + bh_offset(bh) + ofs, 0,
6264                                         bh->b_size - ofs);
6265                         flush_dcache_page(page);
6266                         kunmap_atomic(kaddr);
6267 -                       local_irq_restore(flags);
6268 +                       local_irq_restore_nort(flags);
6269                 }
6270         } else {
6271                 clear_buffer_uptodate(bh);
6272 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6273                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6274         }
6275         first = page_buffers(page);
6276 -       local_irq_save(flags);
6277 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6278 +       flags = bh_uptodate_lock_irqsave(first);
6279         clear_buffer_async_read(bh);
6280         unlock_buffer(bh);
6281         tmp = bh;
6282 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6283                 }
6284                 tmp = tmp->b_this_page;
6285         } while (tmp != bh);
6286 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6287 -       local_irq_restore(flags);
6288 +       bh_uptodate_unlock_irqrestore(first, flags);
6289         /*
6290          * If none of the buffers had errors then we can set the page uptodate,
6291          * but we first have to perform the post read mst fixups, if the
6292 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6293                 recs = PAGE_SIZE / rec_size;
6294                 /* Should have been verified before we got here... */
6295                 BUG_ON(!recs);
6296 -               local_irq_save(flags);
6297 +               local_irq_save_nort(flags);
6298                 kaddr = kmap_atomic(page);
6299                 for (i = 0; i < recs; i++)
6300                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6301                                         i * rec_size), rec_size);
6302                 kunmap_atomic(kaddr);
6303 -               local_irq_restore(flags);
6304 +               local_irq_restore_nort(flags);
6305                 flush_dcache_page(page);
6306                 if (likely(page_uptodate && !PageError(page)))
6307                         SetPageUptodate(page);
6308 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6309         unlock_page(page);
6310         return;
6311  still_busy:
6312 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6313 -       local_irq_restore(flags);
6314 -       return;
6315 +       bh_uptodate_unlock_irqrestore(first, flags);
6316  }
6317  
6318  /**
6319 diff --git a/fs/proc/base.c b/fs/proc/base.c
6320 index ca651ac00660..41d9dc789285 100644
6321 --- a/fs/proc/base.c
6322 +++ b/fs/proc/base.c
6323 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6324  
6325         child = d_hash_and_lookup(dir, &qname);
6326         if (!child) {
6327 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6328 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6329                 child = d_alloc_parallel(dir, &qname, &wq);
6330                 if (IS_ERR(child))
6331                         goto end_instantiate;
6332 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6333 index d4e37acd4821..000cea46434a 100644
6334 --- a/fs/proc/proc_sysctl.c
6335 +++ b/fs/proc/proc_sysctl.c
6336 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6337  
6338         child = d_lookup(dir, &qname);
6339         if (!child) {
6340 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6341 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6342                 child = d_alloc_parallel(dir, &qname, &wq);
6343                 if (IS_ERR(child))
6344                         return false;
6345 diff --git a/fs/timerfd.c b/fs/timerfd.c
6346 index 9ae4abb4110b..8644b67c48fd 100644
6347 --- a/fs/timerfd.c
6348 +++ b/fs/timerfd.c
6349 @@ -460,7 +460,10 @@ static int do_timerfd_settime(int ufd, int flags,
6350                                 break;
6351                 }
6352                 spin_unlock_irq(&ctx->wqh.lock);
6353 -               cpu_relax();
6354 +               if (isalarm(ctx))
6355 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6356 +               else
6357 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6358         }
6359  
6360         /*
6361 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6362 index e861a24f06f2..b5c97d3059c7 100644
6363 --- a/include/acpi/platform/aclinux.h
6364 +++ b/include/acpi/platform/aclinux.h
6365 @@ -133,6 +133,7 @@
6366  
6367  #define acpi_cache_t                        struct kmem_cache
6368  #define acpi_spinlock                       spinlock_t *
6369 +#define acpi_raw_spinlock              raw_spinlock_t *
6370  #define acpi_cpu_flags                      unsigned long
6371  
6372  /* Use native linux version of acpi_os_allocate_zeroed */
6373 @@ -151,6 +152,20 @@
6374  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6375  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6376  
6377 +#define acpi_os_create_raw_lock(__handle)                      \
6378 +({                                                             \
6379 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6380 +                                                               \
6381 +        if (lock) {                                            \
6382 +               *(__handle) = lock;                             \
6383 +               raw_spin_lock_init(*(__handle));                \
6384 +        }                                                      \
6385 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6386 + })
6387 +
6388 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6389 +
6390 +
6391  /*
6392   * OSL interfaces used by debugger/disassembler
6393   */
6394 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6395 index 6f96247226a4..fa53a21263c2 100644
6396 --- a/include/asm-generic/bug.h
6397 +++ b/include/asm-generic/bug.h
6398 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6399  # define WARN_ON_SMP(x)                        ({0;})
6400  #endif
6401  
6402 +#ifdef CONFIG_PREEMPT_RT_BASE
6403 +# define BUG_ON_RT(c)                  BUG_ON(c)
6404 +# define BUG_ON_NONRT(c)               do { } while (0)
6405 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6406 +# define WARN_ON_NONRT(condition)      do { } while (0)
6407 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6408 +#else
6409 +# define BUG_ON_RT(c)                  do { } while (0)
6410 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6411 +# define WARN_ON_RT(condition)         do { } while (0)
6412 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6413 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6414 +#endif
6415 +
6416  #endif /* __ASSEMBLY__ */
6417  
6418  #endif
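The asm-generic/bug.h hunk above adds _RT/_NONRT variants of BUG_ON/WARN_ON so that assertions which only hold on one of the two configurations can stay in the code without #ifdef clutter. A minimal usage sketch (hypothetical function; the irqs_disabled() check is a typical case, since many !RT fast paths run with interrupts off while their RT equivalents run in task context):

#include <linux/bug.h>
#include <linux/irqflags.h>

static void my_fast_path(void)
{
	/* Must run with interrupts disabled on !RT; compiled out on RT. */
	WARN_ON_NONRT(!irqs_disabled());

	/* ... */
}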
6419 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6420 index 535ab2e13d2e..cfc246899473 100644
6421 --- a/include/linux/blk-mq.h
6422 +++ b/include/linux/blk-mq.h
6423 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6424         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6425  }
6426  
6427 -
6428 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6429  int blk_mq_request_started(struct request *rq);
6430  void blk_mq_start_request(struct request *rq);
6431  void blk_mq_end_request(struct request *rq, int error);
6432 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6433 index f6a816129856..ec7a4676f8a8 100644
6434 --- a/include/linux/blkdev.h
6435 +++ b/include/linux/blkdev.h
6436 @@ -89,6 +89,7 @@ struct request {
6437         struct list_head queuelist;
6438         union {
6439                 struct call_single_data csd;
6440 +               struct work_struct work;
6441                 u64 fifo_time;
6442         };
6443  
6444 @@ -467,7 +468,7 @@ struct request_queue {
6445         struct throtl_data *td;
6446  #endif
6447         struct rcu_head         rcu_head;
6448 -       wait_queue_head_t       mq_freeze_wq;
6449 +       struct swait_queue_head mq_freeze_wq;
6450         struct percpu_ref       q_usage_counter;
6451         struct list_head        all_q_node;
6452  
6453 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6454 index 8fdcb783197d..d07dbeec7bc1 100644
6455 --- a/include/linux/bottom_half.h
6456 +++ b/include/linux/bottom_half.h
6457 @@ -3,6 +3,39 @@
6458  
6459  #include <linux/preempt.h>
6460  
6461 +#ifdef CONFIG_PREEMPT_RT_FULL
6462 +
6463 +extern void __local_bh_disable(void);
6464 +extern void _local_bh_enable(void);
6465 +extern void __local_bh_enable(void);
6466 +
6467 +static inline void local_bh_disable(void)
6468 +{
6469 +       __local_bh_disable();
6470 +}
6471 +
6472 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6473 +{
6474 +       __local_bh_disable();
6475 +}
6476 +
6477 +static inline void local_bh_enable(void)
6478 +{
6479 +       __local_bh_enable();
6480 +}
6481 +
6482 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6483 +{
6484 +       __local_bh_enable();
6485 +}
6486 +
6487 +static inline void local_bh_enable_ip(unsigned long ip)
6488 +{
6489 +       __local_bh_enable();
6490 +}
6491 +
6492 +#else
6493 +
6494  #ifdef CONFIG_TRACE_IRQFLAGS
6495  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6496  #else
6497 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6498  {
6499         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6500  }
6501 +#endif
6502  
6503  #endif /* _LINUX_BH_H */
6504 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6505 index ebbacd14d450..be5e87f6360a 100644
6506 --- a/include/linux/buffer_head.h
6507 +++ b/include/linux/buffer_head.h
6508 @@ -75,8 +75,50 @@ struct buffer_head {
6509         struct address_space *b_assoc_map;      /* mapping this buffer is
6510                                                    associated with */
6511         atomic_t b_count;               /* users using this buffer_head */
6512 +#ifdef CONFIG_PREEMPT_RT_BASE
6513 +       spinlock_t b_uptodate_lock;
6514 +#if IS_ENABLED(CONFIG_JBD2)
6515 +       spinlock_t b_state_lock;
6516 +       spinlock_t b_journal_head_lock;
6517 +#endif
6518 +#endif
6519  };
6520  
6521 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6522 +{
6523 +       unsigned long flags;
6524 +
6525 +#ifndef CONFIG_PREEMPT_RT_BASE
6526 +       local_irq_save(flags);
6527 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6528 +#else
6529 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6530 +#endif
6531 +       return flags;
6532 +}
6533 +
6534 +static inline void
6535 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6536 +{
6537 +#ifndef CONFIG_PREEMPT_RT_BASE
6538 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6539 +       local_irq_restore(flags);
6540 +#else
6541 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6542 +#endif
6543 +}
6544 +
6545 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6546 +{
6547 +#ifdef CONFIG_PREEMPT_RT_BASE
6548 +       spin_lock_init(&bh->b_uptodate_lock);
6549 +#if IS_ENABLED(CONFIG_JBD2)
6550 +       spin_lock_init(&bh->b_state_lock);
6551 +       spin_lock_init(&bh->b_journal_head_lock);
6552 +#endif
6553 +#endif
6554 +}
6555 +
6556  /*
6557   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6558   * and buffer_foo() functions.
6559 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6560 index 5b17de62c962..56027cc01a56 100644
6561 --- a/include/linux/cgroup-defs.h
6562 +++ b/include/linux/cgroup-defs.h
6563 @@ -16,6 +16,7 @@
6564  #include <linux/percpu-refcount.h>
6565  #include <linux/percpu-rwsem.h>
6566  #include <linux/workqueue.h>
6567 +#include <linux/swork.h>
6568  
6569  #ifdef CONFIG_CGROUPS
6570  
6571 @@ -137,6 +138,7 @@ struct cgroup_subsys_state {
6572         /* percpu_ref killing and RCU release */
6573         struct rcu_head rcu_head;
6574         struct work_struct destroy_work;
6575 +       struct swork_event destroy_swork;
6576  };
6577  
6578  /*
6579 diff --git a/include/linux/completion.h b/include/linux/completion.h
6580 index 5d5aaae3af43..3bca1590e29f 100644
6581 --- a/include/linux/completion.h
6582 +++ b/include/linux/completion.h
6583 @@ -7,8 +7,7 @@
6584   * Atomic wait-for-completion handler data structures.
6585   * See kernel/sched/completion.c for details.
6586   */
6587 -
6588 -#include <linux/wait.h>
6589 +#include <linux/swait.h>
6590  
6591  /*
6592   * struct completion - structure used to maintain state for a "completion"
6593 @@ -24,11 +23,11 @@
6594   */
6595  struct completion {
6596         unsigned int done;
6597 -       wait_queue_head_t wait;
6598 +       struct swait_queue_head wait;
6599  };
6600  
6601  #define COMPLETION_INITIALIZER(work) \
6602 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6603 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6604  
6605  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6606         ({ init_completion(&work); work; })
6607 @@ -73,7 +72,7 @@ struct completion {
6608  static inline void init_completion(struct completion *x)
6609  {
6610         x->done = 0;
6611 -       init_waitqueue_head(&x->wait);
6612 +       init_swait_queue_head(&x->wait);
6613  }
6614  
6615  /**
6616 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6617 index e571128ad99a..5e52d28c20c1 100644
6618 --- a/include/linux/cpu.h
6619 +++ b/include/linux/cpu.h
6620 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6621  extern void put_online_cpus(void);
6622  extern void cpu_hotplug_disable(void);
6623  extern void cpu_hotplug_enable(void);
6624 +extern void pin_current_cpu(void);
6625 +extern void unpin_current_cpu(void);
6626  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6627  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6628  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6629 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6630  #define put_online_cpus()      do { } while (0)
6631  #define cpu_hotplug_disable()  do { } while (0)
6632  #define cpu_hotplug_enable()   do { } while (0)
6633 +static inline void pin_current_cpu(void) { }
6634 +static inline void unpin_current_cpu(void) { }
6635  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6636  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6637  /* These aren't inline functions due to a GCC bug. */
6638 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6639 index 5beed7b30561..61cab7ef458e 100644
6640 --- a/include/linux/dcache.h
6641 +++ b/include/linux/dcache.h
6642 @@ -11,6 +11,7 @@
6643  #include <linux/rcupdate.h>
6644  #include <linux/lockref.h>
6645  #include <linux/stringhash.h>
6646 +#include <linux/wait.h>
6647  
6648  struct path;
6649  struct vfsmount;
6650 @@ -100,7 +101,7 @@ struct dentry {
6651  
6652         union {
6653                 struct list_head d_lru;         /* LRU list */
6654 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6655 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6656         };
6657         struct list_head d_child;       /* child of parent list */
6658         struct list_head d_subdirs;     /* our children */
6659 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6660  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6661  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6662  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6663 -                                       wait_queue_head_t *);
6664 +                                       struct swait_queue_head *);
6665  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6666  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6667  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6668 diff --git a/include/linux/delay.h b/include/linux/delay.h
6669 index a6ecb34cf547..37caab306336 100644
6670 --- a/include/linux/delay.h
6671 +++ b/include/linux/delay.h
6672 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6673         msleep(seconds * 1000);
6674  }
6675  
6676 +#ifdef CONFIG_PREEMPT_RT_FULL
6677 +extern void cpu_chill(void);
6678 +#else
6679 +# define cpu_chill()   cpu_relax()
6680 +#endif
6681 +
6682  #endif /* defined(_LINUX_DELAY_H) */
6683 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6684 index bb3f3297062a..a117a33ef72c 100644
6685 --- a/include/linux/highmem.h
6686 +++ b/include/linux/highmem.h
6687 @@ -7,6 +7,7 @@
6688  #include <linux/mm.h>
6689  #include <linux/uaccess.h>
6690  #include <linux/hardirq.h>
6691 +#include <linux/sched.h>
6692  
6693  #include <asm/cacheflush.h>
6694  
6695 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6696  
6697  static inline void *kmap_atomic(struct page *page)
6698  {
6699 -       preempt_disable();
6700 +       preempt_disable_nort();
6701         pagefault_disable();
6702         return page_address(page);
6703  }
6704 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6705  static inline void __kunmap_atomic(void *addr)
6706  {
6707         pagefault_enable();
6708 -       preempt_enable();
6709 +       preempt_enable_nort();
6710  }
6711  
6712  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
6713 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
6714  
6715  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6716  
6717 +#ifndef CONFIG_PREEMPT_RT_FULL
6718  DECLARE_PER_CPU(int, __kmap_atomic_idx);
6719 +#endif
6720  
6721  static inline int kmap_atomic_idx_push(void)
6722  {
6723 +#ifndef CONFIG_PREEMPT_RT_FULL
6724         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6725  
6726 -#ifdef CONFIG_DEBUG_HIGHMEM
6727 +# ifdef CONFIG_DEBUG_HIGHMEM
6728         WARN_ON_ONCE(in_irq() && !irqs_disabled());
6729         BUG_ON(idx >= KM_TYPE_NR);
6730 -#endif
6731 +# endif
6732         return idx;
6733 +#else
6734 +       current->kmap_idx++;
6735 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
6736 +       return current->kmap_idx - 1;
6737 +#endif
6738  }
6739  
6740  static inline int kmap_atomic_idx(void)
6741  {
6742 +#ifndef CONFIG_PREEMPT_RT_FULL
6743         return __this_cpu_read(__kmap_atomic_idx) - 1;
6744 +#else
6745 +       return current->kmap_idx - 1;
6746 +#endif
6747  }
6748  
6749  static inline void kmap_atomic_idx_pop(void)
6750  {
6751 -#ifdef CONFIG_DEBUG_HIGHMEM
6752 +#ifndef CONFIG_PREEMPT_RT_FULL
6753 +# ifdef CONFIG_DEBUG_HIGHMEM
6754         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6755  
6756         BUG_ON(idx < 0);
6757 -#else
6758 +# else
6759         __this_cpu_dec(__kmap_atomic_idx);
6760 +# endif
6761 +#else
6762 +       current->kmap_idx--;
6763 +# ifdef CONFIG_DEBUG_HIGHMEM
6764 +       BUG_ON(current->kmap_idx < 0);
6765 +# endif
6766  #endif
6767  }
6768  
6769 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
6770 index 5e00f80b1535..a34e10b55cde 100644
6771 --- a/include/linux/hrtimer.h
6772 +++ b/include/linux/hrtimer.h
6773 @@ -87,6 +87,9 @@ enum hrtimer_restart {
6774   * @function:  timer expiry callback function
6775   * @base:      pointer to the timer base (per cpu and per clock)
6776   * @state:     state information (See bit values above)
6777 + * @cb_entry:  list entry to defer timers from hardirq context
6778 + * @irqsafe:   timer can run in hardirq context
6779 + * @praecox:   timer expiry time if expired at the time of programming
6780   * @is_rel:    Set if the timer was armed relative
6781   * @start_pid:  timer statistics field to store the pid of the task which
6782   *             started the timer
6783 @@ -103,6 +106,11 @@ struct hrtimer {
6784         enum hrtimer_restart            (*function)(struct hrtimer *);
6785         struct hrtimer_clock_base       *base;
6786         u8                              state;
6787 +       struct list_head                cb_entry;
6788 +       int                             irqsafe;
6789 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6790 +       ktime_t                         praecox;
6791 +#endif
6792         u8                              is_rel;
6793  #ifdef CONFIG_TIMER_STATS
6794         int                             start_pid;
6795 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
6796         struct task_struct *task;
6797  };
6798  
6799 -#ifdef CONFIG_64BIT
6800  # define HRTIMER_CLOCK_BASE_ALIGN      64
6801 -#else
6802 -# define HRTIMER_CLOCK_BASE_ALIGN      32
6803 -#endif
6804  
6805  /**
6806   * struct hrtimer_clock_base - the timer base for a specific clock
6807 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
6808   *                     timer to a base on another cpu.
6809   * @clockid:           clock id for per_cpu support
6810   * @active:            red black tree root node for the active timers
6811 + * @expired:           list head for deferred timers.
6812   * @get_time:          function to retrieve the current time of the clock
6813   * @offset:            offset of this clock to the monotonic base
6814   */
6815 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
6816         int                     index;
6817         clockid_t               clockid;
6818         struct timerqueue_head  active;
6819 +       struct list_head        expired;
6820         ktime_t                 (*get_time)(void);
6821         ktime_t                 offset;
6822  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6823 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
6824         raw_spinlock_t                  lock;
6825         seqcount_t                      seq;
6826         struct hrtimer                  *running;
6827 +       struct hrtimer                  *running_soft;
6828         unsigned int                    cpu;
6829         unsigned int                    active_bases;
6830         unsigned int                    clock_was_set_seq;
6831 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
6832         unsigned int                    nr_hangs;
6833         unsigned int                    max_hang_time;
6834  #endif
6835 +#ifdef CONFIG_PREEMPT_RT_BASE
6836 +       wait_queue_head_t               wait;
6837 +#endif
6838         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
6839  } ____cacheline_aligned;
6840  
6841 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
6842         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6843  }
6844  
6845 +/* Softirq preemption could deadlock timer removal */
6846 +#ifdef CONFIG_PREEMPT_RT_BASE
6847 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6848 +#else
6849 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6850 +#endif
6851 +
6852  /* Query timers: */
6853  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6854  
6855 @@ -436,9 +453,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
6856   * Helper function to check, whether the timer is running the callback
6857   * function
6858   */
6859 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6860 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6861  {
6862 -       return timer->base->cpu_base->running == timer;
6863 +       if (timer->base->cpu_base->running == timer)
6864 +               return 1;
6865 +#ifdef CONFIG_PREEMPT_RT_BASE
6866 +       if (timer->base->cpu_base->running_soft == timer)
6867 +               return 1;
6868 +#endif
6869 +       return 0;
6870  }
6871  
6872  /* Forward a hrtimer so it expires after now: */
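
The hrtimer.h hunks above add hrtimer_wait_for_timer() so that code which must not spin while a callback is executing (softirqs are preemptible on RT) can sleep on cpu_base->wait instead. A minimal teardown sketch under that assumption; my_timer_teardown is a hypothetical helper, only the two hrtimer calls come from the header:

#include <linux/hrtimer.h>

static void my_timer_teardown(struct hrtimer *timer)
{
        /* hrtimer_try_to_cancel() returns -1 while the callback runs */
        while (hrtimer_try_to_cancel(timer) < 0) {
                /* sleeps on cpu_base->wait on RT, cpu_relax() otherwise */
                hrtimer_wait_for_timer(timer);
        }
}
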
6873 diff --git a/include/linux/idr.h b/include/linux/idr.h
6874 index 083d61e92706..5899796f50cb 100644
6875 --- a/include/linux/idr.h
6876 +++ b/include/linux/idr.h
6877 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
6878   * Each idr_preload() should be matched with an invocation of this
6879   * function.  See idr_preload() for details.
6880   */
6881 +#ifdef CONFIG_PREEMPT_RT_FULL
6882 +void idr_preload_end(void);
6883 +#else
6884  static inline void idr_preload_end(void)
6885  {
6886         preempt_enable();
6887  }
6888 +#endif
6889  
6890  /**
6891   * idr_find - return pointer for given id
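
Under PREEMPT_RT_FULL idr_preload_end() becomes an out-of-line function (it drops a local lock instead of calling preempt_enable() directly), but the calling convention is unchanged. A usage sketch; my_idr, my_lock and my_alloc_id are hypothetical names:

#include <linux/idr.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int my_alloc_id(void *ptr)
{
        int id;

        idr_preload(GFP_KERNEL);        /* preallocate outside the lock */
        spin_lock(&my_lock);
        id = idr_alloc(&my_idr, ptr, 0, 0, GFP_NOWAIT);
        spin_unlock(&my_lock);
        idr_preload_end();              /* pairs with idr_preload() */

        return id;
}
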
6892 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
6893 index 325f649d77ff..8af70bcc799b 100644
6894 --- a/include/linux/init_task.h
6895 +++ b/include/linux/init_task.h
6896 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
6897  # define INIT_PERF_EVENTS(tsk)
6898  #endif
6899  
6900 +#ifdef CONFIG_PREEMPT_RT_BASE
6901 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
6902 +#else
6903 +# define INIT_TIMER_LIST
6904 +#endif
6905 +
6906  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6907  # define INIT_VTIME(tsk)                                               \
6908         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
6909 @@ -250,6 +256,7 @@ extern struct task_group root_task_group;
6910         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
6911         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
6912         .timer_slack_ns = 50000, /* 50 usec default slack */            \
6913 +       INIT_TIMER_LIST                                                 \
6914         .pids = {                                                       \
6915                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
6916                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
6917 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
6918 index 72f0721f75e7..480972ae47d3 100644
6919 --- a/include/linux/interrupt.h
6920 +++ b/include/linux/interrupt.h
6921 @@ -14,6 +14,7 @@
6922  #include <linux/hrtimer.h>
6923  #include <linux/kref.h>
6924  #include <linux/workqueue.h>
6925 +#include <linux/swork.h>
6926  
6927  #include <linux/atomic.h>
6928  #include <asm/ptrace.h>
6929 @@ -61,6 +62,7 @@
6930   *                interrupt handler after suspending interrupts. For system
6931   *                wakeup devices users need to implement wakeup detection in
6932   *                their interrupt handlers.
6933 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6934   */
6935  #define IRQF_SHARED            0x00000080
6936  #define IRQF_PROBE_SHARED      0x00000100
6937 @@ -74,6 +76,7 @@
6938  #define IRQF_NO_THREAD         0x00010000
6939  #define IRQF_EARLY_RESUME      0x00020000
6940  #define IRQF_COND_SUSPEND      0x00040000
6941 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
6942  
6943  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6944  
6945 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
6946  #ifdef CONFIG_LOCKDEP
6947  # define local_irq_enable_in_hardirq() do { } while (0)
6948  #else
6949 -# define local_irq_enable_in_hardirq() local_irq_enable()
6950 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6951  #endif
6952  
6953  extern void disable_irq_nosync(unsigned int irq);
6954 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
6955   * struct irq_affinity_notify - context for notification of IRQ affinity changes
6956   * @irq:               Interrupt to which notification applies
6957   * @kref:              Reference count, for internal use
6958 + * @swork:             Swork item, for internal use
6959   * @work:              Work item, for internal use
6960   * @notify:            Function to be called on change.  This will be
6961   *                     called in process context.
6962 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
6963  struct irq_affinity_notify {
6964         unsigned int irq;
6965         struct kref kref;
6966 +#ifdef CONFIG_PREEMPT_RT_BASE
6967 +       struct swork_event swork;
6968 +#else
6969         struct work_struct work;
6970 +#endif
6971         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6972         void (*release)(struct kref *ref);
6973  };
6974 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
6975                                  bool state);
6976  
6977  #ifdef CONFIG_IRQ_FORCED_THREADING
6978 +# ifndef CONFIG_PREEMPT_RT_BASE
6979  extern bool force_irqthreads;
6980 +# else
6981 +#  define force_irqthreads     (true)
6982 +# endif
6983  #else
6984 -#define force_irqthreads       (0)
6985 +#define force_irqthreads       (false)
6986  #endif
6987  
6988  #ifndef __ARCH_SET_SOFTIRQ_PENDING
6989 @@ -465,9 +477,10 @@ struct softirq_action
6990         void    (*action)(struct softirq_action *);
6991  };
6992  
6993 +#ifndef CONFIG_PREEMPT_RT_FULL
6994  asmlinkage void do_softirq(void);
6995  asmlinkage void __do_softirq(void);
6996 -
6997 +static inline void thread_do_softirq(void) { do_softirq(); }
6998  #ifdef __ARCH_HAS_DO_SOFTIRQ
6999  void do_softirq_own_stack(void);
7000  #else
7001 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
7002         __do_softirq();
7003  }
7004  #endif
7005 +#else
7006 +extern void thread_do_softirq(void);
7007 +#endif
7008  
7009  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
7010  extern void softirq_init(void);
7011  extern void __raise_softirq_irqoff(unsigned int nr);
7012 +#ifdef CONFIG_PREEMPT_RT_FULL
7013 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
7014 +#else
7015 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
7016 +{
7017 +       __raise_softirq_irqoff(nr);
7018 +}
7019 +#endif
7020  
7021  extern void raise_softirq_irqoff(unsigned int nr);
7022  extern void raise_softirq(unsigned int nr);
7023 +extern void softirq_check_pending_idle(void);
7024  
7025  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
7026  
7027 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
7028       to be executed on some cpu at least once after this.
7029     * If the tasklet is already scheduled, but its execution is still not
7030       started, it will be executed only once.
7031 -   * If this tasklet is already running on another CPU (or schedule is called
7032 -     from tasklet itself), it is rescheduled for later.
7033 +   * If this tasklet is already running on another CPU, it is rescheduled
7034 +     for later.
7035 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
7036     * Tasklet is strictly serialized wrt itself, but not
7037       wrt another tasklets. If client needs some intertask synchronization,
7038       he makes it with spinlocks.
7039 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
7040  enum
7041  {
7042         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
7043 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
7044 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
7045 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
7046  };
7047  
7048 -#ifdef CONFIG_SMP
7049 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
7050 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
7051 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
7052 +
7053 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
7054  static inline int tasklet_trylock(struct tasklet_struct *t)
7055  {
7056         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
7057  }
7058  
7059 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
7060 +{
7061 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
7062 +}
7063 +
7064  static inline void tasklet_unlock(struct tasklet_struct *t)
7065  {
7066         smp_mb__before_atomic();
7067         clear_bit(TASKLET_STATE_RUN, &(t)->state);
7068  }
7069  
7070 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
7071 -{
7072 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
7073 -}
7074 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
7075 +
7076  #else
7077  #define tasklet_trylock(t) 1
7078 +#define tasklet_tryunlock(t)   1
7079  #define tasklet_unlock_wait(t) do { } while (0)
7080  #define tasklet_unlock(t) do { } while (0)
7081  #endif
7082 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
7083         smp_mb();
7084  }
7085  
7086 -static inline void tasklet_enable(struct tasklet_struct *t)
7087 -{
7088 -       smp_mb__before_atomic();
7089 -       atomic_dec(&t->count);
7090 -}
7091 -
7092 +extern void tasklet_enable(struct tasklet_struct *t);
7093  extern void tasklet_kill(struct tasklet_struct *t);
7094  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
7095  extern void tasklet_init(struct tasklet_struct *t,
7096 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
7097         tasklet_kill(&ttimer->tasklet);
7098  }
7099  
7100 +#ifdef CONFIG_PREEMPT_RT_FULL
7101 +extern void softirq_early_init(void);
7102 +#else
7103 +static inline void softirq_early_init(void) { }
7104 +#endif
7105 +
7106  /*
7107   * Autoprobing for irqs:
7108   *
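
IRQF_NO_SOFTIRQ_CALL, added above, is a request_irq() flag: on RT it keeps the forced irq thread from processing the softirq backlog after the primary handler. A hedged sketch of how a driver might pass it; the device, handler and name are made up, and whether a real driver wants the flag depends on its latency requirements:

#include <linux/interrupt.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
        /* device-specific work would go here */
        return IRQ_HANDLED;
}

static int my_request(unsigned int irq, void *dev)
{
        return request_irq(irq, my_handler,
                           IRQF_NO_SOFTIRQ_CALL, "my-dev", dev);
}
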
7109 diff --git a/include/linux/irq.h b/include/linux/irq.h
7110 index 39e3254e5769..8ebac94fbb9f 100644
7111 --- a/include/linux/irq.h
7112 +++ b/include/linux/irq.h
7113 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
7114   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
7115   *                               it from the spurious interrupt detection
7116   *                               mechanism and from core side polling.
7117 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
7118   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
7119   */
7120  enum {
7121 @@ -99,13 +100,14 @@ enum {
7122         IRQ_PER_CPU_DEVID       = (1 << 17),
7123         IRQ_IS_POLLED           = (1 << 18),
7124         IRQ_DISABLE_UNLAZY      = (1 << 19),
7125 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
7126  };
7127  
7128  #define IRQF_MODIFY_MASK       \
7129         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
7130          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
7131          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
7132 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
7133 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
7134  
7135  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
7136  
7137 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
7138 index 47b9ebd4a74f..2543aab05daa 100644
7139 --- a/include/linux/irq_work.h
7140 +++ b/include/linux/irq_work.h
7141 @@ -16,6 +16,7 @@
7142  #define IRQ_WORK_BUSY          2UL
7143  #define IRQ_WORK_FLAGS         3UL
7144  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
7145 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
7146  
7147  struct irq_work {
7148         unsigned long flags;
7149 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
7150  static inline void irq_work_run(void) { }
7151  #endif
7152  
7153 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
7154 +void irq_work_tick_soft(void);
7155 +#else
7156 +static inline void irq_work_tick_soft(void) { }
7157 +#endif
7158 +
7159  #endif /* _LINUX_IRQ_WORK_H */
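
IRQ_WORK_HARD_IRQ marks an irq_work item that must still run from hard interrupt context on RT instead of being deferred to the softirq tick. A sketch assuming the flag can simply be preset in the initializer (the item and functions are hypothetical; the RT patch itself sets the flag at selected call sites):

#include <linux/irq_work.h>

static void my_work_fn(struct irq_work *work)
{
        /* runs in hard interrupt context even on RT; keep it short */
}

static struct irq_work my_work = {
        .flags = IRQ_WORK_HARD_IRQ,
        .func  = my_work_fn,
};

static void my_kick(void)
{
        irq_work_queue(&my_work);
}
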
7160 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
7161 index c9be57931b58..eeeb540971ae 100644
7162 --- a/include/linux/irqdesc.h
7163 +++ b/include/linux/irqdesc.h
7164 @@ -66,6 +66,7 @@ struct irq_desc {
7165         unsigned int            irqs_unhandled;
7166         atomic_t                threads_handled;
7167         int                     threads_handled_last;
7168 +       u64                     random_ip;
7169         raw_spinlock_t          lock;
7170         struct cpumask          *percpu_enabled;
7171         const struct cpumask    *percpu_affinity;
7172 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
7173 index 5dd1272d1ab2..9b77034f7c5e 100644
7174 --- a/include/linux/irqflags.h
7175 +++ b/include/linux/irqflags.h
7176 @@ -25,8 +25,6 @@
7177  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
7178  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
7179  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
7180 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
7181 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
7182  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
7183  #else
7184  # define trace_hardirqs_on()           do { } while (0)
7185 @@ -39,9 +37,15 @@
7186  # define trace_softirqs_enabled(p)     0
7187  # define trace_hardirq_enter()         do { } while (0)
7188  # define trace_hardirq_exit()          do { } while (0)
7189 +# define INIT_TRACE_IRQFLAGS
7190 +#endif
7191 +
7192 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7193 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7194 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7195 +#else
7196  # define lockdep_softirq_enter()       do { } while (0)
7197  # define lockdep_softirq_exit()                do { } while (0)
7198 -# define INIT_TRACE_IRQFLAGS
7199  #endif
7200  
7201  #if defined(CONFIG_IRQSOFF_TRACER) || \
7202 @@ -148,4 +152,23 @@
7203  
7204  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7205  
7206 +/*
7207 + * local_irq* variants depending on RT/!RT
7208 + */
7209 +#ifdef CONFIG_PREEMPT_RT_FULL
7210 +# define local_irq_disable_nort()      do { } while (0)
7211 +# define local_irq_enable_nort()       do { } while (0)
7212 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7213 +# define local_irq_restore_nort(flags) (void)(flags)
7214 +# define local_irq_disable_rt()                local_irq_disable()
7215 +# define local_irq_enable_rt()         local_irq_enable()
7216 +#else
7217 +# define local_irq_disable_nort()      local_irq_disable()
7218 +# define local_irq_enable_nort()       local_irq_enable()
7219 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7220 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7221 +# define local_irq_disable_rt()                do { } while (0)
7222 +# define local_irq_enable_rt()         do { } while (0)
7223 +#endif
7224 +
7225  #endif
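
The _nort variants above let annotated code keep disabling hard interrupts on !RT while compiling to no-ops on RT, where the caller runs in a preemptible (usually threaded) context and a real interrupt disable would only hurt latency. A minimal sketch; the function and the protected state are hypothetical:

#include <linux/irqflags.h>

static void my_touch_percpu_state(void)
{
        unsigned long flags;

        local_irq_save_nort(flags);     /* real irq-off only on !RT */
        /* ... short critical section on per-CPU state ... */
        local_irq_restore_nort(flags);
}
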
7226 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7227 index dfaa1f4dcb0c..d57dd06544a1 100644
7228 --- a/include/linux/jbd2.h
7229 +++ b/include/linux/jbd2.h
7230 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7231  
7232  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7233  {
7234 +#ifndef CONFIG_PREEMPT_RT_BASE
7235         bit_spin_lock(BH_State, &bh->b_state);
7236 +#else
7237 +       spin_lock(&bh->b_state_lock);
7238 +#endif
7239  }
7240  
7241  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7242  {
7243 +#ifndef CONFIG_PREEMPT_RT_BASE
7244         return bit_spin_trylock(BH_State, &bh->b_state);
7245 +#else
7246 +       return spin_trylock(&bh->b_state_lock);
7247 +#endif
7248  }
7249  
7250  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7251  {
7252 +#ifndef CONFIG_PREEMPT_RT_BASE
7253         return bit_spin_is_locked(BH_State, &bh->b_state);
7254 +#else
7255 +       return spin_is_locked(&bh->b_state_lock);
7256 +#endif
7257  }
7258  
7259  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7260  {
7261 +#ifndef CONFIG_PREEMPT_RT_BASE
7262         bit_spin_unlock(BH_State, &bh->b_state);
7263 +#else
7264 +       spin_unlock(&bh->b_state_lock);
7265 +#endif
7266  }
7267  
7268  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7269  {
7270 +#ifndef CONFIG_PREEMPT_RT_BASE
7271         bit_spin_lock(BH_JournalHead, &bh->b_state);
7272 +#else
7273 +       spin_lock(&bh->b_journal_head_lock);
7274 +#endif
7275  }
7276  
7277  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7278  {
7279 +#ifndef CONFIG_PREEMPT_RT_BASE
7280         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7281 +#else
7282 +       spin_unlock(&bh->b_journal_head_lock);
7283 +#endif
7284  }
7285  
7286  #define J_ASSERT(assert)       BUG_ON(!(assert))
7287 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7288 index 410decacff8f..0861bebfc188 100644
7289 --- a/include/linux/kdb.h
7290 +++ b/include/linux/kdb.h
7291 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7292  extern __printf(1, 2) int kdb_printf(const char *, ...);
7293  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7294  
7295 +#define in_kdb_printk()        (kdb_trap_printk)
7296  extern void kdb_init(int level);
7297  
7298  /* Access to kdb specific polling devices */
7299 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7300  extern int kdb_unregister(char *);
7301  #else /* ! CONFIG_KGDB_KDB */
7302  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7303 +#define in_kdb_printk() (0)
7304  static inline void kdb_init(int level) {}
7305  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7306                                char *help, short minlen) { return 0; }
7307 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7308 index bc6ed52a39b9..7894d55e4998 100644
7309 --- a/include/linux/kernel.h
7310 +++ b/include/linux/kernel.h
7311 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7312   */
7313  # define might_sleep() \
7314         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7315 +
7316 +# define might_sleep_no_state_check() \
7317 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7318  # define sched_annotate_sleep()        (current->task_state_change = 0)
7319  #else
7320    static inline void ___might_sleep(const char *file, int line,
7321 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7322    static inline void __might_sleep(const char *file, int line,
7323                                    int preempt_offset) { }
7324  # define might_sleep() do { might_resched(); } while (0)
7325 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7326  # define sched_annotate_sleep() do { } while (0)
7327  #endif
7328  
7329 @@ -488,6 +492,7 @@ extern enum system_states {
7330         SYSTEM_HALT,
7331         SYSTEM_POWER_OFF,
7332         SYSTEM_RESTART,
7333 +       SYSTEM_SUSPEND,
7334  } system_state;
7335  
7336  #define TAINT_PROPRIETARY_MODULE       0
7337 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7338 index cb483305e1f5..4e5062316bb6 100644
7339 --- a/include/linux/list_bl.h
7340 +++ b/include/linux/list_bl.h
7341 @@ -2,6 +2,7 @@
7342  #define _LINUX_LIST_BL_H
7343  
7344  #include <linux/list.h>
7345 +#include <linux/spinlock.h>
7346  #include <linux/bit_spinlock.h>
7347  
7348  /*
7349 @@ -32,13 +33,24 @@
7350  
7351  struct hlist_bl_head {
7352         struct hlist_bl_node *first;
7353 +#ifdef CONFIG_PREEMPT_RT_BASE
7354 +       raw_spinlock_t lock;
7355 +#endif
7356  };
7357  
7358  struct hlist_bl_node {
7359         struct hlist_bl_node *next, **pprev;
7360  };
7361 -#define INIT_HLIST_BL_HEAD(ptr) \
7362 -       ((ptr)->first = NULL)
7363 +
7364 +#ifdef CONFIG_PREEMPT_RT_BASE
7365 +#define INIT_HLIST_BL_HEAD(h)          \
7366 +do {                                   \
7367 +       (h)->first = NULL;              \
7368 +       raw_spin_lock_init(&(h)->lock); \
7369 +} while (0)
7370 +#else
7371 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7372 +#endif
7373  
7374  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7375  {
7376 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7377  
7378  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7379  {
7380 +#ifndef CONFIG_PREEMPT_RT_BASE
7381         bit_spin_lock(0, (unsigned long *)b);
7382 +#else
7383 +       raw_spin_lock(&b->lock);
7384 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7385 +       __set_bit(0, (unsigned long *)b);
7386 +#endif
7387 +#endif
7388  }
7389  
7390  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7391  {
7392 +#ifndef CONFIG_PREEMPT_RT_BASE
7393         __bit_spin_unlock(0, (unsigned long *)b);
7394 +#else
7395 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7396 +       __clear_bit(0, (unsigned long *)b);
7397 +#endif
7398 +       raw_spin_unlock(&b->lock);
7399 +#endif
7400  }
7401  
7402  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
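
With PREEMPT_RT_BASE the bit spinlock in hlist_bl_head is replaced by a raw spinlock, but users keep calling the same helpers. A hash-bucket insertion sketch; my_table and its size are made-up names, and on RT each bucket still needs INIT_HLIST_BL_HEAD() at init time so the embedded spinlock is initialized:

#include <linux/list_bl.h>

#define MY_TABLE_SIZE   16

static struct hlist_bl_head my_table[MY_TABLE_SIZE];

static void my_table_add(struct hlist_bl_node *node, unsigned int hash)
{
        struct hlist_bl_head *head = &my_table[hash % MY_TABLE_SIZE];

        hlist_bl_lock(head);            /* bit lock on !RT, raw spinlock on RT */
        hlist_bl_add_head(node, head);
        hlist_bl_unlock(head);
}
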
7403 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7404 new file mode 100644
7405 index 000000000000..845c77f1a5ca
7406 --- /dev/null
7407 +++ b/include/linux/locallock.h
7408 @@ -0,0 +1,278 @@
7409 +#ifndef _LINUX_LOCALLOCK_H
7410 +#define _LINUX_LOCALLOCK_H
7411 +
7412 +#include <linux/percpu.h>
7413 +#include <linux/spinlock.h>
7414 +
7415 +#ifdef CONFIG_PREEMPT_RT_BASE
7416 +
7417 +#ifdef CONFIG_DEBUG_SPINLOCK
7418 +# define LL_WARN(cond) WARN_ON(cond)
7419 +#else
7420 +# define LL_WARN(cond) do { } while (0)
7421 +#endif
7422 +
7423 +/*
7424 + * per cpu lock based substitute for local_irq_*()
7425 + */
7426 +struct local_irq_lock {
7427 +       spinlock_t              lock;
7428 +       struct task_struct      *owner;
7429 +       int                     nestcnt;
7430 +       unsigned long           flags;
7431 +};
7432 +
7433 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7434 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7435 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7436 +
7437 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7438 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7439 +
7440 +#define local_irq_lock_init(lvar)                                      \
7441 +       do {                                                            \
7442 +               int __cpu;                                              \
7443 +               for_each_possible_cpu(__cpu)                            \
7444 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7445 +       } while (0)
7446 +
7447 +/*
7448 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
7449 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
7450 + * already takes care of the migrate_disable/enable
7451 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
7452 + */
7453 +#ifdef CONFIG_PREEMPT_RT_FULL
7454 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7455 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7456 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7457 +#else
7458 +# define spin_lock_local(lock)                 spin_lock(lock)
7459 +# define spin_trylock_local(lock)              spin_trylock(lock)
7460 +# define spin_unlock_local(lock)               spin_unlock(lock)
7461 +#endif
7462 +
7463 +static inline void __local_lock(struct local_irq_lock *lv)
7464 +{
7465 +       if (lv->owner != current) {
7466 +               spin_lock_local(&lv->lock);
7467 +               LL_WARN(lv->owner);
7468 +               LL_WARN(lv->nestcnt);
7469 +               lv->owner = current;
7470 +       }
7471 +       lv->nestcnt++;
7472 +}
7473 +
7474 +#define local_lock(lvar)                                       \
7475 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7476 +
7477 +#define local_lock_on(lvar, cpu)                               \
7478 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7479 +
7480 +static inline int __local_trylock(struct local_irq_lock *lv)
7481 +{
7482 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7483 +               LL_WARN(lv->owner);
7484 +               LL_WARN(lv->nestcnt);
7485 +               lv->owner = current;
7486 +               lv->nestcnt = 1;
7487 +               return 1;
7488 +       }
7489 +       return 0;
7490 +}
7491 +
7492 +#define local_trylock(lvar)                                            \
7493 +       ({                                                              \
7494 +               int __locked;                                           \
7495 +               __locked = __local_trylock(&get_local_var(lvar));       \
7496 +               if (!__locked)                                          \
7497 +                       put_local_var(lvar);                            \
7498 +               __locked;                                               \
7499 +       })
7500 +
7501 +static inline void __local_unlock(struct local_irq_lock *lv)
7502 +{
7503 +       LL_WARN(lv->nestcnt == 0);
7504 +       LL_WARN(lv->owner != current);
7505 +       if (--lv->nestcnt)
7506 +               return;
7507 +
7508 +       lv->owner = NULL;
7509 +       spin_unlock_local(&lv->lock);
7510 +}
7511 +
7512 +#define local_unlock(lvar)                                     \
7513 +       do {                                                    \
7514 +               __local_unlock(this_cpu_ptr(&lvar));            \
7515 +               put_local_var(lvar);                            \
7516 +       } while (0)
7517 +
7518 +#define local_unlock_on(lvar, cpu)                       \
7519 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7520 +
7521 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7522 +{
7523 +       spin_lock_irqsave(&lv->lock, lv->flags);
7524 +       LL_WARN(lv->owner);
7525 +       LL_WARN(lv->nestcnt);
7526 +       lv->owner = current;
7527 +       lv->nestcnt = 1;
7528 +}
7529 +
7530 +#define local_lock_irq(lvar)                                           \
7531 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7532 +
7533 +#define local_lock_irq_on(lvar, cpu)                                   \
7534 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7535 +
7536 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7537 +{
7538 +       LL_WARN(!lv->nestcnt);
7539 +       LL_WARN(lv->owner != current);
7540 +       lv->owner = NULL;
7541 +       lv->nestcnt = 0;
7542 +       spin_unlock_irq(&lv->lock);
7543 +}
7544 +
7545 +#define local_unlock_irq(lvar)                                         \
7546 +       do {                                                            \
7547 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7548 +               put_local_var(lvar);                                    \
7549 +       } while (0)
7550 +
7551 +#define local_unlock_irq_on(lvar, cpu)                                 \
7552 +       do {                                                            \
7553 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7554 +       } while (0)
7555 +
7556 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7557 +{
7558 +       if (lv->owner != current) {
7559 +               __local_lock_irq(lv);
7560 +               return 0;
7561 +       } else {
7562 +               lv->nestcnt++;
7563 +               return 1;
7564 +       }
7565 +}
7566 +
7567 +#define local_lock_irqsave(lvar, _flags)                               \
7568 +       do {                                                            \
7569 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7570 +                       put_local_var(lvar);                            \
7571 +               _flags = __this_cpu_read(lvar.flags);                   \
7572 +       } while (0)
7573 +
7574 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7575 +       do {                                                            \
7576 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7577 +               _flags = per_cpu(lvar, cpu).flags;                      \
7578 +       } while (0)
7579 +
7580 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7581 +                                           unsigned long flags)
7582 +{
7583 +       LL_WARN(!lv->nestcnt);
7584 +       LL_WARN(lv->owner != current);
7585 +       if (--lv->nestcnt)
7586 +               return 0;
7587 +
7588 +       lv->owner = NULL;
7589 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7590 +       return 1;
7591 +}
7592 +
7593 +#define local_unlock_irqrestore(lvar, flags)                           \
7594 +       do {                                                            \
7595 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7596 +                       put_local_var(lvar);                            \
7597 +       } while (0)
7598 +
7599 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7600 +       do {                                                            \
7601 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7602 +       } while (0)
7603 +
7604 +#define local_spin_trylock_irq(lvar, lock)                             \
7605 +       ({                                                              \
7606 +               int __locked;                                           \
7607 +               local_lock_irq(lvar);                                   \
7608 +               __locked = spin_trylock(lock);                          \
7609 +               if (!__locked)                                          \
7610 +                       local_unlock_irq(lvar);                         \
7611 +               __locked;                                               \
7612 +       })
7613 +
7614 +#define local_spin_lock_irq(lvar, lock)                                        \
7615 +       do {                                                            \
7616 +               local_lock_irq(lvar);                                   \
7617 +               spin_lock(lock);                                        \
7618 +       } while (0)
7619 +
7620 +#define local_spin_unlock_irq(lvar, lock)                              \
7621 +       do {                                                            \
7622 +               spin_unlock(lock);                                      \
7623 +               local_unlock_irq(lvar);                                 \
7624 +       } while (0)
7625 +
7626 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7627 +       do {                                                            \
7628 +               local_lock_irqsave(lvar, flags);                        \
7629 +               spin_lock(lock);                                        \
7630 +       } while (0)
7631 +
7632 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7633 +       do {                                                            \
7634 +               spin_unlock(lock);                                      \
7635 +               local_unlock_irqrestore(lvar, flags);                   \
7636 +       } while (0)
7637 +
7638 +#define get_locked_var(lvar, var)                                      \
7639 +       (*({                                                            \
7640 +               local_lock(lvar);                                       \
7641 +               this_cpu_ptr(&var);                                     \
7642 +       }))
7643 +
7644 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7645 +
7646 +#define local_lock_cpu(lvar)                                           \
7647 +       ({                                                              \
7648 +               local_lock(lvar);                                       \
7649 +               smp_processor_id();                                     \
7650 +       })
7651 +
7652 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7653 +
7654 +#else /* PREEMPT_RT_BASE */
7655 +
7656 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7657 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7658 +
7659 +static inline void local_irq_lock_init(int lvar) { }
7660 +
7661 +#define local_lock(lvar)                       preempt_disable()
7662 +#define local_unlock(lvar)                     preempt_enable()
7663 +#define local_lock_irq(lvar)                   local_irq_disable()
7664 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7665 +#define local_unlock_irq(lvar)                 local_irq_enable()
7666 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7667 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7668 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7669 +
7670 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7671 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7672 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7673 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7674 +       spin_lock_irqsave(lock, flags)
7675 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7676 +       spin_unlock_irqrestore(lock, flags)
7677 +
7678 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7679 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7680 +
7681 +#define local_lock_cpu(lvar)                   get_cpu()
7682 +#define local_unlock_cpu(lvar)                 put_cpu()
7683 +
7684 +#endif
7685 +
7686 +#endif
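
locallock.h is the central substitution mechanism of this patch: a per-CPU lock that replaces local_irq_save()/preempt_disable() protection on RT and falls back to exactly those primitives on !RT. A usage sketch in the style of the converted call sites; the per-CPU list and the lock name are hypothetical:

#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/locallock.h>

static DEFINE_PER_CPU(struct list_head, my_pcpu_list);
static DEFINE_LOCAL_IRQ_LOCK(my_pcpu_lock);

static void my_add(struct list_head *entry)
{
        unsigned long flags;

        /* RT: per-CPU spinlock + migrate_disable(); !RT: local_irq_save() */
        local_lock_irqsave(my_pcpu_lock, flags);
        list_add(entry, this_cpu_ptr(&my_pcpu_list));
        local_unlock_irqrestore(my_pcpu_lock, flags);
}
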
7687 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7688 index 08d947fc4c59..705fb564a605 100644
7689 --- a/include/linux/mm_types.h
7690 +++ b/include/linux/mm_types.h
7691 @@ -11,6 +11,7 @@
7692  #include <linux/completion.h>
7693  #include <linux/cpumask.h>
7694  #include <linux/uprobes.h>
7695 +#include <linux/rcupdate.h>
7696  #include <linux/page-flags-layout.h>
7697  #include <linux/workqueue.h>
7698  #include <asm/page.h>
7699 @@ -509,6 +510,9 @@ struct mm_struct {
7700         bool tlb_flush_pending;
7701  #endif
7702         struct uprobes_state uprobes_state;
7703 +#ifdef CONFIG_PREEMPT_RT_BASE
7704 +       struct rcu_head delayed_drop;
7705 +#endif
7706  #ifdef CONFIG_X86_INTEL_MPX
7707         /* address of the bounds directory */
7708         void __user *bd_addr;
7709 diff --git a/include/linux/module.h b/include/linux/module.h
7710 index 0c3207d26ac0..5944baaa3f28 100644
7711 --- a/include/linux/module.h
7712 +++ b/include/linux/module.h
7713 @@ -496,6 +496,7 @@ static inline int module_is_live(struct module *mod)
7714  struct module *__module_text_address(unsigned long addr);
7715  struct module *__module_address(unsigned long addr);
7716  bool is_module_address(unsigned long addr);
7717 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
7718  bool is_module_percpu_address(unsigned long addr);
7719  bool is_module_text_address(unsigned long addr);
7720  
7721 @@ -663,6 +664,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
7722         return false;
7723  }
7724  
7725 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
7726 +{
7727 +       return false;
7728 +}
7729 +
7730  static inline bool is_module_text_address(unsigned long addr)
7731  {
7732         return false;
7733 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
7734 index 2cb7531e7d7a..b3fdfc820216 100644
7735 --- a/include/linux/mutex.h
7736 +++ b/include/linux/mutex.h
7737 @@ -19,6 +19,17 @@
7738  #include <asm/processor.h>
7739  #include <linux/osq_lock.h>
7740  
7741 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7742 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7743 +       , .dep_map = { .name = #lockname }
7744 +#else
7745 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7746 +#endif
7747 +
7748 +#ifdef CONFIG_PREEMPT_RT_FULL
7749 +# include <linux/mutex_rt.h>
7750 +#else
7751 +
7752  /*
7753   * Simple, straightforward mutexes with strict semantics:
7754   *
7755 @@ -99,13 +110,6 @@ do {                                                        \
7756  static inline void mutex_destroy(struct mutex *lock) {}
7757  #endif
7758  
7759 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7760 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7761 -               , .dep_map = { .name = #lockname }
7762 -#else
7763 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7764 -#endif
7765 -
7766  #define __MUTEX_INITIALIZER(lockname) \
7767                 { .count = ATOMIC_INIT(1) \
7768                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7769 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
7770  extern int mutex_trylock(struct mutex *lock);
7771  extern void mutex_unlock(struct mutex *lock);
7772  
7773 +#endif /* !PREEMPT_RT_FULL */
7774 +
7775  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7776  
7777  #endif /* __LINUX_MUTEX_H */
7778 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
7779 new file mode 100644
7780 index 000000000000..e0284edec655
7781 --- /dev/null
7782 +++ b/include/linux/mutex_rt.h
7783 @@ -0,0 +1,89 @@
7784 +#ifndef __LINUX_MUTEX_RT_H
7785 +#define __LINUX_MUTEX_RT_H
7786 +
7787 +#ifndef __LINUX_MUTEX_H
7788 +#error "Please include mutex.h"
7789 +#endif
7790 +
7791 +#include <linux/rtmutex.h>
7792 +
7793 +/* FIXME: Just for __lockfunc */
7794 +#include <linux/spinlock.h>
7795 +
7796 +struct mutex {
7797 +       struct rt_mutex         lock;
7798 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7799 +       struct lockdep_map      dep_map;
7800 +#endif
7801 +};
7802 +
7803 +#define __MUTEX_INITIALIZER(mutexname)                                 \
7804 +       {                                                               \
7805 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
7806 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
7807 +       }
7808 +
7809 +#define DEFINE_MUTEX(mutexname)                                                \
7810 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7811 +
7812 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7813 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7814 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7815 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7816 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7817 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7818 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7819 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7820 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7821 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7822 +
7823 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
7824 +#define mutex_lock(l)                  _mutex_lock(l)
7825 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
7826 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
7827 +#define mutex_trylock(l)               _mutex_trylock(l)
7828 +#define mutex_unlock(l)                        _mutex_unlock(l)
7829 +
7830 +#ifdef CONFIG_DEBUG_MUTEXES
7831 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
7832 +#else
7833 +static inline void mutex_destroy(struct mutex *lock) {}
7834 +#endif
7835 +
7836 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7837 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
7838 +# define mutex_lock_interruptible_nested(l, s) \
7839 +                                       _mutex_lock_interruptible_nested(l, s)
7840 +# define mutex_lock_killable_nested(l, s) \
7841 +                                       _mutex_lock_killable_nested(l, s)
7842 +
7843 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
7844 +do {                                                                   \
7845 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
7846 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
7847 +} while (0)
7848 +
7849 +#else
7850 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
7851 +# define mutex_lock_interruptible_nested(l, s) \
7852 +                                       _mutex_lock_interruptible(l)
7853 +# define mutex_lock_killable_nested(l, s) \
7854 +                                       _mutex_lock_killable(l)
7855 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7856 +#endif
7857 +
7858 +# define mutex_init(mutex)                             \
7859 +do {                                                   \
7860 +       static struct lock_class_key __key;             \
7861 +                                                       \
7862 +       rt_mutex_init(&(mutex)->lock);                  \
7863 +       __mutex_do_init((mutex), #mutex, &__key);       \
7864 +} while (0)
7865 +
7866 +# define __mutex_init(mutex, name, key)                        \
7867 +do {                                                   \
7868 +       rt_mutex_init(&(mutex)->lock);                  \
7869 +       __mutex_do_init((mutex), name, key);            \
7870 +} while (0)
7871 +
7872 +#endif
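
mutex_rt.h maps the whole mutex API onto rt_mutex, so ordinary callers compile unchanged; only the implementation behind mutex_lock() differs. A trivial sketch to make that point, with hypothetical names:

#include <linux/mutex.h>

static DEFINE_MUTEX(my_mutex);

static void my_critical_section(void)
{
        mutex_lock(&my_mutex);          /* rt_mutex-backed on PREEMPT_RT_FULL */
        /* ... may sleep; priority inheritance applies on RT ... */
        mutex_unlock(&my_mutex);
}
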
7873 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
7874 index bb9b102c15cd..a5b12b8ad196 100644
7875 --- a/include/linux/netdevice.h
7876 +++ b/include/linux/netdevice.h
7877 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
7878  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7879  
7880  void __napi_schedule(struct napi_struct *n);
7881 +
7882 +/*
7883 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7884 + * run as threads, and they can also be preempted (without PREEMPT_RT
7885 + * interrupt threads can not be preempted). Which means that calling
7886 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
7887 + * and can corrupt the napi->poll_list.
7888 + */
7889 +#ifdef CONFIG_PREEMPT_RT_FULL
7890 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7891 +#else
7892  void __napi_schedule_irqoff(struct napi_struct *n);
7893 +#endif
7894  
7895  static inline bool napi_disable_pending(struct napi_struct *n)
7896  {
7897 @@ -2463,14 +2475,53 @@ void netdev_freemem(struct net_device *dev);
7898  void synchronize_net(void);
7899  int init_dummy_netdev(struct net_device *dev);
7900  
7901 -DECLARE_PER_CPU(int, xmit_recursion);
7902  #define XMIT_RECURSION_LIMIT   10
7903 +#ifdef CONFIG_PREEMPT_RT_FULL
7904 +static inline int dev_recursion_level(void)
7905 +{
7906 +       return current->xmit_recursion;
7907 +}
7908 +
7909 +static inline int xmit_rec_read(void)
7910 +{
7911 +       return current->xmit_recursion;
7912 +}
7913 +
7914 +static inline void xmit_rec_inc(void)
7915 +{
7916 +       current->xmit_recursion++;
7917 +}
7918 +
7919 +static inline void xmit_rec_dec(void)
7920 +{
7921 +       current->xmit_recursion--;
7922 +}
7923 +
7924 +#else
7925 +
7926 +DECLARE_PER_CPU(int, xmit_recursion);
7927  
7928  static inline int dev_recursion_level(void)
7929  {
7930         return this_cpu_read(xmit_recursion);
7931  }
7932  
7933 +static inline int xmit_rec_read(void)
7934 +{
7935 +       return __this_cpu_read(xmit_recursion);
7936 +}
7937 +
7938 +static inline void xmit_rec_inc(void)
7939 +{
7940 +       __this_cpu_inc(xmit_recursion);
7941 +}
7942 +
7943 +static inline void xmit_rec_dec(void)
7944 +{
7945 +       __this_cpu_dec(xmit_recursion);
7946 +}
7947 +#endif
7948 +
7949  struct net_device *dev_get_by_index(struct net *net, int ifindex);
7950  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7951  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7952 @@ -2855,6 +2906,7 @@ struct softnet_data {
7953         unsigned int            dropped;
7954         struct sk_buff_head     input_pkt_queue;
7955         struct napi_struct      backlog;
7956 +       struct sk_buff_head     tofree_queue;
7957  
7958  };
7959  
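
On RT the transmit-recursion counter moves from a per-CPU variable into task_struct (current->xmit_recursion), since a preemptible xmit path can migrate between CPUs; the xmit_rec_*() helpers hide the difference. A simplified sketch of the guard pattern they serve, not the actual net/core/dev.c call site:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>

static int my_xmit_guarded(struct sk_buff *skb, struct net_device *dev)
{
        if (xmit_rec_read() > XMIT_RECURSION_LIMIT) {
                kfree_skb(skb);         /* drop instead of recursing deeper */
                return -ELOOP;
        }

        xmit_rec_inc();
        /* ... hand the skb to the device queue ... */
        xmit_rec_dec();

        return 0;
}
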
7960 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
7961 index 2ad1a2b289b5..b4d10155af54 100644
7962 --- a/include/linux/netfilter/x_tables.h
7963 +++ b/include/linux/netfilter/x_tables.h
7964 @@ -4,6 +4,7 @@
7965  
7966  #include <linux/netdevice.h>
7967  #include <linux/static_key.h>
7968 +#include <linux/locallock.h>
7969  #include <uapi/linux/netfilter/x_tables.h>
7970  
7971  /* Test a struct->invflags and a boolean for inequality */
7972 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
7973   */
7974  DECLARE_PER_CPU(seqcount_t, xt_recseq);
7975  
7976 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7977 +
7978  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7979   *
7980   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7981 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
7982  {
7983         unsigned int addend;
7984  
7985 +       /* RT protection */
7986 +       local_lock(xt_write_lock);
7987 +
7988         /*
7989          * Low order bit of sequence is set if we already
7990          * called xt_write_recseq_begin().
7991 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
7992         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7993         smp_wmb();
7994         __this_cpu_add(xt_recseq.sequence, addend);
7995 +       local_unlock(xt_write_lock);
7996  }
7997  
7998  /*
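
The xt_write_recseq_begin()/end() pair now also takes the xt_write_lock local lock, so the per-CPU seqcount update is serialized on RT without relying on preemption being disabled. The calling pattern in the packet-traversal paths stays the same; a hedged sketch:

#include <linux/bottom_half.h>
#include <linux/netfilter/x_tables.h>

static void my_walk_ruleset(void)
{
        unsigned int addend;

        local_bh_disable();
        addend = xt_write_recseq_begin();       /* takes xt_write_lock on RT */
        /* ... traverse the table entries ... */
        xt_write_recseq_end(addend);            /* drops xt_write_lock on RT */
        local_bh_enable();
}
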
7999 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
8000 index 810124b33327..d54ca43d571f 100644
8001 --- a/include/linux/nfs_fs.h
8002 +++ b/include/linux/nfs_fs.h
8003 @@ -165,7 +165,11 @@ struct nfs_inode {
8004  
8005         /* Readers: in-flight sillydelete RPC calls */
8006         /* Writers: rmdir */
8007 +#ifdef CONFIG_PREEMPT_RT_BASE
8008 +       struct semaphore        rmdir_sem;
8009 +#else
8010         struct rw_semaphore     rmdir_sem;
8011 +#endif
8012  
8013  #if IS_ENABLED(CONFIG_NFS_V4)
8014         struct nfs4_cached_acl  *nfs4_acl;
8015 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
8016 index beb1e10f446e..ebaf2e7bfe29 100644
8017 --- a/include/linux/nfs_xdr.h
8018 +++ b/include/linux/nfs_xdr.h
8019 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
8020         struct nfs_removeargs args;
8021         struct nfs_removeres res;
8022         struct dentry *dentry;
8023 -       wait_queue_head_t wq;
8024 +       struct swait_queue_head wq;
8025         struct rpc_cred *cred;
8026         struct nfs_fattr dir_attr;
8027         long timeout;
8028 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
8029 index 4149868de4e6..babe5b9bcb91 100644
8030 --- a/include/linux/notifier.h
8031 +++ b/include/linux/notifier.h
8032 @@ -6,7 +6,7 @@
8033   *
8034   *                             Alan Cox <Alan.Cox@linux.org>
8035   */
8036 - 
8037 +
8038  #ifndef _LINUX_NOTIFIER_H
8039  #define _LINUX_NOTIFIER_H
8040  #include <linux/errno.h>
8041 @@ -42,9 +42,7 @@
8042   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
8043   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
8044   * SRCU notifier chains should be used when the chain will be called very
8045 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
8046 - * chains are slightly more difficult to use because they require special
8047 - * runtime initialization.
8048 + * often but notifier_blocks will seldom be removed.
8049   */
8050  
8051  struct notifier_block;
8052 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
8053                 (name)->head = NULL;            \
8054         } while (0)
8055  
8056 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
8057 +/* srcu_notifier_heads must be cleaned up dynamically */
8058  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8059  #define srcu_cleanup_notifier_head(name)       \
8060                 cleanup_srcu_struct(&(name)->srcu);
8061 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8062                 .head = NULL }
8063  #define RAW_NOTIFIER_INIT(name)        {                               \
8064                 .head = NULL }
8065 -/* srcu_notifier_heads cannot be initialized statically */
8066 +
8067 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
8068 +       {                                                       \
8069 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
8070 +               .head = NULL,                                   \
8071 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
8072 +       }
8073  
8074  #define ATOMIC_NOTIFIER_HEAD(name)                             \
8075         struct atomic_notifier_head name =                      \
8076 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8077         struct raw_notifier_head name =                         \
8078                 RAW_NOTIFIER_INIT(name)
8079  
8080 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
8081 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
8082 +                       name##_head_srcu_array);                \
8083 +       mod struct srcu_notifier_head name =                    \
8084 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
8085 +
8086 +#define SRCU_NOTIFIER_HEAD(name)                               \
8087 +       _SRCU_NOTIFIER_HEAD(name, )
8088 +
8089 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
8090 +       _SRCU_NOTIFIER_HEAD(name, static)
8091 +
8092  #ifdef __KERNEL__
8093  
8094  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
8095 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
8096  
8097  /*
8098   *     Declared notifiers so far. I can imagine quite a few more chains
8099 - *     over time (eg laptop power reset chains, reboot chain (to clean 
8100 + *     over time (eg laptop power reset chains, reboot chain (to clean
8101   *     device units up), device [un]mount chain, module load/unload chain,
8102 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
8103 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
8104   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
8105   */
8106 - 
8107 +
8108  /* CPU notfiers are defined in include/linux/cpu.h. */
8109  
8110  /* netdevice notifiers are defined in include/linux/netdevice.h */
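
SRCU_NOTIFIER_HEAD() and SRCU_NOTIFIER_HEAD_STATIC() make static initialization of SRCU notifier chains possible (the per-CPU srcu array is defined next to the head), so srcu_init_notifier_head() no longer has to be called at runtime. A sketch with a hypothetical chain:

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);

static int my_notify(unsigned long event, void *data)
{
        return srcu_notifier_call_chain(&my_chain, event, data);
}
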
8111 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
8112 index 5b2e6159b744..ea940f451606 100644
8113 --- a/include/linux/percpu-rwsem.h
8114 +++ b/include/linux/percpu-rwsem.h
8115 @@ -4,7 +4,7 @@
8116  #include <linux/atomic.h>
8117  #include <linux/rwsem.h>
8118  #include <linux/percpu.h>
8119 -#include <linux/wait.h>
8120 +#include <linux/swait.h>
8121  #include <linux/rcu_sync.h>
8122  #include <linux/lockdep.h>
8123  
8124 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
8125         struct rcu_sync         rss;
8126         unsigned int __percpu   *read_count;
8127         struct rw_semaphore     rw_sem;
8128 -       wait_queue_head_t       writer;
8129 +       struct swait_queue_head writer;
8130         int                     readers_block;
8131  };
8132  
8133 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
8134         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
8135         .read_count = &__percpu_rwsem_rc_##name,                        \
8136         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
8137 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
8138 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
8139  }
8140  
8141  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
8142  extern void __percpu_up_read(struct percpu_rw_semaphore *);
8143  
8144 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
8145 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8146  {
8147         might_sleep();
8148  
8149 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
8150         __this_cpu_inc(*sem->read_count);
8151         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
8152                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
8153 -       barrier();
8154         /*
8155 -        * The barrier() prevents the compiler from
8156 +        * The preempt_enable() prevents the compiler from
8157          * bleeding the critical section out.
8158          */
8159 -}
8160 -
8161 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8162 -{
8163 -       percpu_down_read_preempt_disable(sem);
8164         preempt_enable();
8165  }
8166  
8167 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
8168         return ret;
8169  }
8170  
8171 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
8172 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8173  {
8174 -       /*
8175 -        * The barrier() prevents the compiler from
8176 -        * bleeding the critical section out.
8177 -        */
8178 -       barrier();
8179 +       preempt_disable();
8180         /*
8181          * Same as in percpu_down_read().
8182          */
8183 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
8184         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
8185  }
8186  
8187 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8188 -{
8189 -       preempt_disable();
8190 -       percpu_up_read_preempt_enable(sem);
8191 -}
8192 -
8193  extern void percpu_down_write(struct percpu_rw_semaphore *);
8194  extern void percpu_up_write(struct percpu_rw_semaphore *);
8195  
8196 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
8197 index 56939d3f6e53..b988bf40ad3e 100644
8198 --- a/include/linux/percpu.h
8199 +++ b/include/linux/percpu.h
8200 @@ -18,6 +18,35 @@
8201  #define PERCPU_MODULE_RESERVE          0
8202  #endif
8203  
8204 +#ifdef CONFIG_PREEMPT_RT_FULL
8205 +
8206 +#define get_local_var(var) (*({        \
8207 +       migrate_disable();      \
8208 +       this_cpu_ptr(&var);     }))
8209 +
8210 +#define put_local_var(var) do {        \
8211 +       (void)&(var);           \
8212 +       migrate_enable();       \
8213 +} while (0)
8214 +
8215 +# define get_local_ptr(var) ({ \
8216 +       migrate_disable();      \
8217 +       this_cpu_ptr(var);      })
8218 +
8219 +# define put_local_ptr(var) do {       \
8220 +       (void)(var);                    \
8221 +       migrate_enable();               \
8222 +} while (0)
8223 +
8224 +#else
8225 +
8226 +#define get_local_var(var)     get_cpu_var(var)
8227 +#define put_local_var(var)     put_cpu_var(var)
8228 +#define get_local_ptr(var)     get_cpu_ptr(var)
8229 +#define put_local_ptr(var)     put_cpu_ptr(var)
8230 +
8231 +#endif
8232 +
8233  /* minimum unit size, also is the maximum supported allocation size */
8234  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8235  
8236 @@ -110,6 +139,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
8237  #endif
8238  
8239  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
8240 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
8241  extern bool is_kernel_percpu_address(unsigned long addr);
8242  
8243  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
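
get_local_var()/put_local_var() (and the _ptr variants) added above are the RT-friendly replacement for get_cpu_var()/put_cpu_var(): on PREEMPT_RT_FULL they only disable migration, so the section stays preemptible and may sleep. A minimal sketch of the intended pattern; the per-CPU variable, struct and function names are made up:

        #include <linux/percpu.h>

        struct my_pcpu {
                int hits;
        };
        static DEFINE_PER_CPU(struct my_pcpu, my_data);

        static void bump_local_counter(void)
        {
                struct my_pcpu *p;

                p = &get_local_var(my_data);    /* migration disabled on RT */
                p->hits++;
                put_local_var(my_data);         /* migration enabled again */
        }

On !RT kernels the macros fall back to get_cpu_var()/put_cpu_var(), so the same code disables preemption instead.
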
8244 diff --git a/include/linux/pid.h b/include/linux/pid.h
8245 index 23705a53abba..2cc64b779f03 100644
8246 --- a/include/linux/pid.h
8247 +++ b/include/linux/pid.h
8248 @@ -2,6 +2,7 @@
8249  #define _LINUX_PID_H
8250  
8251  #include <linux/rcupdate.h>
8252 +#include <linux/atomic.h>
8253  
8254  enum pid_type
8255  {
8256 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8257 index 75e4e30677f1..1cfb1cb72354 100644
8258 --- a/include/linux/preempt.h
8259 +++ b/include/linux/preempt.h
8260 @@ -50,7 +50,11 @@
8261  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8262  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8263  
8264 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8265 +#ifndef CONFIG_PREEMPT_RT_FULL
8266 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8267 +#else
8268 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8269 +#endif
8270  
8271  /* We use the MSB mostly because its available */
8272  #define PREEMPT_NEED_RESCHED   0x80000000
8273 @@ -59,9 +63,15 @@
8274  #include <asm/preempt.h>
8275  
8276  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8277 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8278  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8279                                  | NMI_MASK))
8280 +#ifndef CONFIG_PREEMPT_RT_FULL
8281 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8282 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8283 +#else
8284 +# define softirq_count()       (0UL)
8285 +extern int in_serving_softirq(void);
8286 +#endif
8287  
8288  /*
8289   * Are we doing bottom half or hardware interrupt processing?
8290 @@ -72,7 +82,6 @@
8291  #define in_irq()               (hardirq_count())
8292  #define in_softirq()           (softirq_count())
8293  #define in_interrupt()         (irq_count())
8294 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8295  
8296  /*
8297   * Are we in NMI context?
8298 @@ -91,7 +100,11 @@
8299  /*
8300   * The preempt_count offset after spin_lock()
8301   */
8302 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8303  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8304 +#else
8305 +#define PREEMPT_LOCK_OFFSET    0
8306 +#endif
8307  
8308  /*
8309   * The preempt_count offset needed for things like:
8310 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
8311  #define preempt_count_inc() preempt_count_add(1)
8312  #define preempt_count_dec() preempt_count_sub(1)
8313  
8314 +#ifdef CONFIG_PREEMPT_LAZY
8315 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8316 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8317 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8318 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8319 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8320 +#else
8321 +#define add_preempt_lazy_count(val)    do { } while (0)
8322 +#define sub_preempt_lazy_count(val)    do { } while (0)
8323 +#define inc_preempt_lazy_count()       do { } while (0)
8324 +#define dec_preempt_lazy_count()       do { } while (0)
8325 +#define preempt_lazy_count()           (0)
8326 +#endif
8327 +
8328  #ifdef CONFIG_PREEMPT_COUNT
8329  
8330  #define preempt_disable() \
8331 @@ -148,13 +175,25 @@ do { \
8332         barrier(); \
8333  } while (0)
8334  
8335 +#define preempt_lazy_disable() \
8336 +do { \
8337 +       inc_preempt_lazy_count(); \
8338 +       barrier(); \
8339 +} while (0)
8340 +
8341  #define sched_preempt_enable_no_resched() \
8342  do { \
8343         barrier(); \
8344         preempt_count_dec(); \
8345  } while (0)
8346  
8347 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8348 +#ifdef CONFIG_PREEMPT_RT_BASE
8349 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8350 +# define preempt_check_resched_rt() preempt_check_resched()
8351 +#else
8352 +# define preempt_enable_no_resched() preempt_enable()
8353 +# define preempt_check_resched_rt() barrier();
8354 +#endif
8355  
8356  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8357  
8358 @@ -179,6 +218,13 @@ do { \
8359                 __preempt_schedule(); \
8360  } while (0)
8361  
8362 +#define preempt_lazy_enable() \
8363 +do { \
8364 +       dec_preempt_lazy_count(); \
8365 +       barrier(); \
8366 +       preempt_check_resched(); \
8367 +} while (0)
8368 +
8369  #else /* !CONFIG_PREEMPT */
8370  #define preempt_enable() \
8371  do { \
8372 @@ -224,6 +270,7 @@ do { \
8373  #define preempt_disable_notrace()              barrier()
8374  #define preempt_enable_no_resched_notrace()    barrier()
8375  #define preempt_enable_notrace()               barrier()
8376 +#define preempt_check_resched_rt()             barrier()
8377  #define preemptible()                          0
8378  
8379  #endif /* CONFIG_PREEMPT_COUNT */
8380 @@ -244,10 +291,31 @@ do { \
8381  } while (0)
8382  #define preempt_fold_need_resched() \
8383  do { \
8384 -       if (tif_need_resched()) \
8385 +       if (tif_need_resched_now()) \
8386                 set_preempt_need_resched(); \
8387  } while (0)
8388  
8389 +#ifdef CONFIG_PREEMPT_RT_FULL
8390 +# define preempt_disable_rt()          preempt_disable()
8391 +# define preempt_enable_rt()           preempt_enable()
8392 +# define preempt_disable_nort()                barrier()
8393 +# define preempt_enable_nort()         barrier()
8394 +# ifdef CONFIG_SMP
8395 +   extern void migrate_disable(void);
8396 +   extern void migrate_enable(void);
8397 +# else /* CONFIG_SMP */
8398 +#  define migrate_disable()            barrier()
8399 +#  define migrate_enable()             barrier()
8400 +# endif /* CONFIG_SMP */
8401 +#else
8402 +# define preempt_disable_rt()          barrier()
8403 +# define preempt_enable_rt()           barrier()
8404 +# define preempt_disable_nort()                preempt_disable()
8405 +# define preempt_enable_nort()         preempt_enable()
8406 +# define migrate_disable()             preempt_disable()
8407 +# define migrate_enable()              preempt_enable()
8408 +#endif
8409 +
8410  #ifdef CONFIG_PREEMPT_NOTIFIERS
8411  
8412  struct preempt_notifier;
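
The preempt.h changes introduce migrate_disable()/migrate_enable(), which pin a task to its current CPU without disabling preemption; on !RT they simply map to preempt_disable()/preempt_enable(). A short sketch of the intended use (the function is illustrative, and "may sleep" applies to the RT case only):

        #include <linux/preempt.h>
        #include <linux/smp.h>

        static void stay_on_this_cpu(void)
        {
                int cpu;

                migrate_disable();
                cpu = smp_processor_id();   /* stable until migrate_enable() */
                /* per-CPU work; on PREEMPT_RT_FULL this section may sleep */
                migrate_enable();
        }
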
8413 diff --git a/include/linux/printk.h b/include/linux/printk.h
8414 index eac1af8502bb..37e647af0b0b 100644
8415 --- a/include/linux/printk.h
8416 +++ b/include/linux/printk.h
8417 @@ -126,9 +126,11 @@ struct va_format {
8418  #ifdef CONFIG_EARLY_PRINTK
8419  extern asmlinkage __printf(1, 2)
8420  void early_printk(const char *fmt, ...);
8421 +extern void printk_kill(void);
8422  #else
8423  static inline __printf(1, 2) __cold
8424  void early_printk(const char *s, ...) { }
8425 +static inline void printk_kill(void) { }
8426  #endif
8427  
8428  #ifdef CONFIG_PRINTK_NMI
8429 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8430 index af3581b8a451..277295039c8f 100644
8431 --- a/include/linux/radix-tree.h
8432 +++ b/include/linux/radix-tree.h
8433 @@ -292,6 +292,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8434  int radix_tree_preload(gfp_t gfp_mask);
8435  int radix_tree_maybe_preload(gfp_t gfp_mask);
8436  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8437 +void radix_tree_preload_end(void);
8438 +
8439  void radix_tree_init(void);
8440  void *radix_tree_tag_set(struct radix_tree_root *root,
8441                         unsigned long index, unsigned int tag);
8442 @@ -314,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
8443  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8444  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8445  
8446 -static inline void radix_tree_preload_end(void)
8447 -{
8448 -       preempt_enable();
8449 -}
8450 -
8451  /**
8452   * struct radix_tree_iter - radix tree iterator state
8453   *
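
radix_tree_preload_end() is turned from a static inline (a bare preempt_enable()) into an out-of-line function, presumably so the RT implementation can do more than just re-enable preemption. The caller-side preload pattern is unchanged; a sketch with illustrative names (my_tree, my_lock, insert_item):

        #include <linux/radix-tree.h>
        #include <linux/spinlock.h>

        static RADIX_TREE(my_tree, GFP_ATOMIC);
        static DEFINE_SPINLOCK(my_lock);

        static int insert_item(unsigned long index, void *item)
        {
                int err;

                err = radix_tree_preload(GFP_KERNEL);   /* may sleep */
                if (err)
                        return err;

                spin_lock(&my_lock);
                err = radix_tree_insert(&my_tree, index, item);
                spin_unlock(&my_lock);

                radix_tree_preload_end();               /* drop the preload */
                return err;
        }
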
8454 diff --git a/include/linux/random.h b/include/linux/random.h
8455 index 7bd2403e4fef..b2df7148a42b 100644
8456 --- a/include/linux/random.h
8457 +++ b/include/linux/random.h
8458 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8459  
8460  extern void add_input_randomness(unsigned int type, unsigned int code,
8461                                  unsigned int value) __latent_entropy;
8462 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8463 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8464  
8465  extern void get_random_bytes(void *buf, int nbytes);
8466  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8467 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8468 index e585018498d5..25c64474fc27 100644
8469 --- a/include/linux/rbtree.h
8470 +++ b/include/linux/rbtree.h
8471 @@ -31,7 +31,7 @@
8472  
8473  #include <linux/kernel.h>
8474  #include <linux/stddef.h>
8475 -#include <linux/rcupdate.h>
8476 +#include <linux/rcu_assign_pointer.h>
8477  
8478  struct rb_node {
8479         unsigned long  __rb_parent_color;
8480 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8481 index d076183e49be..36bfb4dd57ae 100644
8482 --- a/include/linux/rbtree_augmented.h
8483 +++ b/include/linux/rbtree_augmented.h
8484 @@ -26,6 +26,7 @@
8485  
8486  #include <linux/compiler.h>
8487  #include <linux/rbtree.h>
8488 +#include <linux/rcupdate.h>
8489  
8490  /*
8491   * Please note - only struct rb_augment_callbacks and the prototypes for
8492 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8493 new file mode 100644
8494 index 000000000000..7066962a4379
8495 --- /dev/null
8496 +++ b/include/linux/rcu_assign_pointer.h
8497 @@ -0,0 +1,54 @@
8498 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8499 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8500 +#include <linux/compiler.h>
8501 +#include <asm/barrier.h>
8502 +
8503 +/**
8504 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8505 + * @v: The value to statically initialize with.
8506 + */
8507 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8508 +
8509 +/**
8510 + * rcu_assign_pointer() - assign to RCU-protected pointer
8511 + * @p: pointer to assign to
8512 + * @v: value to assign (publish)
8513 + *
8514 + * Assigns the specified value to the specified RCU-protected
8515 + * pointer, ensuring that any concurrent RCU readers will see
8516 + * any prior initialization.
8517 + *
8518 + * Inserts memory barriers on architectures that require them
8519 + * (which is most of them), and also prevents the compiler from
8520 + * reordering the code that initializes the structure after the pointer
8521 + * assignment.  More importantly, this call documents which pointers
8522 + * will be dereferenced by RCU read-side code.
8523 + *
8524 + * In some special cases, you may use RCU_INIT_POINTER() instead
8525 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8526 + * to the fact that it does not constrain either the CPU or the compiler.
8527 + * That said, using RCU_INIT_POINTER() when you should have used
8528 + * rcu_assign_pointer() is a very bad thing that results in
8529 + * impossible-to-diagnose memory corruption.  So please be careful.
8530 + * See the RCU_INIT_POINTER() comment header for details.
8531 + *
8532 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8533 + * once, appearances notwithstanding.  One of the "extra" evaluations
8534 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8535 + * neither of which actually execute the argument.  As with most cpp
8536 + * macros, this execute-arguments-only-once property is important, so
8537 + * please be careful when making changes to rcu_assign_pointer() and the
8538 + * other macros that it invokes.
8539 + */
8540 +#define rcu_assign_pointer(p, v)                                             \
8541 +({                                                                           \
8542 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8543 +                                                                             \
8544 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8545 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8546 +       else                                                                  \
8547 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8548 +       _r_a_p__v;                                                            \
8549 +})
8550 +
8551 +#endif
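
The kernel-doc above is moved verbatim from rcupdate.h into this small header so that rbtree.h can get rcu_assign_pointer() without pulling in all of RCU. The usual publish/read pairing, with illustrative struct and variable names (old-pointer reclamation, e.g. via kfree_rcu(), is omitted for brevity):

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct cfg {
                int threshold;
        };
        static struct cfg __rcu *cur_cfg;

        static int publish_cfg(int threshold)
        {
                struct cfg *new = kmalloc(sizeof(*new), GFP_KERNEL);

                if (!new)
                        return -ENOMEM;
                new->threshold = threshold;
                rcu_assign_pointer(cur_cfg, new);   /* readers now see it */
                return 0;
        }

        static int read_threshold(void)
        {
                struct cfg *c;
                int val = 0;

                rcu_read_lock();
                c = rcu_dereference(cur_cfg);
                if (c)
                        val = c->threshold;
                rcu_read_unlock();
                return val;
        }
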
8552 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8553 index 01f71e1d2e94..30cc001d0d5a 100644
8554 --- a/include/linux/rcupdate.h
8555 +++ b/include/linux/rcupdate.h
8556 @@ -46,6 +46,7 @@
8557  #include <linux/compiler.h>
8558  #include <linux/ktime.h>
8559  #include <linux/irqflags.h>
8560 +#include <linux/rcu_assign_pointer.h>
8561  
8562  #include <asm/barrier.h>
8563  
8564 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8565  
8566  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8567  
8568 +#ifdef CONFIG_PREEMPT_RT_FULL
8569 +#define call_rcu_bh    call_rcu
8570 +#else
8571  /**
8572   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8573   * @head: structure to be used for queueing the RCU updates.
8574 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8575   */
8576  void call_rcu_bh(struct rcu_head *head,
8577                  rcu_callback_t func);
8578 +#endif
8579  
8580  /**
8581   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8582 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8583   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8584   */
8585  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8586 +#ifndef CONFIG_PREEMPT_RT_FULL
8587 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8588 +#else
8589 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8590 +#endif
8591  
8592  #else /* #ifdef CONFIG_PREEMPT_RCU */
8593  
8594 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8595         return 0;
8596  }
8597  
8598 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8599 +
8600  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8601  
8602  /* Internal to kernel */
8603 @@ -505,7 +517,14 @@ extern struct lockdep_map rcu_callback_map;
8604  int debug_lockdep_rcu_enabled(void);
8605  
8606  int rcu_read_lock_held(void);
8607 +#ifdef CONFIG_PREEMPT_RT_FULL
8608 +static inline int rcu_read_lock_bh_held(void)
8609 +{
8610 +       return rcu_read_lock_held();
8611 +}
8612 +#else
8613  int rcu_read_lock_bh_held(void);
8614 +#endif
8615  
8616  /**
8617   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8618 @@ -626,54 +645,6 @@ static inline void rcu_preempt_sleep_check(void)
8619  })
8620  
8621  /**
8622 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8623 - * @v: The value to statically initialize with.
8624 - */
8625 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8626 -
8627 -/**
8628 - * rcu_assign_pointer() - assign to RCU-protected pointer
8629 - * @p: pointer to assign to
8630 - * @v: value to assign (publish)
8631 - *
8632 - * Assigns the specified value to the specified RCU-protected
8633 - * pointer, ensuring that any concurrent RCU readers will see
8634 - * any prior initialization.
8635 - *
8636 - * Inserts memory barriers on architectures that require them
8637 - * (which is most of them), and also prevents the compiler from
8638 - * reordering the code that initializes the structure after the pointer
8639 - * assignment.  More importantly, this call documents which pointers
8640 - * will be dereferenced by RCU read-side code.
8641 - *
8642 - * In some special cases, you may use RCU_INIT_POINTER() instead
8643 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8644 - * to the fact that it does not constrain either the CPU or the compiler.
8645 - * That said, using RCU_INIT_POINTER() when you should have used
8646 - * rcu_assign_pointer() is a very bad thing that results in
8647 - * impossible-to-diagnose memory corruption.  So please be careful.
8648 - * See the RCU_INIT_POINTER() comment header for details.
8649 - *
8650 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8651 - * once, appearances notwithstanding.  One of the "extra" evaluations
8652 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8653 - * neither of which actually execute the argument.  As with most cpp
8654 - * macros, this execute-arguments-only-once property is important, so
8655 - * please be careful when making changes to rcu_assign_pointer() and the
8656 - * other macros that it invokes.
8657 - */
8658 -#define rcu_assign_pointer(p, v)                                             \
8659 -({                                                                           \
8660 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8661 -                                                                             \
8662 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8663 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8664 -       else                                                                  \
8665 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8666 -       _r_a_p__v;                                                            \
8667 -})
8668 -
8669 -/**
8670   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8671   * @p: The pointer to read
8672   *
8673 @@ -951,10 +922,14 @@ static inline void rcu_read_unlock(void)
8674  static inline void rcu_read_lock_bh(void)
8675  {
8676         local_bh_disable();
8677 +#ifdef CONFIG_PREEMPT_RT_FULL
8678 +       rcu_read_lock();
8679 +#else
8680         __acquire(RCU_BH);
8681         rcu_lock_acquire(&rcu_bh_lock_map);
8682         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8683                          "rcu_read_lock_bh() used illegally while idle");
8684 +#endif
8685  }
8686  
8687  /*
8688 @@ -964,10 +939,14 @@ static inline void rcu_read_lock_bh(void)
8689   */
8690  static inline void rcu_read_unlock_bh(void)
8691  {
8692 +#ifdef CONFIG_PREEMPT_RT_FULL
8693 +       rcu_read_unlock();
8694 +#else
8695         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8696                          "rcu_read_unlock_bh() used illegally while idle");
8697         rcu_lock_release(&rcu_bh_lock_map);
8698         __release(RCU_BH);
8699 +#endif
8700         local_bh_enable();
8701  }
8702  
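
On PREEMPT_RT_FULL, rcu_read_lock_bh() degenerates to local_bh_disable() plus an ordinary rcu_read_lock(), and call_rcu_bh() is aliased to call_rcu(), so existing _bh users keep working unchanged. A sketch of such a user; the list and lookup function are illustrative:

        #include <linux/rcupdate.h>
        #include <linux/rculist.h>

        struct entry {
                struct hlist_node node;
                int key;
        };
        static HLIST_HEAD(entries);     /* also updated from softirq context */

        static bool lookup_bh(int key)
        {
                struct entry *e;
                bool found = false;

                rcu_read_lock_bh();     /* RT: local_bh_disable() + rcu_read_lock() */
                hlist_for_each_entry_rcu(e, &entries, node) {
                        if (e->key == key) {
                                found = true;
                                break;
                        }
                }
                rcu_read_unlock_bh();
                return found;
        }
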
8703 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
8704 index 63a4e4cf40a5..08ab12df2863 100644
8705 --- a/include/linux/rcutree.h
8706 +++ b/include/linux/rcutree.h
8707 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
8708         rcu_note_context_switch();
8709  }
8710  
8711 +#ifdef CONFIG_PREEMPT_RT_FULL
8712 +# define synchronize_rcu_bh    synchronize_rcu
8713 +#else
8714  void synchronize_rcu_bh(void);
8715 +#endif
8716  void synchronize_sched_expedited(void);
8717  void synchronize_rcu_expedited(void);
8718  
8719 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
8720  }
8721  
8722  void rcu_barrier(void);
8723 +#ifdef CONFIG_PREEMPT_RT_FULL
8724 +# define rcu_barrier_bh                rcu_barrier
8725 +#else
8726  void rcu_barrier_bh(void);
8727 +#endif
8728  void rcu_barrier_sched(void);
8729  unsigned long get_state_synchronize_rcu(void);
8730  void cond_synchronize_rcu(unsigned long oldstate);
8731 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
8732  extern unsigned long rcutorture_testseq;
8733  extern unsigned long rcutorture_vernum;
8734  unsigned long rcu_batches_started(void);
8735 -unsigned long rcu_batches_started_bh(void);
8736  unsigned long rcu_batches_started_sched(void);
8737  unsigned long rcu_batches_completed(void);
8738 -unsigned long rcu_batches_completed_bh(void);
8739  unsigned long rcu_batches_completed_sched(void);
8740  unsigned long rcu_exp_batches_completed(void);
8741  unsigned long rcu_exp_batches_completed_sched(void);
8742  void show_rcu_gp_kthreads(void);
8743  
8744  void rcu_force_quiescent_state(void);
8745 -void rcu_bh_force_quiescent_state(void);
8746  void rcu_sched_force_quiescent_state(void);
8747  
8748  void rcu_idle_enter(void);
8749 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
8750  
8751  bool rcu_is_watching(void);
8752  
8753 +#ifndef CONFIG_PREEMPT_RT_FULL
8754 +void rcu_bh_force_quiescent_state(void);
8755 +unsigned long rcu_batches_started_bh(void);
8756 +unsigned long rcu_batches_completed_bh(void);
8757 +#else
8758 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
8759 +# define rcu_batches_completed_bh      rcu_batches_completed
8760 +# define rcu_batches_started_bh                rcu_batches_completed
8761 +#endif
8762 +
8763  void rcu_all_qs(void);
8764  
8765  /* RCUtree hotplug events */
8766 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
8767 index 1abba5ce2a2f..294a8b4875f1 100644
8768 --- a/include/linux/rtmutex.h
8769 +++ b/include/linux/rtmutex.h
8770 @@ -13,11 +13,15 @@
8771  #define __LINUX_RT_MUTEX_H
8772  
8773  #include <linux/linkage.h>
8774 +#include <linux/spinlock_types_raw.h>
8775  #include <linux/rbtree.h>
8776 -#include <linux/spinlock_types.h>
8777  
8778  extern int max_lock_depth; /* for sysctl */
8779  
8780 +#ifdef CONFIG_DEBUG_MUTEXES
8781 +#include <linux/debug_locks.h>
8782 +#endif
8783 +
8784  /**
8785   * The rt_mutex structure
8786   *
8787 @@ -31,8 +35,8 @@ struct rt_mutex {
8788         struct rb_root          waiters;
8789         struct rb_node          *waiters_leftmost;
8790         struct task_struct      *owner;
8791 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8792         int                     save_state;
8793 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8794         const char              *name, *file;
8795         int                     line;
8796         void                    *magic;
8797 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
8798  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
8799  #endif
8800  
8801 +# define rt_mutex_init(mutex)                                  \
8802 +       do {                                                    \
8803 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
8804 +               __rt_mutex_init(mutex, #mutex);                 \
8805 +       } while (0)
8806 +
8807  #ifdef CONFIG_DEBUG_RT_MUTEXES
8808  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8809         , .name = #mutexname, .file = __FILE__, .line = __LINE__
8810 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
8811   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8812  #else
8813  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8814 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
8815  # define rt_mutex_debug_task_free(t)                   do { } while (0)
8816  #endif
8817  
8818 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8819 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8820 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8821 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8822         , .waiters = RB_ROOT \
8823         , .owner = NULL \
8824 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8825 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8826 +
8827 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8828 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8829 +
8830 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8831 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
8832 +       , .save_state = 1 }
8833  
8834  #define DEFINE_RT_MUTEX(mutexname) \
8835         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8836 @@ -90,7 +105,9 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
8837  extern void rt_mutex_destroy(struct rt_mutex *lock);
8838  
8839  extern void rt_mutex_lock(struct rt_mutex *lock);
8840 +extern int rt_mutex_lock_state(struct rt_mutex *lock, int state);
8841  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8842 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8843  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8844                                struct hrtimer_sleeper *timeout);
8845  
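
rt_mutex_init() now always initializes the embedded wait_lock itself (not only in the debug configuration), and rt_mutex_lock_state()/rt_mutex_lock_killable() are added for the RT locking core. Basic caller-side usage is unchanged; a sketch with an illustrative mutex name, assuming the killable variant follows the mutex_lock_killable() return convention:

        #include <linux/rtmutex.h>

        static DEFINE_RT_MUTEX(my_rtm);

        static int do_work_killable(void)
        {
                int ret;

                /* Assumed: 0 on success, non-zero if a fatal signal arrived. */
                ret = rt_mutex_lock_killable(&my_rtm);
                if (ret)
                        return ret;
                /* PI-boosted critical section */
                rt_mutex_unlock(&my_rtm);
                return 0;
        }
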
8846 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
8847 new file mode 100644
8848 index 000000000000..49ed2d45d3be
8849 --- /dev/null
8850 +++ b/include/linux/rwlock_rt.h
8851 @@ -0,0 +1,99 @@
8852 +#ifndef __LINUX_RWLOCK_RT_H
8853 +#define __LINUX_RWLOCK_RT_H
8854 +
8855 +#ifndef __LINUX_SPINLOCK_H
8856 +#error Do not include directly. Use spinlock.h
8857 +#endif
8858 +
8859 +#define rwlock_init(rwl)                               \
8860 +do {                                                   \
8861 +       static struct lock_class_key __key;             \
8862 +                                                       \
8863 +       rt_mutex_init(&(rwl)->lock);                    \
8864 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
8865 +} while (0)
8866 +
8867 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8868 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8869 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8870 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8871 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8872 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8873 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8874 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8875 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8876 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8877 +
8878 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
8879 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
8880 +
8881 +#define write_trylock_irqsave(lock, flags)     \
8882 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8883 +
8884 +#define read_lock_irqsave(lock, flags)                 \
8885 +       do {                                            \
8886 +               typecheck(unsigned long, flags);        \
8887 +               flags = rt_read_lock_irqsave(lock);     \
8888 +       } while (0)
8889 +
8890 +#define write_lock_irqsave(lock, flags)                        \
8891 +       do {                                            \
8892 +               typecheck(unsigned long, flags);        \
8893 +               flags = rt_write_lock_irqsave(lock);    \
8894 +       } while (0)
8895 +
8896 +#define read_lock(lock)                rt_read_lock(lock)
8897 +
8898 +#define read_lock_bh(lock)                             \
8899 +       do {                                            \
8900 +               local_bh_disable();                     \
8901 +               rt_read_lock(lock);                     \
8902 +       } while (0)
8903 +
8904 +#define read_lock_irq(lock)    read_lock(lock)
8905 +
8906 +#define write_lock(lock)       rt_write_lock(lock)
8907 +
8908 +#define write_lock_bh(lock)                            \
8909 +       do {                                            \
8910 +               local_bh_disable();                     \
8911 +               rt_write_lock(lock);                    \
8912 +       } while (0)
8913 +
8914 +#define write_lock_irq(lock)   write_lock(lock)
8915 +
8916 +#define read_unlock(lock)      rt_read_unlock(lock)
8917 +
8918 +#define read_unlock_bh(lock)                           \
8919 +       do {                                            \
8920 +               rt_read_unlock(lock);                   \
8921 +               local_bh_enable();                      \
8922 +       } while (0)
8923 +
8924 +#define read_unlock_irq(lock)  read_unlock(lock)
8925 +
8926 +#define write_unlock(lock)     rt_write_unlock(lock)
8927 +
8928 +#define write_unlock_bh(lock)                          \
8929 +       do {                                            \
8930 +               rt_write_unlock(lock);                  \
8931 +               local_bh_enable();                      \
8932 +       } while (0)
8933 +
8934 +#define write_unlock_irq(lock) write_unlock(lock)
8935 +
8936 +#define read_unlock_irqrestore(lock, flags)            \
8937 +       do {                                            \
8938 +               typecheck(unsigned long, flags);        \
8939 +               (void) flags;                           \
8940 +               rt_read_unlock(lock);                   \
8941 +       } while (0)
8942 +
8943 +#define write_unlock_irqrestore(lock, flags) \
8944 +       do {                                            \
8945 +               typecheck(unsigned long, flags);        \
8946 +               (void) flags;                           \
8947 +               rt_write_unlock(lock);                  \
8948 +       } while (0)
8949 +
8950 +#endif
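
On RT an rwlock_t becomes an rtmutex-based lock with single-reader recursion, and the _irqsave variants no longer actually disable interrupts because the lock may sleep. The caller-visible API stays the usual one; a sketch with illustrative lock and data names:

        #include <linux/spinlock.h>     /* pulls in rwlock_rt.h on RT */

        static DEFINE_RWLOCK(my_rwlock);
        static int shared_value;

        static int read_value(void)
        {
                int v;

                read_lock(&my_rwlock);
                v = shared_value;
                read_unlock(&my_rwlock);
                return v;
        }

        static void write_value(int v)
        {
                unsigned long flags;

                /* On RT, flags is only kept for API compatibility. */
                write_lock_irqsave(&my_rwlock, flags);
                shared_value = v;
                write_unlock_irqrestore(&my_rwlock, flags);
        }
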
8951 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
8952 index cc0072e93e36..5317cd957292 100644
8953 --- a/include/linux/rwlock_types.h
8954 +++ b/include/linux/rwlock_types.h
8955 @@ -1,6 +1,10 @@
8956  #ifndef __LINUX_RWLOCK_TYPES_H
8957  #define __LINUX_RWLOCK_TYPES_H
8958  
8959 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8960 +# error "Do not include directly, include spinlock_types.h"
8961 +#endif
8962 +
8963  /*
8964   * include/linux/rwlock_types.h - generic rwlock type definitions
8965   *                               and initializers
8966 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
8967 new file mode 100644
8968 index 000000000000..51b28d775fe1
8969 --- /dev/null
8970 +++ b/include/linux/rwlock_types_rt.h
8971 @@ -0,0 +1,33 @@
8972 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8973 +#define __LINUX_RWLOCK_TYPES_RT_H
8974 +
8975 +#ifndef __LINUX_SPINLOCK_TYPES_H
8976 +#error "Do not include directly. Include spinlock_types.h instead"
8977 +#endif
8978 +
8979 +/*
8980 + * rwlocks - rtmutex which allows single reader recursion
8981 + */
8982 +typedef struct {
8983 +       struct rt_mutex         lock;
8984 +       int                     read_depth;
8985 +       unsigned int            break_lock;
8986 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8987 +       struct lockdep_map      dep_map;
8988 +#endif
8989 +} rwlock_t;
8990 +
8991 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8992 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
8993 +#else
8994 +# define RW_DEP_MAP_INIT(lockname)
8995 +#endif
8996 +
8997 +#define __RW_LOCK_UNLOCKED(name) \
8998 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8999 +         RW_DEP_MAP_INIT(name) }
9000 +
9001 +#define DEFINE_RWLOCK(name) \
9002 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
9003 +
9004 +#endif
9005 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
9006 index dd1d14250340..aa2ac1f65c2d 100644
9007 --- a/include/linux/rwsem.h
9008 +++ b/include/linux/rwsem.h
9009 @@ -19,6 +19,10 @@
9010  #include <linux/osq_lock.h>
9011  #endif
9012  
9013 +#ifdef CONFIG_PREEMPT_RT_FULL
9014 +#include <linux/rwsem_rt.h>
9015 +#else /* PREEMPT_RT_FULL */
9016 +
9017  struct rw_semaphore;
9018  
9019  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
9020 @@ -106,6 +110,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
9021         return !list_empty(&sem->wait_list);
9022  }
9023  
9024 +#endif /* !PREEMPT_RT_FULL */
9025 +
9026 +/*
9027 + * The functions below are the same for all rwsem implementations including
9028 + * the RT specific variant.
9029 + */
9030 +
9031  /*
9032   * lock for reading
9033   */
9034 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
9035 new file mode 100644
9036 index 000000000000..2ffbf093ae92
9037 --- /dev/null
9038 +++ b/include/linux/rwsem_rt.h
9039 @@ -0,0 +1,67 @@
9040 +#ifndef _LINUX_RWSEM_RT_H
9041 +#define _LINUX_RWSEM_RT_H
9042 +
9043 +#ifndef _LINUX_RWSEM_H
9044 +#error "Include rwsem.h"
9045 +#endif
9046 +
9047 +#include <linux/rtmutex.h>
9048 +#include <linux/swait.h>
9049 +
9050 +#define READER_BIAS            (1U << 31)
9051 +#define WRITER_BIAS            (1U << 30)
9052 +
9053 +struct rw_semaphore {
9054 +       atomic_t                readers;
9055 +       struct rt_mutex         rtmutex;
9056 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9057 +       struct lockdep_map      dep_map;
9058 +#endif
9059 +};
9060 +
9061 +#define __RWSEM_INITIALIZER(name)                              \
9062 +{                                                              \
9063 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
9064 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
9065 +       RW_DEP_MAP_INIT(name)                                   \
9066 +}
9067 +
9068 +#define DECLARE_RWSEM(lockname) \
9069 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
9070 +
9071 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
9072 +                         struct lock_class_key *key);
9073 +
9074 +#define __init_rwsem(sem, name, key)                   \
9075 +do {                                                   \
9076 +               rt_mutex_init(&(sem)->rtmutex);         \
9077 +               __rwsem_init((sem), (name), (key));     \
9078 +} while (0)
9079 +
9080 +#define init_rwsem(sem)                                        \
9081 +do {                                                   \
9082 +       static struct lock_class_key __key;             \
9083 +                                                       \
9084 +       __init_rwsem((sem), #sem, &__key);              \
9085 +} while (0)
9086 +
9087 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
9088 +{
9089 +       return atomic_read(&sem->readers) != READER_BIAS;
9090 +}
9091 +
9092 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
9093 +{
9094 +       return atomic_read(&sem->readers) > 0;
9095 +}
9096 +
9097 +extern void __down_read(struct rw_semaphore *sem);
9098 +extern int __down_read_trylock(struct rw_semaphore *sem);
9099 +extern void __down_write(struct rw_semaphore *sem);
9100 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
9101 +extern int __down_write_trylock(struct rw_semaphore *sem);
9102 +extern void __up_read(struct rw_semaphore *sem);
9103 +extern void __up_write(struct rw_semaphore *sem);
9104 +extern void __downgrade_write(struct rw_semaphore *sem);
9105 +
9106 +#endif
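
The RT rw_semaphore is rebuilt on an rtmutex plus a reader-bias counter, and the __down_*/__up_* primitives backing the generic entry points are provided out of line. Caller usage through the common rwsem API is unchanged, e.g. (names illustrative):

        #include <linux/rwsem.h>

        static DECLARE_RWSEM(my_rwsem);

        static void read_path(void)
        {
                down_read(&my_rwsem);
                /* shared section; may block */
                up_read(&my_rwsem);
        }

        static void write_path(void)
        {
                down_write(&my_rwsem);
                /* exclusive section */
                up_write(&my_rwsem);
        }
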
9107 diff --git a/include/linux/sched.h b/include/linux/sched.h
9108 index 75d9a57e212e..8cb7df0f56e3 100644
9109 --- a/include/linux/sched.h
9110 +++ b/include/linux/sched.h
9111 @@ -26,6 +26,7 @@ struct sched_param {
9112  #include <linux/nodemask.h>
9113  #include <linux/mm_types.h>
9114  #include <linux/preempt.h>
9115 +#include <asm/kmap_types.h>
9116  
9117  #include <asm/page.h>
9118  #include <asm/ptrace.h>
9119 @@ -243,10 +244,7 @@ extern char ___assert_task_state[1 - 2*!!(
9120                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
9121                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
9122  
9123 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
9124  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
9125 -#define task_is_stopped_or_traced(task)        \
9126 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
9127  #define task_contributes_to_load(task) \
9128                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
9129                                  (task->flags & PF_FROZEN) == 0 && \
9130 @@ -312,6 +310,11 @@ extern char ___assert_task_state[1 - 2*!!(
9131  
9132  #endif
9133  
9134 +#define __set_current_state_no_track(state_value)      \
9135 +       do { current->state = (state_value); } while (0)
9136 +#define set_current_state_no_track(state_value)                \
9137 +       set_mb(current->state, (state_value))
9138 +
9139  /* Task command name length */
9140  #define TASK_COMM_LEN 16
9141  
9142 @@ -1013,8 +1016,18 @@ struct wake_q_head {
9143         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
9144  
9145  extern void wake_q_add(struct wake_q_head *head,
9146 -                      struct task_struct *task);
9147 -extern void wake_up_q(struct wake_q_head *head);
9148 +                             struct task_struct *task);
9149 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
9150 +
9151 +static inline void wake_up_q(struct wake_q_head *head)
9152 +{
9153 +       __wake_up_q(head, false);
9154 +}
9155 +
9156 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9157 +{
9158 +       __wake_up_q(head, true);
9159 +}
9160  
9161  /*
9162   * sched-domains (multiprocessor balancing) declarations:
9163 @@ -1481,6 +1494,7 @@ struct task_struct {
9164         struct thread_info thread_info;
9165  #endif
9166         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9167 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9168         void *stack;
9169         atomic_t usage;
9170         unsigned int flags;     /* per process flags, defined below */
9171 @@ -1520,6 +1534,12 @@ struct task_struct {
9172  #endif
9173  
9174         unsigned int policy;
9175 +#ifdef CONFIG_PREEMPT_RT_FULL
9176 +       int migrate_disable;
9177 +# ifdef CONFIG_SCHED_DEBUG
9178 +       int migrate_disable_atomic;
9179 +# endif
9180 +#endif
9181         int nr_cpus_allowed;
9182         cpumask_t cpus_allowed;
9183  
9184 @@ -1654,6 +1674,9 @@ struct task_struct {
9185  
9186         struct task_cputime cputime_expires;
9187         struct list_head cpu_timers[3];
9188 +#ifdef CONFIG_PREEMPT_RT_BASE
9189 +       struct task_struct *posix_timer_list;
9190 +#endif
9191  
9192  /* process credentials */
9193         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9194 @@ -1685,10 +1708,15 @@ struct task_struct {
9195  /* signal handlers */
9196         struct signal_struct *signal;
9197         struct sighand_struct *sighand;
9198 +       struct sigqueue *sigqueue_cache;
9199  
9200         sigset_t blocked, real_blocked;
9201         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9202         struct sigpending pending;
9203 +#ifdef CONFIG_PREEMPT_RT_FULL
9204 +       /* TODO: move me into ->restart_block ? */
9205 +       struct siginfo forced_info;
9206 +#endif
9207  
9208         unsigned long sas_ss_sp;
9209         size_t sas_ss_size;
9210 @@ -1917,6 +1945,12 @@ struct task_struct {
9211         /* bitmask and counter of trace recursion */
9212         unsigned long trace_recursion;
9213  #endif /* CONFIG_TRACING */
9214 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9215 +       u64 preempt_timestamp_hist;
9216 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9217 +       long timer_offset;
9218 +#endif
9219 +#endif
9220  #ifdef CONFIG_KCOV
9221         /* Coverage collection mode enabled for this task (0 if disabled). */
9222         enum kcov_mode kcov_mode;
9223 @@ -1942,9 +1976,23 @@ struct task_struct {
9224         unsigned int    sequential_io;
9225         unsigned int    sequential_io_avg;
9226  #endif
9227 +#ifdef CONFIG_PREEMPT_RT_BASE
9228 +       struct rcu_head put_rcu;
9229 +       int softirq_nestcnt;
9230 +       unsigned int softirqs_raised;
9231 +#endif
9232 +#ifdef CONFIG_PREEMPT_RT_FULL
9233 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9234 +       int kmap_idx;
9235 +       pte_t kmap_pte[KM_TYPE_NR];
9236 +# endif
9237 +#endif
9238  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9239         unsigned long   task_state_change;
9240  #endif
9241 +#ifdef CONFIG_PREEMPT_RT_FULL
9242 +       int xmit_recursion;
9243 +#endif
9244         int pagefault_disabled;
9245  #ifdef CONFIG_MMU
9246         struct task_struct *oom_reaper_list;
9247 @@ -1984,14 +2032,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9248  }
9249  #endif
9250  
9251 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9252 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9253 -
9254 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9255 -{
9256 -       return p->nr_cpus_allowed;
9257 -}
9258 -
9259  #define TNF_MIGRATED   0x01
9260  #define TNF_NO_GROUP   0x02
9261  #define TNF_SHARED     0x04
9262 @@ -2207,6 +2247,15 @@ extern struct pid *cad_pid;
9263  extern void free_task(struct task_struct *tsk);
9264  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9265  
9266 +#ifdef CONFIG_PREEMPT_RT_BASE
9267 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9268 +
9269 +static inline void put_task_struct(struct task_struct *t)
9270 +{
9271 +       if (atomic_dec_and_test(&t->usage))
9272 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9273 +}
9274 +#else
9275  extern void __put_task_struct(struct task_struct *t);
9276  
9277  static inline void put_task_struct(struct task_struct *t)
9278 @@ -2214,6 +2263,7 @@ static inline void put_task_struct(struct task_struct *t)
9279         if (atomic_dec_and_test(&t->usage))
9280                 __put_task_struct(t);
9281  }
9282 +#endif
9283  
9284  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9285  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9286 @@ -2255,6 +2305,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9287  /*
9288   * Per process flags
9289   */
9290 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9291  #define PF_EXITING     0x00000004      /* getting shut down */
9292  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9293  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9294 @@ -2423,6 +2474,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9295  
9296  extern int set_cpus_allowed_ptr(struct task_struct *p,
9297                                 const struct cpumask *new_mask);
9298 +int migrate_me(void);
9299 +void tell_sched_cpu_down_begin(int cpu);
9300 +void tell_sched_cpu_down_done(int cpu);
9301 +
9302  #else
9303  static inline void do_set_cpus_allowed(struct task_struct *p,
9304                                       const struct cpumask *new_mask)
9305 @@ -2435,6 +2490,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9306                 return -EINVAL;
9307         return 0;
9308  }
9309 +static inline int migrate_me(void) { return 0; }
9310 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9311 +static inline void tell_sched_cpu_down_done(int cpu) { }
9312  #endif
9313  
9314  #ifdef CONFIG_NO_HZ_COMMON
9315 @@ -2673,6 +2731,7 @@ extern void xtime_update(unsigned long ticks);
9316  
9317  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9318  extern int wake_up_process(struct task_struct *tsk);
9319 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9320  extern void wake_up_new_task(struct task_struct *tsk);
9321  #ifdef CONFIG_SMP
9322   extern void kick_process(struct task_struct *tsk);
9323 @@ -2881,6 +2940,17 @@ static inline void mmdrop(struct mm_struct *mm)
9324                 __mmdrop(mm);
9325  }
9326  
9327 +#ifdef CONFIG_PREEMPT_RT_BASE
9328 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9329 +static inline void mmdrop_delayed(struct mm_struct *mm)
9330 +{
9331 +       if (atomic_dec_and_test(&mm->mm_count))
9332 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9333 +}
9334 +#else
9335 +# define mmdrop_delayed(mm)    mmdrop(mm)
9336 +#endif
9337 +
9338  static inline void mmdrop_async_fn(struct work_struct *work)
9339  {
9340         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9341 @@ -3273,6 +3343,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9342         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9343  }
9344  
9345 +#ifdef CONFIG_PREEMPT_LAZY
9346 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9347 +{
9348 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9349 +}
9350 +
9351 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9352 +{
9353 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9354 +}
9355 +
9356 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9357 +{
9358 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9359 +}
9360 +
9361 +static inline int need_resched_lazy(void)
9362 +{
9363 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9364 +}
9365 +
9366 +static inline int need_resched_now(void)
9367 +{
9368 +       return test_thread_flag(TIF_NEED_RESCHED);
9369 +}
9370 +
9371 +#else
9372 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9373 +static inline int need_resched_lazy(void) { return 0; }
9374 +
9375 +static inline int need_resched_now(void)
9376 +{
9377 +       return test_thread_flag(TIF_NEED_RESCHED);
9378 +}
9379 +
9380 +#endif
9381 +
9382  static inline int restart_syscall(void)
9383  {
9384         set_tsk_thread_flag(current, TIF_SIGPENDING);
9385 @@ -3304,6 +3411,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9386         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9387  }
9388  
9389 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9390 +{
9391 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9392 +               return true;
9393 +#ifdef CONFIG_PREEMPT_RT_FULL
9394 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9395 +               return true;
9396 +#endif
9397 +       return false;
9398 +}
9399 +
9400 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9401 +{
9402 +       bool traced_stopped;
9403 +
9404 +#ifdef CONFIG_PREEMPT_RT_FULL
9405 +       unsigned long flags;
9406 +
9407 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9408 +       traced_stopped = __task_is_stopped_or_traced(task);
9409 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9410 +#else
9411 +       traced_stopped = __task_is_stopped_or_traced(task);
9412 +#endif
9413 +       return traced_stopped;
9414 +}
9415 +
9416 +static inline bool task_is_traced(struct task_struct *task)
9417 +{
9418 +       bool traced = false;
9419 +
9420 +       if (task->state & __TASK_TRACED)
9421 +               return true;
9422 +#ifdef CONFIG_PREEMPT_RT_FULL
9423 +       /* in case the task is sleeping on tasklist_lock */
9424 +       raw_spin_lock_irq(&task->pi_lock);
9425 +       if (task->state & __TASK_TRACED)
9426 +               traced = true;
9427 +       else if (task->saved_state & __TASK_TRACED)
9428 +               traced = true;
9429 +       raw_spin_unlock_irq(&task->pi_lock);
9430 +#endif
9431 +       return traced;
9432 +}
9433 +
9434  /*
9435   * cond_resched() and cond_resched_lock(): latency reduction via
9436   * explicit rescheduling in places that are safe. The return
9437 @@ -3329,12 +3481,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9438         __cond_resched_lock(lock);                              \
9439  })
9440  
9441 +#ifndef CONFIG_PREEMPT_RT_FULL
9442  extern int __cond_resched_softirq(void);
9443  
9444  #define cond_resched_softirq() ({                                      \
9445         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9446         __cond_resched_softirq();                                       \
9447  })
9448 +#else
9449 +# define cond_resched_softirq()                cond_resched()
9450 +#endif
9451  
9452  static inline void cond_resched_rcu(void)
9453  {
9454 @@ -3509,6 +3665,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9455  
9456  #endif /* CONFIG_SMP */
9457  
9458 +static inline int __migrate_disabled(struct task_struct *p)
9459 +{
9460 +#ifdef CONFIG_PREEMPT_RT_FULL
9461 +       return p->migrate_disable;
9462 +#else
9463 +       return 0;
9464 +#endif
9465 +}
9466 +
9467 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9468 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9469 +{
9470 +       if (__migrate_disabled(p))
9471 +               return cpumask_of(task_cpu(p));
9472 +
9473 +       return &p->cpus_allowed;
9474 +}
9475 +
9476 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9477 +{
9478 +       if (__migrate_disabled(p))
9479 +               return 1;
9480 +       return p->nr_cpus_allowed;
9481 +}
9482 +
9483  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9484  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9485  
9486 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9487 index ead97654c4e9..3d7223ffdd3b 100644
9488 --- a/include/linux/seqlock.h
9489 +++ b/include/linux/seqlock.h
9490 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9491         return __read_seqcount_retry(s, start);
9492  }
9493  
9494 -
9495 -
9496 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9497 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9498  {
9499         s->sequence++;
9500         smp_wmb();
9501  }
9502  
9503 -static inline void raw_write_seqcount_end(seqcount_t *s)
9504 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9505 +{
9506 +       preempt_disable_rt();
9507 +       __raw_write_seqcount_begin(s);
9508 +}
9509 +
9510 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9511  {
9512         smp_wmb();
9513         s->sequence++;
9514  }
9515  
9516 +static inline void raw_write_seqcount_end(seqcount_t *s)
9517 +{
9518 +       __raw_write_seqcount_end(s);
9519 +       preempt_enable_rt();
9520 +}
9521 +
9522  /**
9523   * raw_write_seqcount_barrier - do a seq write barrier
9524   * @s: pointer to seqcount_t
9525 @@ -428,10 +438,32 @@ typedef struct {
9526  /*
9527   * Read side functions for starting and finalizing a read side section.
9528   */
9529 +#ifndef CONFIG_PREEMPT_RT_FULL
9530  static inline unsigned read_seqbegin(const seqlock_t *sl)
9531  {
9532         return read_seqcount_begin(&sl->seqcount);
9533  }
9534 +#else
9535 +/*
9536 + * Starvation safe read side for RT
9537 + */
9538 +static inline unsigned read_seqbegin(seqlock_t *sl)
9539 +{
9540 +       unsigned ret;
9541 +
9542 +repeat:
9543 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9544 +       if (unlikely(ret & 1)) {
9545 +               /*
9546 +                * Take the lock and let the writer proceed (i.e. evtl
9546 +                * Take the lock and let the writer proceed (i.e. possibly
9547 +                * boost it), otherwise we could loop here forever.
9548 +                */
9549 +               spin_unlock_wait(&sl->lock);
9550 +               goto repeat;
9551 +       }
9552 +       return ret;
9553 +}
9554 +#endif
9555  
9556  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9557  {
9558 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9559  static inline void write_seqlock(seqlock_t *sl)
9560  {
9561         spin_lock(&sl->lock);
9562 -       write_seqcount_begin(&sl->seqcount);
9563 +       __raw_write_seqcount_begin(&sl->seqcount);
9564 +}
9565 +
9566 +static inline int try_write_seqlock(seqlock_t *sl)
9567 +{
9568 +       if (spin_trylock(&sl->lock)) {
9569 +               __raw_write_seqcount_begin(&sl->seqcount);
9570 +               return 1;
9571 +       }
9572 +       return 0;
9573  }
9574  
9575  static inline void write_sequnlock(seqlock_t *sl)
9576  {
9577 -       write_seqcount_end(&sl->seqcount);
9578 +       __raw_write_seqcount_end(&sl->seqcount);
9579         spin_unlock(&sl->lock);
9580  }
9581  
9582  static inline void write_seqlock_bh(seqlock_t *sl)
9583  {
9584         spin_lock_bh(&sl->lock);
9585 -       write_seqcount_begin(&sl->seqcount);
9586 +       __raw_write_seqcount_begin(&sl->seqcount);
9587  }
9588  
9589  static inline void write_sequnlock_bh(seqlock_t *sl)
9590  {
9591 -       write_seqcount_end(&sl->seqcount);
9592 +       __raw_write_seqcount_end(&sl->seqcount);
9593         spin_unlock_bh(&sl->lock);
9594  }
9595  
9596  static inline void write_seqlock_irq(seqlock_t *sl)
9597  {
9598         spin_lock_irq(&sl->lock);
9599 -       write_seqcount_begin(&sl->seqcount);
9600 +       __raw_write_seqcount_begin(&sl->seqcount);
9601  }
9602  
9603  static inline void write_sequnlock_irq(seqlock_t *sl)
9604  {
9605 -       write_seqcount_end(&sl->seqcount);
9606 +       __raw_write_seqcount_end(&sl->seqcount);
9607         spin_unlock_irq(&sl->lock);
9608  }
9609  
9610 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9611         unsigned long flags;
9612  
9613         spin_lock_irqsave(&sl->lock, flags);
9614 -       write_seqcount_begin(&sl->seqcount);
9615 +       __raw_write_seqcount_begin(&sl->seqcount);
9616         return flags;
9617  }
9618  
9619 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9620  static inline void
9621  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9622  {
9623 -       write_seqcount_end(&sl->seqcount);
9624 +       __raw_write_seqcount_end(&sl->seqcount);
9625         spin_unlock_irqrestore(&sl->lock, flags);
9626  }
9627  
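
The seqlock writer side now uses the __raw_write_seqcount helpers so that, together with the RT read_seqbegin() above (which waits on the spinlock instead of spinning on the sequence), a preempted writer cannot be starved by readers. The caller API is unchanged; the classic pattern, with illustrative names:

        #include <linux/seqlock.h>

        static DEFINE_SEQLOCK(state_lock);
        static u64 state_a, state_b;

        static void update_state(u64 a, u64 b)
        {
                write_seqlock(&state_lock);
                state_a = a;
                state_b = b;
                write_sequnlock(&state_lock);
        }

        static void read_state(u64 *a, u64 *b)
        {
                unsigned seq;

                do {
                        seq = read_seqbegin(&state_lock);
                        *a = state_a;
                        *b = state_b;
                } while (read_seqretry(&state_lock, seq));
        }
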
9628 diff --git a/include/linux/signal.h b/include/linux/signal.h
9629 index b63f63eaa39c..295540fdfc72 100644
9630 --- a/include/linux/signal.h
9631 +++ b/include/linux/signal.h
9632 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
9633  }
9634  
9635  extern void flush_sigqueue(struct sigpending *queue);
9636 +extern void flush_task_sigqueue(struct task_struct *tsk);
9637  
9638  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9639  static inline int valid_signal(unsigned long sig)
9640 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
9641 index 32810f279f8e..0db6e31161f6 100644
9642 --- a/include/linux/skbuff.h
9643 +++ b/include/linux/skbuff.h
9644 @@ -284,6 +284,7 @@ struct sk_buff_head {
9645  
9646         __u32           qlen;
9647         spinlock_t      lock;
9648 +       raw_spinlock_t  raw_lock;
9649  };
9650  
9651  struct sk_buff;
9652 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
9653         __skb_queue_head_init(list);
9654  }
9655  
9656 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9657 +{
9658 +       raw_spin_lock_init(&list->raw_lock);
9659 +       __skb_queue_head_init(list);
9660 +}
9661 +
9662  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9663                 struct lock_class_key *class)
9664  {
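
sk_buff_head grows a raw_lock next to the normal spinlock, and skb_queue_head_init_raw() initializes it; the idea is that RT code which must queue skbs from truly atomic context takes the raw lock and uses the unlocked __skb_* helpers. A hypothetical sketch of that pattern (the queue name and call sites are made up):

        #include <linux/skbuff.h>

        static struct sk_buff_head atomic_q;

        static void init_queue(void)
        {
                skb_queue_head_init_raw(&atomic_q);
        }

        static void queue_skb_atomic(struct sk_buff *skb)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&atomic_q.raw_lock, flags);
                __skb_queue_tail(&atomic_q, skb);
                raw_spin_unlock_irqrestore(&atomic_q.raw_lock, flags);
        }
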
9665 diff --git a/include/linux/smp.h b/include/linux/smp.h
9666 index 8e0cb7a0f836..891c533724f5 100644
9667 --- a/include/linux/smp.h
9668 +++ b/include/linux/smp.h
9669 @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
9670  extern void __init setup_nr_cpu_ids(void);
9671  extern void __init smp_init(void);
9672  
9673 +extern int __boot_cpu_id;
9674 +
9675 +static inline int get_boot_cpu_id(void)
9676 +{
9677 +       return __boot_cpu_id;
9678 +}
9679 +
9680  #else /* !SMP */
9681  
9682  static inline void smp_send_stop(void) { }
9683 @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
9684  static inline void smp_init(void) { }
9685  #endif
9686  
9687 +static inline int get_boot_cpu_id(void)
9688 +{
9689 +       return 0;
9690 +}
9691 +
9692  #endif /* !SMP */
9693  
9694  /*
9695 @@ -185,6 +197,9 @@ static inline void smp_init(void) { }
9696  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
9697  #define put_cpu()              preempt_enable()
9698  
9699 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
9700 +#define put_cpu_light()                migrate_enable()
9701 +
9702  /*
9703   * Callback to arch code if there's nosmp or maxcpus=0 on the
9704   * boot command line:
9705 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
9706 index 47dd0cebd204..b241cc044bd3 100644
9707 --- a/include/linux/spinlock.h
9708 +++ b/include/linux/spinlock.h
9709 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9710  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
9711  
9712  /* Include rwlock functions */
9713 -#include <linux/rwlock.h>
9714 +#ifdef CONFIG_PREEMPT_RT_FULL
9715 +# include <linux/rwlock_rt.h>
9716 +#else
9717 +# include <linux/rwlock.h>
9718 +#endif
9719  
9720  /*
9721   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9722 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9723  # include <linux/spinlock_api_up.h>
9724  #endif
9725  
9726 +#ifdef CONFIG_PREEMPT_RT_FULL
9727 +# include <linux/spinlock_rt.h>
9728 +#else /* PREEMPT_RT_FULL */
9729 +
9730  /*
9731   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9732   */
9733 @@ -416,4 +424,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
9734  #define atomic_dec_and_lock(atomic, lock) \
9735                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9736  
9737 +#endif /* !PREEMPT_RT_FULL */
9738 +
9739  #endif /* __LINUX_SPINLOCK_H */
9740 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
9741 index 5344268e6e62..043263f30e81 100644
9742 --- a/include/linux/spinlock_api_smp.h
9743 +++ b/include/linux/spinlock_api_smp.h
9744 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
9745         return 0;
9746  }
9747  
9748 -#include <linux/rwlock_api_smp.h>
9749 +#ifndef CONFIG_PREEMPT_RT_FULL
9750 +# include <linux/rwlock_api_smp.h>
9751 +#endif
9752  
9753  #endif /* __LINUX_SPINLOCK_API_SMP_H */
9754 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
9755 new file mode 100644
9756 index 000000000000..43ca841b913a
9757 --- /dev/null
9758 +++ b/include/linux/spinlock_rt.h
9759 @@ -0,0 +1,162 @@
9760 +#ifndef __LINUX_SPINLOCK_RT_H
9761 +#define __LINUX_SPINLOCK_RT_H
9762 +
9763 +#ifndef __LINUX_SPINLOCK_H
9764 +#error Do not include directly. Use spinlock.h
9765 +#endif
9766 +
9767 +#include <linux/bug.h>
9768 +
9769 +extern void
9770 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9771 +
9772 +#define spin_lock_init(slock)                          \
9773 +do {                                                   \
9774 +       static struct lock_class_key __key;             \
9775 +                                                       \
9776 +       rt_mutex_init(&(slock)->lock);                  \
9777 +       __rt_spin_lock_init(slock, #slock, &__key);     \
9778 +} while (0)
9779 +
9780 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9781 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9782 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9783 +
9784 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9785 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9786 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9787 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9788 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9789 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9790 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9791 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9792 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9793 +
9794 +/*
9795 + * lockdep-less calls, for derived types like rwlock:
9796 + * (for trylock they can use rt_mutex_trylock() directly).
9797 + */
9798 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9799 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9800 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9801 +
9802 +#define spin_lock(lock)                        rt_spin_lock(lock)
9803 +
9804 +#define spin_lock_bh(lock)                     \
9805 +       do {                                    \
9806 +               local_bh_disable();             \
9807 +               rt_spin_lock(lock);             \
9808 +       } while (0)
9809 +
9810 +#define spin_lock_irq(lock)            spin_lock(lock)
9811 +
9812 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
9813 +
9814 +#define spin_trylock(lock)                     \
9815 +({                                             \
9816 +       int __locked;                           \
9817 +       __locked = spin_do_trylock(lock);       \
9818 +       __locked;                               \
9819 +})
9820 +
9821 +#ifdef CONFIG_LOCKDEP
9822 +# define spin_lock_nested(lock, subclass)              \
9823 +       do {                                            \
9824 +               rt_spin_lock_nested(lock, subclass);    \
9825 +       } while (0)
9826 +
9827 +#define spin_lock_bh_nested(lock, subclass)            \
9828 +       do {                                            \
9829 +               local_bh_disable();                     \
9830 +               rt_spin_lock_nested(lock, subclass);    \
9831 +       } while (0)
9832 +
9833 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9834 +       do {                                             \
9835 +               typecheck(unsigned long, flags);         \
9836 +               flags = 0;                               \
9837 +               rt_spin_lock_nested(lock, subclass);     \
9838 +       } while (0)
9839 +#else
9840 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
9841 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
9842 +
9843 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9844 +       do {                                             \
9845 +               typecheck(unsigned long, flags);         \
9846 +               flags = 0;                               \
9847 +               spin_lock(lock);                         \
9848 +       } while (0)
9849 +#endif
9850 +
9851 +#define spin_lock_irqsave(lock, flags)                  \
9852 +       do {                                             \
9853 +               typecheck(unsigned long, flags);         \
9854 +               flags = 0;                               \
9855 +               spin_lock(lock);                         \
9856 +       } while (0)
9857 +
9858 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9859 +{
9860 +       unsigned long flags = 0;
9861 +#ifdef CONFIG_TRACE_IRQFLAGS
9862 +       flags = rt_spin_lock_trace_flags(lock);
9863 +#else
9864 +       spin_lock(lock); /* lock_local */
9865 +#endif
9866 +       return flags;
9867 +}
9868 +
9869 +/* FIXME: we need rt_spin_lock_nest_lock */
9870 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9871 +
9872 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
9873 +
9874 +#define spin_unlock_bh(lock)                           \
9875 +       do {                                            \
9876 +               rt_spin_unlock(lock);                   \
9877 +               local_bh_enable();                      \
9878 +       } while (0)
9879 +
9880 +#define spin_unlock_irq(lock)          spin_unlock(lock)
9881 +
9882 +#define spin_unlock_irqrestore(lock, flags)            \
9883 +       do {                                            \
9884 +               typecheck(unsigned long, flags);        \
9885 +               (void) flags;                           \
9886 +               spin_unlock(lock);                      \
9887 +       } while (0)
9888 +
9889 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
9890 +#define spin_trylock_irq(lock) spin_trylock(lock)
9891 +
9892 +#define spin_trylock_irqsave(lock, flags)      \
9893 +       rt_spin_trylock_irqsave(lock, &(flags))
9894 +
9895 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
9896 +
9897 +#ifdef CONFIG_GENERIC_LOCKBREAK
9898 +# define spin_is_contended(lock)       ((lock)->break_lock)
9899 +#else
9900 +# define spin_is_contended(lock)       (((void)(lock), 0))
9901 +#endif
9902 +
9903 +static inline int spin_can_lock(spinlock_t *lock)
9904 +{
9905 +       return !rt_mutex_is_locked(&lock->lock);
9906 +}
9907 +
9908 +static inline int spin_is_locked(spinlock_t *lock)
9909 +{
9910 +       return rt_mutex_is_locked(&lock->lock);
9911 +}
9912 +
9913 +static inline void assert_spin_locked(spinlock_t *lock)
9914 +{
9915 +       BUG_ON(!spin_is_locked(lock));
9916 +}
9917 +
9918 +#define atomic_dec_and_lock(atomic, lock) \
9919 +       atomic_dec_and_spin_lock(atomic, lock)
9920 +
9921 +#endif
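
A minimal usage sketch (hypothetical ex_lock/ex_count): with the mappings above, the same driver-style source builds for both configurations, but under PREEMPT_RT_FULL the lock is a sleeping rt_mutex and spin_lock_irqsave() merely zeroes flags, so interrupts stay enabled and the section remains preemptible.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(ex_lock);        /* hypothetical lock */
static unsigned long ex_count;          /* hypothetical shared counter */

static void ex_update(void)
{
        unsigned long flags;

        /* Maps to rt_spin_lock() on PREEMPT_RT_FULL; flags stays 0. */
        spin_lock_irqsave(&ex_lock, flags);
        ex_count++;
        spin_unlock_irqrestore(&ex_lock, flags);
}
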
9922 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
9923 index 73548eb13a5d..10bac715ea96 100644
9924 --- a/include/linux/spinlock_types.h
9925 +++ b/include/linux/spinlock_types.h
9926 @@ -9,80 +9,15 @@
9927   * Released under the General Public License (GPL).
9928   */
9929  
9930 -#if defined(CONFIG_SMP)
9931 -# include <asm/spinlock_types.h>
9932 +#include <linux/spinlock_types_raw.h>
9933 +
9934 +#ifndef CONFIG_PREEMPT_RT_FULL
9935 +# include <linux/spinlock_types_nort.h>
9936 +# include <linux/rwlock_types.h>
9937  #else
9938 -# include <linux/spinlock_types_up.h>
9939 +# include <linux/rtmutex.h>
9940 +# include <linux/spinlock_types_rt.h>
9941 +# include <linux/rwlock_types_rt.h>
9942  #endif
9943  
9944 -#include <linux/lockdep.h>
9945 -
9946 -typedef struct raw_spinlock {
9947 -       arch_spinlock_t raw_lock;
9948 -#ifdef CONFIG_GENERIC_LOCKBREAK
9949 -       unsigned int break_lock;
9950 -#endif
9951 -#ifdef CONFIG_DEBUG_SPINLOCK
9952 -       unsigned int magic, owner_cpu;
9953 -       void *owner;
9954 -#endif
9955 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9956 -       struct lockdep_map dep_map;
9957 -#endif
9958 -} raw_spinlock_t;
9959 -
9960 -#define SPINLOCK_MAGIC         0xdead4ead
9961 -
9962 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9963 -
9964 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9965 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
9966 -#else
9967 -# define SPIN_DEP_MAP_INIT(lockname)
9968 -#endif
9969 -
9970 -#ifdef CONFIG_DEBUG_SPINLOCK
9971 -# define SPIN_DEBUG_INIT(lockname)             \
9972 -       .magic = SPINLOCK_MAGIC,                \
9973 -       .owner_cpu = -1,                        \
9974 -       .owner = SPINLOCK_OWNER_INIT,
9975 -#else
9976 -# define SPIN_DEBUG_INIT(lockname)
9977 -#endif
9978 -
9979 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
9980 -       {                                       \
9981 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
9982 -       SPIN_DEBUG_INIT(lockname)               \
9983 -       SPIN_DEP_MAP_INIT(lockname) }
9984 -
9985 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
9986 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9987 -
9988 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9989 -
9990 -typedef struct spinlock {
9991 -       union {
9992 -               struct raw_spinlock rlock;
9993 -
9994 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9995 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9996 -               struct {
9997 -                       u8 __padding[LOCK_PADSIZE];
9998 -                       struct lockdep_map dep_map;
9999 -               };
10000 -#endif
10001 -       };
10002 -} spinlock_t;
10003 -
10004 -#define __SPIN_LOCK_INITIALIZER(lockname) \
10005 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10006 -
10007 -#define __SPIN_LOCK_UNLOCKED(lockname) \
10008 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10009 -
10010 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10011 -
10012 -#include <linux/rwlock_types.h>
10013 -
10014  #endif /* __LINUX_SPINLOCK_TYPES_H */
10015 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
10016 new file mode 100644
10017 index 000000000000..f1dac1fb1d6a
10018 --- /dev/null
10019 +++ b/include/linux/spinlock_types_nort.h
10020 @@ -0,0 +1,33 @@
10021 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
10022 +#define __LINUX_SPINLOCK_TYPES_NORT_H
10023 +
10024 +#ifndef __LINUX_SPINLOCK_TYPES_H
10025 +#error "Do not include directly. Include spinlock_types.h instead"
10026 +#endif
10027 +
10028 +/*
10029 + * The non-RT version maps spinlocks to raw_spinlocks
10030 + */
10031 +typedef struct spinlock {
10032 +       union {
10033 +               struct raw_spinlock rlock;
10034 +
10035 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10036 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10037 +               struct {
10038 +                       u8 __padding[LOCK_PADSIZE];
10039 +                       struct lockdep_map dep_map;
10040 +               };
10041 +#endif
10042 +       };
10043 +} spinlock_t;
10044 +
10045 +#define __SPIN_LOCK_INITIALIZER(lockname) \
10046 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10047 +
10048 +#define __SPIN_LOCK_UNLOCKED(lockname) \
10049 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10050 +
10051 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10052 +
10053 +#endif
10054 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
10055 new file mode 100644
10056 index 000000000000..edffc4d53fc9
10057 --- /dev/null
10058 +++ b/include/linux/spinlock_types_raw.h
10059 @@ -0,0 +1,56 @@
10060 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
10061 +#define __LINUX_SPINLOCK_TYPES_RAW_H
10062 +
10063 +#if defined(CONFIG_SMP)
10064 +# include <asm/spinlock_types.h>
10065 +#else
10066 +# include <linux/spinlock_types_up.h>
10067 +#endif
10068 +
10069 +#include <linux/lockdep.h>
10070 +
10071 +typedef struct raw_spinlock {
10072 +       arch_spinlock_t raw_lock;
10073 +#ifdef CONFIG_GENERIC_LOCKBREAK
10074 +       unsigned int break_lock;
10075 +#endif
10076 +#ifdef CONFIG_DEBUG_SPINLOCK
10077 +       unsigned int magic, owner_cpu;
10078 +       void *owner;
10079 +#endif
10080 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10081 +       struct lockdep_map dep_map;
10082 +#endif
10083 +} raw_spinlock_t;
10084 +
10085 +#define SPINLOCK_MAGIC         0xdead4ead
10086 +
10087 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10088 +
10089 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10090 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10091 +#else
10092 +# define SPIN_DEP_MAP_INIT(lockname)
10093 +#endif
10094 +
10095 +#ifdef CONFIG_DEBUG_SPINLOCK
10096 +# define SPIN_DEBUG_INIT(lockname)             \
10097 +       .magic = SPINLOCK_MAGIC,                \
10098 +       .owner_cpu = -1,                        \
10099 +       .owner = SPINLOCK_OWNER_INIT,
10100 +#else
10101 +# define SPIN_DEBUG_INIT(lockname)
10102 +#endif
10103 +
10104 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10105 +       {                                       \
10106 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10107 +       SPIN_DEBUG_INIT(lockname)               \
10108 +       SPIN_DEP_MAP_INIT(lockname) }
10109 +
10110 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10111 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10112 +
10113 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10114 +
10115 +#endif
10116 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
10117 new file mode 100644
10118 index 000000000000..3e3d8c5f7a9a
10119 --- /dev/null
10120 +++ b/include/linux/spinlock_types_rt.h
10121 @@ -0,0 +1,48 @@
10122 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
10123 +#define __LINUX_SPINLOCK_TYPES_RT_H
10124 +
10125 +#ifndef __LINUX_SPINLOCK_TYPES_H
10126 +#error "Do not include directly. Include spinlock_types.h instead"
10127 +#endif
10128 +
10129 +#include <linux/cache.h>
10130 +
10131 +/*
10132 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
10133 + */
10134 +typedef struct spinlock {
10135 +       struct rt_mutex         lock;
10136 +       unsigned int            break_lock;
10137 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10138 +       struct lockdep_map      dep_map;
10139 +#endif
10140 +} spinlock_t;
10141 +
10142 +#ifdef CONFIG_DEBUG_RT_MUTEXES
10143 +# define __RT_SPIN_INITIALIZER(name) \
10144 +       { \
10145 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
10146 +       .save_state = 1, \
10147 +       .file = __FILE__, \
10148 +       .line = __LINE__ , \
10149 +       }
10150 +#else
10151 +# define __RT_SPIN_INITIALIZER(name) \
10152 +       {                                                               \
10153 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
10154 +       .save_state = 1, \
10155 +       }
10156 +#endif
10157 +
10158 +/*
10159 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
10160 +*/
10161 +
10162 +#define __SPIN_LOCK_UNLOCKED(name)                     \
10163 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10164 +         SPIN_DEP_MAP_INIT(name) }
10165 +
10166 +#define DEFINE_SPINLOCK(name) \
10167 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10168 +
10169 +#endif
10170 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10171 index dc8eb63c6568..e793d3a257da 100644
10172 --- a/include/linux/srcu.h
10173 +++ b/include/linux/srcu.h
10174 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10175  
10176  void process_srcu(struct work_struct *work);
10177  
10178 -#define __SRCU_STRUCT_INIT(name)                                       \
10179 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10180         {                                                               \
10181                 .completed = -300,                                      \
10182 -               .per_cpu_ref = &name##_srcu_array,                      \
10183 +               .per_cpu_ref = &pcpu_name,                              \
10184                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10185                 .running = false,                                       \
10186                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10187 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10188   */
10189  #define __DEFINE_SRCU(name, is_static)                                 \
10190         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10191 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10192 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10193  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10194  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
10195  
10196 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10197 index d9718378a8be..e81e6dc7dcb1 100644
10198 --- a/include/linux/suspend.h
10199 +++ b/include/linux/suspend.h
10200 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10201         void (*end)(void);
10202  };
10203  
10204 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10205 +extern bool pm_in_action;
10206 +#else
10207 +# define pm_in_action false
10208 +#endif
10209 +
10210  #ifdef CONFIG_SUSPEND
10211  /**
10212   * suspend_set_ops - set platform dependent suspend operations
10213 diff --git a/include/linux/swait.h b/include/linux/swait.h
10214 index c1f9c62a8a50..83f004a72320 100644
10215 --- a/include/linux/swait.h
10216 +++ b/include/linux/swait.h
10217 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10218  extern void swake_up(struct swait_queue_head *q);
10219  extern void swake_up_all(struct swait_queue_head *q);
10220  extern void swake_up_locked(struct swait_queue_head *q);
10221 +extern void swake_up_all_locked(struct swait_queue_head *q);
10222  
10223  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10224  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10225 diff --git a/include/linux/swap.h b/include/linux/swap.h
10226 index 55ff5593c193..52bf5477dc92 100644
10227 --- a/include/linux/swap.h
10228 +++ b/include/linux/swap.h
10229 @@ -11,6 +11,7 @@
10230  #include <linux/fs.h>
10231  #include <linux/atomic.h>
10232  #include <linux/page-flags.h>
10233 +#include <linux/locallock.h>
10234  #include <asm/page.h>
10235  
10236  struct notifier_block;
10237 @@ -247,7 +248,8 @@ struct swap_info_struct {
10238  void *workingset_eviction(struct address_space *mapping, struct page *page);
10239  bool workingset_refault(void *shadow);
10240  void workingset_activation(struct page *page);
10241 -extern struct list_lru workingset_shadow_nodes;
10242 +extern struct list_lru __workingset_shadow_nodes;
10243 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10244  
10245  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10246  {
10247 @@ -292,6 +294,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10248  
10249  
10250  /* linux/mm/swap.c */
10251 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10252  extern void lru_cache_add(struct page *);
10253  extern void lru_cache_add_anon(struct page *page);
10254  extern void lru_cache_add_file(struct page *page);
10255 diff --git a/include/linux/swork.h b/include/linux/swork.h
10256 new file mode 100644
10257 index 000000000000..f175fa9a6016
10258 --- /dev/null
10259 +++ b/include/linux/swork.h
10260 @@ -0,0 +1,24 @@
10261 +#ifndef _LINUX_SWORK_H
10262 +#define _LINUX_SWORK_H
10263 +
10264 +#include <linux/list.h>
10265 +
10266 +struct swork_event {
10267 +       struct list_head item;
10268 +       unsigned long flags;
10269 +       void (*func)(struct swork_event *);
10270 +};
10271 +
10272 +static inline void INIT_SWORK(struct swork_event *event,
10273 +                             void (*func)(struct swork_event *))
10274 +{
10275 +       event->flags = 0;
10276 +       event->func = func;
10277 +}
10278 +
10279 +bool swork_queue(struct swork_event *sev);
10280 +
10281 +int swork_get(void);
10282 +void swork_put(void);
10283 +
10284 +#endif /* _LINUX_SWORK_H */
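
A hedged usage sketch of this helper API (hypothetical ex_event/ex_fn/ex_init; the cgroup.c hunk further below follows the same pattern): swork_get() brings up the worker thread, INIT_SWORK() binds the callback, and swork_queue() defers the callback to process context.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/swork.h>

static struct swork_event ex_event;             /* hypothetical event */

static void ex_fn(struct swork_event *sev)      /* runs from the swork thread */
{
        pr_info("deferred work ran\n");
}

static int __init ex_init(void)
{
        int err = swork_get();                  /* create/refcount the worker */

        if (err)
                return err;
        INIT_SWORK(&ex_event, ex_fn);
        swork_queue(&ex_event);
        return 0;
}
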
10285 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10286 index 2873baf5372a..eb1a108f17ca 100644
10287 --- a/include/linux/thread_info.h
10288 +++ b/include/linux/thread_info.h
10289 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10290  #define test_thread_flag(flag) \
10291         test_ti_thread_flag(current_thread_info(), flag)
10292  
10293 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10294 +#ifdef CONFIG_PREEMPT_LAZY
10295 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10296 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10297 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10298 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
10299 +
10300 +#else
10301 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10302 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10303 +#define tif_need_resched_lazy()        0
10304 +#endif
10305  
10306  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10307  static inline int arch_within_stack_frames(const void * const stack,
10308 diff --git a/include/linux/timer.h b/include/linux/timer.h
10309 index 51d601f192d4..83cea629efe1 100644
10310 --- a/include/linux/timer.h
10311 +++ b/include/linux/timer.h
10312 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10313  
10314  extern int try_to_del_timer_sync(struct timer_list *timer);
10315  
10316 -#ifdef CONFIG_SMP
10317 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10318    extern int del_timer_sync(struct timer_list *timer);
10319  #else
10320  # define del_timer_sync(t)             del_timer(t)
10321 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10322 index be007610ceb0..15154b13a53b 100644
10323 --- a/include/linux/trace_events.h
10324 +++ b/include/linux/trace_events.h
10325 @@ -56,6 +56,9 @@ struct trace_entry {
10326         unsigned char           flags;
10327         unsigned char           preempt_count;
10328         int                     pid;
10329 +       unsigned short          migrate_disable;
10330 +       unsigned short          padding;
10331 +       unsigned char           preempt_lazy_count;
10332  };
10333  
10334  #define TRACE_EVENT_TYPE_MAX                                           \
10335 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10336 index f30c187ed785..83bf0f798426 100644
10337 --- a/include/linux/uaccess.h
10338 +++ b/include/linux/uaccess.h
10339 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10340   */
10341  static inline void pagefault_disable(void)
10342  {
10343 +       migrate_disable();
10344         pagefault_disabled_inc();
10345         /*
10346          * make sure to have issued the store before a pagefault
10347 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10348          */
10349         barrier();
10350         pagefault_disabled_dec();
10351 +       migrate_enable();
10352  }
10353  
10354  /*
10355 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10356 index 4a29c75b146e..0a294e950df8 100644
10357 --- a/include/linux/uprobes.h
10358 +++ b/include/linux/uprobes.h
10359 @@ -27,6 +27,7 @@
10360  #include <linux/errno.h>
10361  #include <linux/rbtree.h>
10362  #include <linux/types.h>
10363 +#include <linux/wait.h>
10364  
10365  struct vm_area_struct;
10366  struct mm_struct;
10367 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10368 index 613771909b6e..e28c5a43229d 100644
10369 --- a/include/linux/vmstat.h
10370 +++ b/include/linux/vmstat.h
10371 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10372   */
10373  static inline void __count_vm_event(enum vm_event_item item)
10374  {
10375 +       preempt_disable_rt();
10376         raw_cpu_inc(vm_event_states.event[item]);
10377 +       preempt_enable_rt();
10378  }
10379  
10380  static inline void count_vm_event(enum vm_event_item item)
10381 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10382  
10383  static inline void __count_vm_events(enum vm_event_item item, long delta)
10384  {
10385 +       preempt_disable_rt();
10386         raw_cpu_add(vm_event_states.event[item], delta);
10387 +       preempt_enable_rt();
10388  }
10389  
10390  static inline void count_vm_events(enum vm_event_item item, long delta)
10391 diff --git a/include/linux/wait.h b/include/linux/wait.h
10392 index 2408e8d5c05c..db50d6609195 100644
10393 --- a/include/linux/wait.h
10394 +++ b/include/linux/wait.h
10395 @@ -8,6 +8,7 @@
10396  #include <linux/spinlock.h>
10397  #include <asm/current.h>
10398  #include <uapi/linux/wait.h>
10399 +#include <linux/atomic.h>
10400  
10401  typedef struct __wait_queue wait_queue_t;
10402  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10403 diff --git a/include/net/dst.h b/include/net/dst.h
10404 index 6835d224d47b..55a5a9698f14 100644
10405 --- a/include/net/dst.h
10406 +++ b/include/net/dst.h
10407 @@ -446,7 +446,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10408  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10409                                    struct sk_buff *skb)
10410  {
10411 -       const struct hh_cache *hh;
10412 +       struct hh_cache *hh;
10413  
10414         if (dst->pending_confirm) {
10415                 unsigned long now = jiffies;
10416 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10417 index 231e121cc7d9..d125222b979d 100644
10418 --- a/include/net/gen_stats.h
10419 +++ b/include/net/gen_stats.h
10420 @@ -5,6 +5,7 @@
10421  #include <linux/socket.h>
10422  #include <linux/rtnetlink.h>
10423  #include <linux/pkt_sched.h>
10424 +#include <net/net_seq_lock.h>
10425  
10426  struct gnet_stats_basic_cpu {
10427         struct gnet_stats_basic_packed bstats;
10428 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10429                                  spinlock_t *lock, struct gnet_dump *d,
10430                                  int padattr);
10431  
10432 -int gnet_stats_copy_basic(const seqcount_t *running,
10433 +int gnet_stats_copy_basic(net_seqlock_t *running,
10434                           struct gnet_dump *d,
10435                           struct gnet_stats_basic_cpu __percpu *cpu,
10436                           struct gnet_stats_basic_packed *b);
10437 -void __gnet_stats_copy_basic(const seqcount_t *running,
10438 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10439                              struct gnet_stats_basic_packed *bstats,
10440                              struct gnet_stats_basic_cpu __percpu *cpu,
10441                              struct gnet_stats_basic_packed *b);
10442 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10443                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10444                       struct gnet_stats_rate_est64 *rate_est,
10445                       spinlock_t *stats_lock,
10446 -                     seqcount_t *running, struct nlattr *opt);
10447 +                     net_seqlock_t *running, struct nlattr *opt);
10448  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10449                         struct gnet_stats_rate_est64 *rate_est);
10450  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10451                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10452                           struct gnet_stats_rate_est64 *rate_est,
10453                           spinlock_t *stats_lock,
10454 -                         seqcount_t *running, struct nlattr *opt);
10455 +                         net_seqlock_t *running, struct nlattr *opt);
10456  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10457                           const struct gnet_stats_rate_est64 *rate_est);
10458  #endif
10459 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10460 index 8b683841e574..bf656008f6e7 100644
10461 --- a/include/net/neighbour.h
10462 +++ b/include/net/neighbour.h
10463 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10464  }
10465  #endif
10466  
10467 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10468 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10469  {
10470         unsigned int seq;
10471         int hh_len;
10472 @@ -501,7 +501,7 @@ struct neighbour_cb {
10473  
10474  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10475  
10476 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10477 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10478                                      const struct net_device *dev)
10479  {
10480         unsigned int seq;
10481 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10482 new file mode 100644
10483 index 000000000000..a7034298a82a
10484 --- /dev/null
10485 +++ b/include/net/net_seq_lock.h
10486 @@ -0,0 +1,15 @@
10487 +#ifndef __NET_NET_SEQ_LOCK_H__
10488 +#define __NET_NET_SEQ_LOCK_H__
10489 +
10490 +#ifdef CONFIG_PREEMPT_RT_BASE
10491 +# define net_seqlock_t                 seqlock_t
10492 +# define net_seq_begin(__r)            read_seqbegin(__r)
10493 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10494 +
10495 +#else
10496 +# define net_seqlock_t                 seqcount_t
10497 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10498 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10499 +#endif
10500 +
10501 +#endif
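
A reader-side sketch (hypothetical ex_read_bytes(), loosely modelled on __gnet_stats_copy_basic()): the same retry loop compiles against either representation, taking the seqlock read side on PREEMPT_RT_BASE and the plain seqcount otherwise.

#include <linux/gen_stats.h>
#include <linux/types.h>
#include <net/net_seq_lock.h>

/* Hypothetical helper: snapshot a byte counter consistently while a
 * writer may be updating it under qdisc_run_begin()/qdisc_run_end(). */
static u64 ex_read_bytes(net_seqlock_t *running,
                         const struct gnet_stats_basic_packed *b)
{
        unsigned int seq;
        u64 bytes;

        do {
                seq = net_seq_begin(running);
                bytes = b->bytes;
        } while (net_seq_retry(running, seq));

        return bytes;
}
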
10502 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10503 index 7adf4386ac8f..d3fd5c357268 100644
10504 --- a/include/net/netns/ipv4.h
10505 +++ b/include/net/netns/ipv4.h
10506 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10507  
10508         int sysctl_icmp_echo_ignore_all;
10509         int sysctl_icmp_echo_ignore_broadcasts;
10510 +       int sysctl_icmp_echo_sysrq;
10511         int sysctl_icmp_ignore_bogus_error_responses;
10512         int sysctl_icmp_ratelimit;
10513         int sysctl_icmp_ratemask;
10514 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10515 index e6aa0a249672..b57736f2a8a3 100644
10516 --- a/include/net/sch_generic.h
10517 +++ b/include/net/sch_generic.h
10518 @@ -10,6 +10,7 @@
10519  #include <linux/dynamic_queue_limits.h>
10520  #include <net/gen_stats.h>
10521  #include <net/rtnetlink.h>
10522 +#include <net/net_seq_lock.h>
10523  
10524  struct Qdisc_ops;
10525  struct qdisc_walker;
10526 @@ -86,7 +87,7 @@ struct Qdisc {
10527         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10528         struct qdisc_skb_head   q;
10529         struct gnet_stats_basic_packed bstats;
10530 -       seqcount_t              running;
10531 +       net_seqlock_t           running;
10532         struct gnet_stats_queue qstats;
10533         unsigned long           state;
10534         struct Qdisc            *next_sched;
10535 @@ -98,13 +99,22 @@ struct Qdisc {
10536         spinlock_t              busylock ____cacheline_aligned_in_smp;
10537  };
10538  
10539 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10540 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10541  {
10542 +#ifdef CONFIG_PREEMPT_RT_BASE
10543 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10544 +#else
10545         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10546 +#endif
10547  }
10548  
10549  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10550  {
10551 +#ifdef CONFIG_PREEMPT_RT_BASE
10552 +       if (try_write_seqlock(&qdisc->running))
10553 +               return true;
10554 +       return false;
10555 +#else
10556         if (qdisc_is_running(qdisc))
10557                 return false;
10558         /* Variant of write_seqcount_begin() telling lockdep a trylock
10559 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10560         raw_write_seqcount_begin(&qdisc->running);
10561         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10562         return true;
10563 +#endif
10564  }
10565  
10566  static inline void qdisc_run_end(struct Qdisc *qdisc)
10567  {
10568 +#ifdef CONFIG_PREEMPT_RT_BASE
10569 +       write_sequnlock(&qdisc->running);
10570 +#else
10571         write_seqcount_end(&qdisc->running);
10572 +#endif
10573  }
10574  
10575  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10576 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10577         return qdisc_lock(root);
10578  }
10579  
10580 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10581 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10582  {
10583         struct Qdisc *root = qdisc_root_sleeping(qdisc);
10584  
10585 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10586 new file mode 100644
10587 index 000000000000..f7710de1b1f3
10588 --- /dev/null
10589 +++ b/include/trace/events/hist.h
10590 @@ -0,0 +1,73 @@
10591 +#undef TRACE_SYSTEM
10592 +#define TRACE_SYSTEM hist
10593 +
10594 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10595 +#define _TRACE_HIST_H
10596 +
10597 +#include "latency_hist.h"
10598 +#include <linux/tracepoint.h>
10599 +
10600 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10601 +#define trace_preemptirqsoff_hist(a, b)
10602 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10603 +#else
10604 +TRACE_EVENT(preemptirqsoff_hist,
10605 +
10606 +       TP_PROTO(int reason, int starthist),
10607 +
10608 +       TP_ARGS(reason, starthist),
10609 +
10610 +       TP_STRUCT__entry(
10611 +               __field(int,    reason)
10612 +               __field(int,    starthist)
10613 +       ),
10614 +
10615 +       TP_fast_assign(
10616 +               __entry->reason         = reason;
10617 +               __entry->starthist      = starthist;
10618 +       ),
10619 +
10620 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10621 +                 __entry->starthist ? "start" : "stop")
10622 +);
10623 +#endif
10624 +
10625 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10626 +#define trace_hrtimer_interrupt(a, b, c, d)
10627 +#else
10628 +TRACE_EVENT(hrtimer_interrupt,
10629 +
10630 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10631 +               struct task_struct *task),
10632 +
10633 +       TP_ARGS(cpu, offset, curr, task),
10634 +
10635 +       TP_STRUCT__entry(
10636 +               __field(int,            cpu)
10637 +               __field(long long,      offset)
10638 +               __array(char,           ccomm,  TASK_COMM_LEN)
10639 +               __field(int,            cprio)
10640 +               __array(char,           tcomm,  TASK_COMM_LEN)
10641 +               __field(int,            tprio)
10642 +       ),
10643 +
10644 +       TP_fast_assign(
10645 +               __entry->cpu    = cpu;
10646 +               __entry->offset = offset;
10647 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10648 +               __entry->cprio  = curr->prio;
10649 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10650 +                       task != NULL ? TASK_COMM_LEN : 7);
10651 +               __entry->tprio  = task != NULL ? task->prio : -1;
10652 +       ),
10653 +
10654 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10655 +               __entry->cpu, __entry->offset, __entry->ccomm,
10656 +               __entry->cprio, __entry->tcomm, __entry->tprio)
10657 +);
10658 +#endif
10659 +
10660 +#endif /* _TRACE_HIST_H */
10661 +
10662 +/* This part must be outside protection */
10663 +#include <trace/define_trace.h>
10664 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
10665 new file mode 100644
10666 index 000000000000..d3f2fbd560b1
10667 --- /dev/null
10668 +++ b/include/trace/events/latency_hist.h
10669 @@ -0,0 +1,29 @@
10670 +#ifndef _LATENCY_HIST_H
10671 +#define _LATENCY_HIST_H
10672 +
10673 +enum hist_action {
10674 +       IRQS_ON,
10675 +       PREEMPT_ON,
10676 +       TRACE_STOP,
10677 +       IRQS_OFF,
10678 +       PREEMPT_OFF,
10679 +       TRACE_START,
10680 +};
10681 +
10682 +static char *actions[] = {
10683 +       "IRQS_ON",
10684 +       "PREEMPT_ON",
10685 +       "TRACE_STOP",
10686 +       "IRQS_OFF",
10687 +       "PREEMPT_OFF",
10688 +       "TRACE_START",
10689 +};
10690 +
10691 +static inline char *getaction(int action)
10692 +{
10693 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10694 +               return actions[action];
10695 +       return "unknown";
10696 +}
10697 +
10698 +#endif /* _LATENCY_HIST_H */
10699 diff --git a/init/Kconfig b/init/Kconfig
10700 index 34407f15e6d3..2ce33a32e65d 100644
10701 --- a/init/Kconfig
10702 +++ b/init/Kconfig
10703 @@ -506,7 +506,7 @@ config TINY_RCU
10704  
10705  config RCU_EXPERT
10706         bool "Make expert-level adjustments to RCU configuration"
10707 -       default n
10708 +       default y if PREEMPT_RT_FULL
10709         help
10710           This option needs to be enabled if you wish to make
10711           expert-level adjustments to RCU configuration.  By default,
10712 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
10713  
10714  config RCU_FAST_NO_HZ
10715         bool "Accelerate last non-dyntick-idle CPU's grace periods"
10716 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10717 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10718         default n
10719         help
10720           This option permits CPUs to enter dynticks-idle state even if
10721 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
10722  config RCU_BOOST
10723         bool "Enable RCU priority boosting"
10724         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10725 -       default n
10726 +       default y if PREEMPT_RT_FULL
10727         help
10728           This option boosts the priority of preempted RCU readers that
10729           block the current preemptible RCU grace period for too long.
10730 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
10731  
10732  endchoice
10733  
10734 -config RCU_EXPEDITE_BOOT
10735 -       bool
10736 -       default n
10737 -       help
10738 -         This option enables expedited grace periods at boot time,
10739 -         as if rcu_expedite_gp() had been invoked early in boot.
10740 -         The corresponding rcu_unexpedite_gp() is invoked from
10741 -         rcu_end_inkernel_boot(), which is intended to be invoked
10742 -         at the end of the kernel-only boot sequence, just before
10743 -         init is exec'ed.
10744 -
10745 -         Accept the default if unsure.
10746 -
10747  endmenu # "RCU Subsystem"
10748  
10749  config BUILD_BIN2C
10750 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
10751  config RT_GROUP_SCHED
10752         bool "Group scheduling for SCHED_RR/FIFO"
10753         depends on CGROUP_SCHED
10754 +       depends on !PREEMPT_RT_FULL
10755         default n
10756         help
10757           This feature lets you explicitly allocate real CPU bandwidth
10758 @@ -1772,6 +1760,7 @@ choice
10759  
10760  config SLAB
10761         bool "SLAB"
10762 +       depends on !PREEMPT_RT_FULL
10763         select HAVE_HARDENED_USERCOPY_ALLOCATOR
10764         help
10765           The regular slab allocator that is established and known to work
10766 @@ -1792,6 +1781,7 @@ config SLUB
10767  config SLOB
10768         depends on EXPERT
10769         bool "SLOB (Simple Allocator)"
10770 +       depends on !PREEMPT_RT_FULL
10771         help
10772            SLOB replaces the stock allocator with a drastically simpler
10773            allocator. SLOB is generally more space efficient but
10774 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
10775  
10776  config SLUB_CPU_PARTIAL
10777         default y
10778 -       depends on SLUB && SMP
10779 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
10780         bool "SLUB per cpu partial cache"
10781         help
10782           Per cpu partial caches accellerate objects allocation and freeing
10783 diff --git a/init/Makefile b/init/Makefile
10784 index c4fb45525d08..821190dfaa75 100644
10785 --- a/init/Makefile
10786 +++ b/init/Makefile
10787 @@ -35,4 +35,4 @@ $(obj)/version.o: include/generated/compile.h
10788  include/generated/compile.h: FORCE
10789         @$($(quiet)chk_compile.h)
10790         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10791 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10792 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10793 diff --git a/init/main.c b/init/main.c
10794 index 2858be732f6d..3c97c3c91d88 100644
10795 --- a/init/main.c
10796 +++ b/init/main.c
10797 @@ -507,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
10798         setup_command_line(command_line);
10799         setup_nr_cpu_ids();
10800         setup_per_cpu_areas();
10801 +       softirq_early_init();
10802         boot_cpu_state_init();
10803         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10804  
10805 diff --git a/ipc/sem.c b/ipc/sem.c
10806 index 10b94bc59d4a..b8360eaacc7a 100644
10807 --- a/ipc/sem.c
10808 +++ b/ipc/sem.c
10809 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
10810  static void wake_up_sem_queue_prepare(struct list_head *pt,
10811                                 struct sem_queue *q, int error)
10812  {
10813 +#ifdef CONFIG_PREEMPT_RT_BASE
10814 +       struct task_struct *p = q->sleeper;
10815 +       get_task_struct(p);
10816 +       q->status = error;
10817 +       wake_up_process(p);
10818 +       put_task_struct(p);
10819 +#else
10820         if (list_empty(pt)) {
10821                 /*
10822                  * Hold preempt off so that we don't get preempted and have the
10823 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10824         q->pid = error;
10825  
10826         list_add_tail(&q->list, pt);
10827 +#endif
10828  }
10829  
10830  /**
10831 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10832   */
10833  static void wake_up_sem_queue_do(struct list_head *pt)
10834  {
10835 +#ifndef CONFIG_PREEMPT_RT_BASE
10836         struct sem_queue *q, *t;
10837         int did_something;
10838  
10839 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
10840         }
10841         if (did_something)
10842                 preempt_enable();
10843 +#endif
10844  }
10845  
10846  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10847 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
10848 index ebdb0043203a..b9e6aa7e5aa6 100644
10849 --- a/kernel/Kconfig.locks
10850 +++ b/kernel/Kconfig.locks
10851 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
10852  
10853  config MUTEX_SPIN_ON_OWNER
10854         def_bool y
10855 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
10856 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10857  
10858  config RWSEM_SPIN_ON_OWNER
10859         def_bool y
10860 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
10861 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10862  
10863  config LOCK_SPIN_ON_OWNER
10864         def_bool y
10865 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
10866 index 3f9c97419f02..11dbe26a8279 100644
10867 --- a/kernel/Kconfig.preempt
10868 +++ b/kernel/Kconfig.preempt
10869 @@ -1,3 +1,16 @@
10870 +config PREEMPT
10871 +       bool
10872 +       select PREEMPT_COUNT
10873 +
10874 +config PREEMPT_RT_BASE
10875 +       bool
10876 +       select PREEMPT
10877 +
10878 +config HAVE_PREEMPT_LAZY
10879 +       bool
10880 +
10881 +config PREEMPT_LAZY
10882 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
10883  
10884  choice
10885         prompt "Preemption Model"
10886 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
10887  
10888           Select this if you are building a kernel for a desktop system.
10889  
10890 -config PREEMPT
10891 +config PREEMPT__LL
10892         bool "Preemptible Kernel (Low-Latency Desktop)"
10893 -       select PREEMPT_COUNT
10894 +       select PREEMPT
10895         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
10896         help
10897           This option reduces the latency of the kernel by making
10898 @@ -52,6 +65,22 @@ config PREEMPT
10899           embedded system with latency requirements in the milliseconds
10900           range.
10901  
10902 +config PREEMPT_RTB
10903 +       bool "Preemptible Kernel (Basic RT)"
10904 +       select PREEMPT_RT_BASE
10905 +       help
10906 +         This option is basically the same as (Low-Latency Desktop) but
10907 +         enables changes which are preliminary for the full preemptible
10908 +         RT kernel.
10909 +
10910 +config PREEMPT_RT_FULL
10911 +       bool "Fully Preemptible Kernel (RT)"
10912 +       depends on IRQ_FORCED_THREADING
10913 +       select PREEMPT_RT_BASE
10914 +       select PREEMPT_RCU
10915 +       help
10916 +         All and everything required for a fully preemptible (RT) kernel.
10917 +
10918  endchoice
10919  
10920  config PREEMPT_COUNT
10921 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
10922 index 4e2f3de0e40b..6401eb5fe140 100644
10923 --- a/kernel/cgroup.c
10924 +++ b/kernel/cgroup.c
10925 @@ -5040,10 +5040,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
10926         queue_work(cgroup_destroy_wq, &css->destroy_work);
10927  }
10928  
10929 -static void css_release_work_fn(struct work_struct *work)
10930 +static void css_release_work_fn(struct swork_event *sev)
10931  {
10932         struct cgroup_subsys_state *css =
10933 -               container_of(work, struct cgroup_subsys_state, destroy_work);
10934 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
10935         struct cgroup_subsys *ss = css->ss;
10936         struct cgroup *cgrp = css->cgroup;
10937  
10938 @@ -5086,8 +5086,8 @@ static void css_release(struct percpu_ref *ref)
10939         struct cgroup_subsys_state *css =
10940                 container_of(ref, struct cgroup_subsys_state, refcnt);
10941  
10942 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
10943 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
10944 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10945 +       swork_queue(&css->destroy_swork);
10946  }
10947  
10948  static void init_and_link_css(struct cgroup_subsys_state *css,
10949 @@ -5739,6 +5739,7 @@ static int __init cgroup_wq_init(void)
10950          */
10951         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10952         BUG_ON(!cgroup_destroy_wq);
10953 +       BUG_ON(swork_get());
10954  
10955         /*
10956          * Used to destroy pidlists and separate to serve as flush domain.
10957 diff --git a/kernel/cpu.c b/kernel/cpu.c
10958 index 217fd2e7f435..c23676e58dfd 100644
10959 --- a/kernel/cpu.c
10960 +++ b/kernel/cpu.c
10961 @@ -239,6 +239,289 @@ static struct {
10962  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
10963  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
10964  
10965 +/**
10966 + * hotplug_pcp - per cpu hotplug descriptor
10967 + * @unplug:    set when pin_current_cpu() needs to sync tasks
10968 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
10969 + * @refcount:  counter of tasks in pinned sections
10970 + * @grab_lock: set when the tasks entering pinned sections should wait
10971 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
10972 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
10973 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
10974 + *
10975 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
10976 + * is used as a flag and still exists after @sync_tsk has exited and
10977 + * @sync_tsk set to NULL.
10978 + */
10979 +struct hotplug_pcp {
10980 +       struct task_struct *unplug;
10981 +       struct task_struct *sync_tsk;
10982 +       int refcount;
10983 +       int grab_lock;
10984 +       struct completion synced;
10985 +       struct completion unplug_wait;
10986 +#ifdef CONFIG_PREEMPT_RT_FULL
10987 +       /*
10988 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
10989 +        * the task, otherwise the mutex will cause the task to fail
10990 +        * to sleep when required. (Because it's called from migrate_disable())
10991 +        *
10992 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
10993 +        * state.
10994 +        */
10995 +       spinlock_t lock;
10996 +#else
10997 +       struct mutex mutex;
10998 +#endif
10999 +       int mutex_init;
11000 +};
11001 +
11002 +#ifdef CONFIG_PREEMPT_RT_FULL
11003 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
11004 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
11005 +#else
11006 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
11007 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
11008 +#endif
11009 +
11010 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
11011 +
11012 +/**
11013 + * pin_current_cpu - Prevent the current cpu from being unplugged
11014 + *
11015 + * Lightweight version of get_online_cpus() to prevent cpu from being
11016 + * unplugged when code runs in a migration disabled region.
11017 + *
11018 + * Must be called with preemption disabled (preempt_count = 1)!
11019 + */
11020 +void pin_current_cpu(void)
11021 +{
11022 +       struct hotplug_pcp *hp;
11023 +       int force = 0;
11024 +
11025 +retry:
11026 +       hp = this_cpu_ptr(&hotplug_pcp);
11027 +
11028 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
11029 +           hp->unplug == current) {
11030 +               hp->refcount++;
11031 +               return;
11032 +       }
11033 +       if (hp->grab_lock) {
11034 +               preempt_enable();
11035 +               hotplug_lock(hp);
11036 +               hotplug_unlock(hp);
11037 +       } else {
11038 +               preempt_enable();
11039 +               /*
11040 +                * Try to push this task off of this CPU.
11041 +                */
11042 +               if (!migrate_me()) {
11043 +                       preempt_disable();
11044 +                       hp = this_cpu_ptr(&hotplug_pcp);
11045 +                       if (!hp->grab_lock) {
11046 +                               /*
11047 +                                * Just let it continue; it's already pinned
11048 +                                * or about to sleep.
11049 +                                */
11050 +                               force = 1;
11051 +                               goto retry;
11052 +                       }
11053 +                       preempt_enable();
11054 +               }
11055 +       }
11056 +       preempt_disable();
11057 +       goto retry;
11058 +}
11059 +
11060 +/**
11061 + * unpin_current_cpu - Allow unplug of current cpu
11062 + *
11063 + * Must be called with preemption or interrupts disabled!
11064 + */
11065 +void unpin_current_cpu(void)
11066 +{
11067 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
11068 +
11069 +       WARN_ON(hp->refcount <= 0);
11070 +
11071 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
11072 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
11073 +               wake_up_process(hp->unplug);
11074 +}
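
/*
 * Illustrative sketch (hypothetical helpers): a simplified view of how
 * migrate_disable()/migrate_enable() use the pair above; the per-task
 * migrate_disable accounting and scheduler details are omitted.
 */
static void ex_migrate_disable(void)            /* hypothetical, simplified */
{
        preempt_disable();
        pin_current_cpu();              /* keep this CPU from being unplugged */
        /* ... mark the task migration-disabled ... */
        preempt_enable();
}

static void ex_migrate_enable(void)             /* hypothetical, simplified */
{
        preempt_disable();
        /* ... clear the task's migration-disabled state ... */
        unpin_current_cpu();            /* CPU may be unplugged again */
        preempt_enable();
}
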
11075 +
11076 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
11077 +{
11078 +       set_current_state(TASK_UNINTERRUPTIBLE);
11079 +       while (hp->refcount) {
11080 +               schedule_preempt_disabled();
11081 +               set_current_state(TASK_UNINTERRUPTIBLE);
11082 +       }
11083 +}
11084 +
11085 +static int sync_unplug_thread(void *data)
11086 +{
11087 +       struct hotplug_pcp *hp = data;
11088 +
11089 +       wait_for_completion(&hp->unplug_wait);
11090 +       preempt_disable();
11091 +       hp->unplug = current;
11092 +       wait_for_pinned_cpus(hp);
11093 +
11094 +       /*
11095 +        * This thread will synchronize the cpu_down() with threads
11096 +        * that have pinned the CPU. When the pinned CPU count reaches
11097 +        * zero, we inform the cpu_down code to continue to the next step.
11098 +        */
11099 +       set_current_state(TASK_UNINTERRUPTIBLE);
11100 +       preempt_enable();
11101 +       complete(&hp->synced);
11102 +
11103 +       /*
11104 +        * If all goes well, the next step will need tasks to wait until
11105 +        * the CPU is offline before continuing. To do this, grab_lock
11106 +        * is set and tasks going into pin_current_cpu() will block on the
11107 +        * mutex. But we still need to wait for those that are already in
11108 +        * pinned CPU sections. If cpu_down() fails, kthread_should_stop()
11109 +        * will kick this thread out.
11110 +        */
11111 +       while (!hp->grab_lock && !kthread_should_stop()) {
11112 +               schedule();
11113 +               set_current_state(TASK_UNINTERRUPTIBLE);
11114 +       }
11115 +
11116 +       /* Make sure grab_lock is seen before we see a stale completion */
11117 +       smp_mb();
11118 +
11119 +       /*
11120 +        * Now just before cpu_down() enters stop machine, we need to make
11121 +        * sure all tasks that are in pinned CPU sections are out, and new
11122 +        * tasks will now grab the lock, keeping them from entering pinned
11123 +        * CPU sections.
11124 +        */
11125 +       if (!kthread_should_stop()) {
11126 +               preempt_disable();
11127 +               wait_for_pinned_cpus(hp);
11128 +               preempt_enable();
11129 +               complete(&hp->synced);
11130 +       }
11131 +
11132 +       set_current_state(TASK_UNINTERRUPTIBLE);
11133 +       while (!kthread_should_stop()) {
11134 +               schedule();
11135 +               set_current_state(TASK_UNINTERRUPTIBLE);
11136 +       }
11137 +       set_current_state(TASK_RUNNING);
11138 +
11139 +       /*
11140 +        * Force this thread off this CPU as it's going down and
11141 +        * we don't want any more work on this CPU.
11142 +        */
11143 +       current->flags &= ~PF_NO_SETAFFINITY;
11144 +       set_cpus_allowed_ptr(current, cpu_present_mask);
11145 +       migrate_me();
11146 +       return 0;
11147 +}
11148 +
11149 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
11150 +{
11151 +       wake_up_process(hp->sync_tsk);
11152 +       wait_for_completion(&hp->synced);
11153 +}
11154 +
11155 +static void __cpu_unplug_wait(unsigned int cpu)
11156 +{
11157 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11158 +
11159 +       complete(&hp->unplug_wait);
11160 +       wait_for_completion(&hp->synced);
11161 +}
11162 +
11163 +/*
11164 + * Start the sync_unplug_thread on the target cpu and wait for it to
11165 + * complete.
11166 + */
11167 +static int cpu_unplug_begin(unsigned int cpu)
11168 +{
11169 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11170 +       int err;
11171 +
11172 +       /* Protected by cpu_hotplug.lock */
11173 +       if (!hp->mutex_init) {
11174 +#ifdef CONFIG_PREEMPT_RT_FULL
11175 +               spin_lock_init(&hp->lock);
11176 +#else
11177 +               mutex_init(&hp->mutex);
11178 +#endif
11179 +               hp->mutex_init = 1;
11180 +       }
11181 +
11182 +       /* Inform the scheduler to migrate tasks off this CPU */
11183 +       tell_sched_cpu_down_begin(cpu);
11184 +
11185 +       init_completion(&hp->synced);
11186 +       init_completion(&hp->unplug_wait);
11187 +
11188 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11189 +       if (IS_ERR(hp->sync_tsk)) {
11190 +               err = PTR_ERR(hp->sync_tsk);
11191 +               hp->sync_tsk = NULL;
11192 +               return err;
11193 +       }
11194 +       kthread_bind(hp->sync_tsk, cpu);
11195 +
11196 +       /*
11197 +        * Wait for tasks to get out of the pinned sections,
11198 +        * it's still OK if new tasks enter. Some CPU notifiers will
11199 +        * wait for tasks that are going to enter these sections and
11200 +        * we must not have them block.
11201 +        */
11202 +       wake_up_process(hp->sync_tsk);
11203 +       return 0;
11204 +}
11205 +
11206 +static void cpu_unplug_sync(unsigned int cpu)
11207 +{
11208 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11209 +
11210 +       init_completion(&hp->synced);
11211 +       /* The completion needs to be initialized before setting grab_lock */
11212 +       smp_wmb();
11213 +
11214 +       /* Grab the mutex before setting grab_lock */
11215 +       hotplug_lock(hp);
11216 +       hp->grab_lock = 1;
11217 +
11218 +       /*
11219 +        * The CPU notifiers have completed.
11220 +        * Wait for tasks to get out of pinned CPU sections and have new
11221 +        * tasks block until the CPU is completely down.
11222 +        */
11223 +       __cpu_unplug_sync(hp);
11224 +
11225 +       /* All done with the sync thread */
11226 +       kthread_stop(hp->sync_tsk);
11227 +       hp->sync_tsk = NULL;
11228 +}
11229 +
11230 +static void cpu_unplug_done(unsigned int cpu)
11231 +{
11232 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11233 +
11234 +       hp->unplug = NULL;
11235 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11236 +       smp_wmb();
11237 +
11238 +       if (hp->sync_tsk)
11239 +               kthread_stop(hp->sync_tsk);
11240 +
11241 +       if (hp->grab_lock) {
11242 +               hotplug_unlock(hp);
11243 +               /* protected by cpu_hotplug.lock */
11244 +               hp->grab_lock = 0;
11245 +       }
11246 +       tell_sched_cpu_down_done(cpu);
11247 +}
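Read together with the call sites in the hunks further down, the helpers above form one handshake around cpu_down(); the condensed sketch below is only a map of that ordering (the function name is made up and error handling is dropped), not a replacement for the real takedown_cpu()/_cpu_down() changes.

/* Condensed map of the unplug handshake; see the real call sites in the
 * takedown_cpu() and _cpu_down() hunks below. Error handling omitted. */
static int example_cpu_down_path(unsigned int cpu)
{
        int ret;

        ret = cpu_unplug_begin(cpu);    /* start sync_unplug/<cpu>, hint the scheduler */
        if (ret)
                return ret;

        __cpu_unplug_wait(cpu);         /* wait for pinned sections to drain */
        /* ... park smpboot threads, run the hotplug state machine ... */
        cpu_unplug_sync(cpu);           /* set grab_lock: new pinners now block */

        /* ... stop_machine() actually takes the CPU down ... */

        cpu_unplug_done(cpu);           /* clear unplug state, release the lock */
        return 0;
}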
11248  
11249  void get_online_cpus(void)
11250  {
11251 @@ -789,10 +1072,14 @@ static int takedown_cpu(unsigned int cpu)
11252         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11253         int err;
11254  
11255 +       __cpu_unplug_wait(cpu);
11256         /* Park the smpboot threads */
11257         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11258         smpboot_park_threads(cpu);
11259  
11260 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11261 +       cpu_unplug_sync(cpu);
11262 +
11263         /*
11264          * Prevent irq alloc/free while the dying cpu reorganizes the
11265          * interrupt affinities.
11266 @@ -877,6 +1164,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11267         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11268         int prev_state, ret = 0;
11269         bool hasdied = false;
11270 +       int mycpu;
11271 +       cpumask_var_t cpumask;
11272 +       cpumask_var_t cpumask_org;
11273  
11274         if (num_online_cpus() == 1)
11275                 return -EBUSY;
11276 @@ -884,7 +1174,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11277         if (!cpu_present(cpu))
11278                 return -EINVAL;
11279  
11280 +       /* Move the downtaker off the unplug cpu */
11281 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11282 +               return -ENOMEM;
11283 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11284 +               free_cpumask_var(cpumask);
11285 +               return -ENOMEM;
11286 +       }
11287 +
11288 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11289 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11290 +       set_cpus_allowed_ptr(current, cpumask);
11291 +       free_cpumask_var(cpumask);
11292 +       migrate_disable();
11293 +       mycpu = smp_processor_id();
11294 +       if (mycpu == cpu) {
11295 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11296 +               migrate_enable();
11297 +               ret = -EBUSY;
11298 +               goto restore_cpus;
11299 +       }
11300 +
11301 +       migrate_enable();
11302         cpu_hotplug_begin();
11303 +       ret = cpu_unplug_begin(cpu);
11304 +       if (ret) {
11305 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11306 +               goto out_cancel;
11307 +       }
11308  
11309         cpuhp_tasks_frozen = tasks_frozen;
11310  
11311 @@ -923,10 +1240,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11312  
11313         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11314  out:
11315 +       cpu_unplug_done(cpu);
11316 +out_cancel:
11317         cpu_hotplug_done();
11318         /* This post dead nonsense must die */
11319         if (!ret && hasdied)
11320                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11321 +restore_cpus:
11322 +       set_cpus_allowed_ptr(current, cpumask_org);
11323 +       free_cpumask_var(cpumask_org);
11324         return ret;
11325  }
11326  
11327 @@ -1240,6 +1562,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
11328  
11329  #endif /* CONFIG_PM_SLEEP_SMP */
11330  
11331 +int __boot_cpu_id;
11332 +
11333  #endif /* CONFIG_SMP */
11334  
11335  /* Boot processor state steps */
11336 @@ -1923,6 +2247,10 @@ void __init boot_cpu_init(void)
11337         set_cpu_active(cpu, true);
11338         set_cpu_present(cpu, true);
11339         set_cpu_possible(cpu, true);
11340 +
11341 +#ifdef CONFIG_SMP
11342 +       __boot_cpu_id = cpu;
11343 +#endif
11344  }
11345  
11346  /*
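The new __boot_cpu_id above records which CPU did the early boot, so later code does not have to assume it was CPU 0. A minimal consumer could look like the sketch below; the helper name on_boot_cpu() is made up for illustration and is not part of the patch.

#include <linux/smp.h>

extern int __boot_cpu_id;       /* set once in boot_cpu_init() above */

/* Illustrative consumer of __boot_cpu_id; not part of the patch. */
static inline bool on_boot_cpu(void)
{
        /* raw_smp_processor_id() avoids the preemption check; callers that
         * care about racing with migration must pin themselves first. */
        return raw_smp_processor_id() == __boot_cpu_id;
}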
11347 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
11348 index 29f815d2ef7e..341b17f24f95 100644
11349 --- a/kernel/cpuset.c
11350 +++ b/kernel/cpuset.c
11351 @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
11352   */
11353  
11354  static DEFINE_MUTEX(cpuset_mutex);
11355 -static DEFINE_SPINLOCK(callback_lock);
11356 +static DEFINE_RAW_SPINLOCK(callback_lock);
11357  
11358  static struct workqueue_struct *cpuset_migrate_mm_wq;
11359  
11360 @@ -907,9 +907,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
11361                         continue;
11362                 rcu_read_unlock();
11363  
11364 -               spin_lock_irq(&callback_lock);
11365 +               raw_spin_lock_irq(&callback_lock);
11366                 cpumask_copy(cp->effective_cpus, new_cpus);
11367 -               spin_unlock_irq(&callback_lock);
11368 +               raw_spin_unlock_irq(&callback_lock);
11369  
11370                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11371                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11372 @@ -974,9 +974,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
11373         if (retval < 0)
11374                 return retval;
11375  
11376 -       spin_lock_irq(&callback_lock);
11377 +       raw_spin_lock_irq(&callback_lock);
11378         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11379 -       spin_unlock_irq(&callback_lock);
11380 +       raw_spin_unlock_irq(&callback_lock);
11381  
11382         /* use trialcs->cpus_allowed as a temp variable */
11383         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11384 @@ -1176,9 +1176,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
11385                         continue;
11386                 rcu_read_unlock();
11387  
11388 -               spin_lock_irq(&callback_lock);
11389 +               raw_spin_lock_irq(&callback_lock);
11390                 cp->effective_mems = *new_mems;
11391 -               spin_unlock_irq(&callback_lock);
11392 +               raw_spin_unlock_irq(&callback_lock);
11393  
11394                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11395                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11396 @@ -1246,9 +1246,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
11397         if (retval < 0)
11398                 goto done;
11399  
11400 -       spin_lock_irq(&callback_lock);
11401 +       raw_spin_lock_irq(&callback_lock);
11402         cs->mems_allowed = trialcs->mems_allowed;
11403 -       spin_unlock_irq(&callback_lock);
11404 +       raw_spin_unlock_irq(&callback_lock);
11405  
11406         /* use trialcs->mems_allowed as a temp variable */
11407         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11408 @@ -1339,9 +1339,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
11409         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11410                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11411  
11412 -       spin_lock_irq(&callback_lock);
11413 +       raw_spin_lock_irq(&callback_lock);
11414         cs->flags = trialcs->flags;
11415 -       spin_unlock_irq(&callback_lock);
11416 +       raw_spin_unlock_irq(&callback_lock);
11417  
11418         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11419                 rebuild_sched_domains_locked();
11420 @@ -1756,7 +1756,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11421         cpuset_filetype_t type = seq_cft(sf)->private;
11422         int ret = 0;
11423  
11424 -       spin_lock_irq(&callback_lock);
11425 +       raw_spin_lock_irq(&callback_lock);
11426  
11427         switch (type) {
11428         case FILE_CPULIST:
11429 @@ -1775,7 +1775,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11430                 ret = -EINVAL;
11431         }
11432  
11433 -       spin_unlock_irq(&callback_lock);
11434 +       raw_spin_unlock_irq(&callback_lock);
11435         return ret;
11436  }
11437  
11438 @@ -1989,12 +1989,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11439  
11440         cpuset_inc();
11441  
11442 -       spin_lock_irq(&callback_lock);
11443 +       raw_spin_lock_irq(&callback_lock);
11444         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11445                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
11446                 cs->effective_mems = parent->effective_mems;
11447         }
11448 -       spin_unlock_irq(&callback_lock);
11449 +       raw_spin_unlock_irq(&callback_lock);
11450  
11451         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
11452                 goto out_unlock;
11453 @@ -2021,12 +2021,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11454         }
11455         rcu_read_unlock();
11456  
11457 -       spin_lock_irq(&callback_lock);
11458 +       raw_spin_lock_irq(&callback_lock);
11459         cs->mems_allowed = parent->mems_allowed;
11460         cs->effective_mems = parent->mems_allowed;
11461         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
11462         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
11463 -       spin_unlock_irq(&callback_lock);
11464 +       raw_spin_unlock_irq(&callback_lock);
11465  out_unlock:
11466         mutex_unlock(&cpuset_mutex);
11467         return 0;
11468 @@ -2065,7 +2065,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
11469  static void cpuset_bind(struct cgroup_subsys_state *root_css)
11470  {
11471         mutex_lock(&cpuset_mutex);
11472 -       spin_lock_irq(&callback_lock);
11473 +       raw_spin_lock_irq(&callback_lock);
11474  
11475         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11476                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
11477 @@ -2076,7 +2076,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
11478                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
11479         }
11480  
11481 -       spin_unlock_irq(&callback_lock);
11482 +       raw_spin_unlock_irq(&callback_lock);
11483         mutex_unlock(&cpuset_mutex);
11484  }
11485  
11486 @@ -2177,12 +2177,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
11487  {
11488         bool is_empty;
11489  
11490 -       spin_lock_irq(&callback_lock);
11491 +       raw_spin_lock_irq(&callback_lock);
11492         cpumask_copy(cs->cpus_allowed, new_cpus);
11493         cpumask_copy(cs->effective_cpus, new_cpus);
11494         cs->mems_allowed = *new_mems;
11495         cs->effective_mems = *new_mems;
11496 -       spin_unlock_irq(&callback_lock);
11497 +       raw_spin_unlock_irq(&callback_lock);
11498  
11499         /*
11500          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
11501 @@ -2219,10 +2219,10 @@ hotplug_update_tasks(struct cpuset *cs,
11502         if (nodes_empty(*new_mems))
11503                 *new_mems = parent_cs(cs)->effective_mems;
11504  
11505 -       spin_lock_irq(&callback_lock);
11506 +       raw_spin_lock_irq(&callback_lock);
11507         cpumask_copy(cs->effective_cpus, new_cpus);
11508         cs->effective_mems = *new_mems;
11509 -       spin_unlock_irq(&callback_lock);
11510 +       raw_spin_unlock_irq(&callback_lock);
11511  
11512         if (cpus_updated)
11513                 update_tasks_cpumask(cs);
11514 @@ -2308,21 +2308,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
11515  
11516         /* synchronize cpus_allowed to cpu_active_mask */
11517         if (cpus_updated) {
11518 -               spin_lock_irq(&callback_lock);
11519 +               raw_spin_lock_irq(&callback_lock);
11520                 if (!on_dfl)
11521                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
11522                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
11523 -               spin_unlock_irq(&callback_lock);
11524 +               raw_spin_unlock_irq(&callback_lock);
11525                 /* we don't mess with cpumasks of tasks in top_cpuset */
11526         }
11527  
11528         /* synchronize mems_allowed to N_MEMORY */
11529         if (mems_updated) {
11530 -               spin_lock_irq(&callback_lock);
11531 +               raw_spin_lock_irq(&callback_lock);
11532                 if (!on_dfl)
11533                         top_cpuset.mems_allowed = new_mems;
11534                 top_cpuset.effective_mems = new_mems;
11535 -               spin_unlock_irq(&callback_lock);
11536 +               raw_spin_unlock_irq(&callback_lock);
11537                 update_tasks_nodemask(&top_cpuset);
11538         }
11539  
11540 @@ -2420,11 +2420,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
11541  {
11542         unsigned long flags;
11543  
11544 -       spin_lock_irqsave(&callback_lock, flags);
11545 +       raw_spin_lock_irqsave(&callback_lock, flags);
11546         rcu_read_lock();
11547         guarantee_online_cpus(task_cs(tsk), pmask);
11548         rcu_read_unlock();
11549 -       spin_unlock_irqrestore(&callback_lock, flags);
11550 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11551  }
11552  
11553  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
11554 @@ -2472,11 +2472,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
11555         nodemask_t mask;
11556         unsigned long flags;
11557  
11558 -       spin_lock_irqsave(&callback_lock, flags);
11559 +       raw_spin_lock_irqsave(&callback_lock, flags);
11560         rcu_read_lock();
11561         guarantee_online_mems(task_cs(tsk), &mask);
11562         rcu_read_unlock();
11563 -       spin_unlock_irqrestore(&callback_lock, flags);
11564 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11565  
11566         return mask;
11567  }
11568 @@ -2568,14 +2568,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
11569                 return true;
11570  
11571         /* Not hardwall and node outside mems_allowed: scan up cpusets */
11572 -       spin_lock_irqsave(&callback_lock, flags);
11573 +       raw_spin_lock_irqsave(&callback_lock, flags);
11574  
11575         rcu_read_lock();
11576         cs = nearest_hardwall_ancestor(task_cs(current));
11577         allowed = node_isset(node, cs->mems_allowed);
11578         rcu_read_unlock();
11579  
11580 -       spin_unlock_irqrestore(&callback_lock, flags);
11581 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11582         return allowed;
11583  }
11584  
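The cpuset.c changes above are all one pattern: on PREEMPT_RT a spinlock_t turns into a sleeping lock, so a lock that is taken from contexts which must not sleep has to be declared raw. A small, self-contained illustration of that pattern follows; my_lock and my_counter are placeholders, not names from the kernel.

#include <linux/spinlock.h>

/* raw_spinlock_t keeps spinning (with preemption/IRQs off) even on
 * PREEMPT_RT, so it stays usable in atomic context; keep such
 * critical sections short. */
static DEFINE_RAW_SPINLOCK(my_lock);
static int my_counter;

static void bump_from_atomic_context(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_lock, flags);
        my_counter++;
        raw_spin_unlock_irqrestore(&my_lock, flags);
}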
11585 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
11586 index fc1ef736253c..83c666537a7a 100644
11587 --- a/kernel/debug/kdb/kdb_io.c
11588 +++ b/kernel/debug/kdb/kdb_io.c
11589 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11590         int linecount;
11591         int colcount;
11592         int logging, saved_loglevel = 0;
11593 -       int saved_trap_printk;
11594         int got_printf_lock = 0;
11595         int retlen = 0;
11596         int fnd, len;
11597 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11598         unsigned long uninitialized_var(flags);
11599  
11600         preempt_disable();
11601 -       saved_trap_printk = kdb_trap_printk;
11602 -       kdb_trap_printk = 0;
11603  
11604         /* Serialize kdb_printf if multiple cpus try to write at once.
11605          * But if any cpu goes recursive in kdb, just print the output,
11606 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11607         } else {
11608                 __release(kdb_printf_lock);
11609         }
11610 -       kdb_trap_printk = saved_trap_printk;
11611         preempt_enable();
11612         return retlen;
11613  }
11614 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
11615         va_list ap;
11616         int r;
11617  
11618 +       kdb_trap_printk++;
11619         va_start(ap, fmt);
11620         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
11621         va_end(ap);
11622 +       kdb_trap_printk--;
11623  
11624         return r;
11625  }
11626 diff --git a/kernel/events/core.c b/kernel/events/core.c
11627 index 07c0dc806dfc..baf1a2867d74 100644
11628 --- a/kernel/events/core.c
11629 +++ b/kernel/events/core.c
11630 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
11631         raw_spin_lock_init(&cpuctx->hrtimer_lock);
11632         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
11633         timer->function = perf_mux_hrtimer_handler;
11634 +       timer->irqsafe = 1;
11635  }
11636  
11637  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
11638 @@ -8363,6 +8364,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
11639  
11640         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11641         hwc->hrtimer.function = perf_swevent_hrtimer;
11642 +       hwc->hrtimer.irqsafe = 1;
11643  
11644         /*
11645          * Since hrtimers have a fixed rate, we can do a static freq->period
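On PREEMPT_RT most hrtimer callbacks are pushed out of hard interrupt context; the two hunks above mark the perf timers with the irqsafe flag (a struct hrtimer field introduced elsewhere in this patch) so they keep firing from hard IRQ context. A minimal sketch of the same pattern with a placeholder timer and callback:

#include <linux/hrtimer.h>

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
        /* Runs in hard IRQ context because of ->irqsafe below, so it must
         * not take sleeping locks (no spinlock_t on RT, no mutexes). */
        return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        my_timer.function = my_timer_fn;
        my_timer.irqsafe = 1;   /* field only exists with this patch applied */
}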
11646 diff --git a/kernel/exit.c b/kernel/exit.c
11647 index 3076f3089919..fb2ebcf3ca7c 100644
11648 --- a/kernel/exit.c
11649 +++ b/kernel/exit.c
11650 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
11651          * Do this under ->siglock, we can race with another thread
11652          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
11653          */
11654 -       flush_sigqueue(&tsk->pending);
11655 +       flush_task_sigqueue(tsk);
11656         tsk->sighand = NULL;
11657         spin_unlock(&sighand->siglock);
11658  
11659 diff --git a/kernel/fork.c b/kernel/fork.c
11660 index ba8a01564985..47784f8aed37 100644
11661 --- a/kernel/fork.c
11662 +++ b/kernel/fork.c
11663 @@ -76,6 +76,7 @@
11664  #include <linux/compiler.h>
11665  #include <linux/sysctl.h>
11666  #include <linux/kcov.h>
11667 +#include <linux/kprobes.h>
11668  
11669  #include <asm/pgtable.h>
11670  #include <asm/pgalloc.h>
11671 @@ -376,13 +377,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
11672         if (atomic_dec_and_test(&sig->sigcnt))
11673                 free_signal_struct(sig);
11674  }
11675 -
11676 +#ifdef CONFIG_PREEMPT_RT_BASE
11677 +static
11678 +#endif
11679  void __put_task_struct(struct task_struct *tsk)
11680  {
11681         WARN_ON(!tsk->exit_state);
11682         WARN_ON(atomic_read(&tsk->usage));
11683         WARN_ON(tsk == current);
11684  
11685 +       /*
11686 +        * Remove function-return probe instances associated with this
11687 +        * task and put them back on the free list.
11688 +        */
11689 +       kprobe_flush_task(tsk);
11690 +
11691 +       /* Task is done with its stack. */
11692 +       put_task_stack(tsk);
11693 +
11694         cgroup_free(tsk);
11695         task_numa_free(tsk);
11696         security_task_free(tsk);
11697 @@ -393,7 +405,18 @@ void __put_task_struct(struct task_struct *tsk)
11698         if (!profile_handoff_task(tsk))
11699                 free_task(tsk);
11700  }
11701 +#ifndef CONFIG_PREEMPT_RT_BASE
11702  EXPORT_SYMBOL_GPL(__put_task_struct);
11703 +#else
11704 +void __put_task_struct_cb(struct rcu_head *rhp)
11705 +{
11706 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
11707 +
11708 +       __put_task_struct(tsk);
11709 +
11710 +}
11711 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
11712 +#endif
11713  
11714  void __init __weak arch_task_cache_init(void) { }
11715  
11716 @@ -852,6 +875,19 @@ void __mmdrop(struct mm_struct *mm)
11717  }
11718  EXPORT_SYMBOL_GPL(__mmdrop);
11719  
11720 +#ifdef CONFIG_PREEMPT_RT_BASE
11721 +/*
11722 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
11723 + * want another facility to make this work.
11724 + */
11725 +void __mmdrop_delayed(struct rcu_head *rhp)
11726 +{
11727 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
11728 +
11729 +       __mmdrop(mm);
11730 +}
11731 +#endif
11732 +
11733  static inline void __mmput(struct mm_struct *mm)
11734  {
11735         VM_BUG_ON(atomic_read(&mm->mm_users));
11736 @@ -1426,6 +1462,9 @@ static void rt_mutex_init_task(struct task_struct *p)
11737   */
11738  static void posix_cpu_timers_init(struct task_struct *tsk)
11739  {
11740 +#ifdef CONFIG_PREEMPT_RT_BASE
11741 +       tsk->posix_timer_list = NULL;
11742 +#endif
11743         tsk->cputime_expires.prof_exp = 0;
11744         tsk->cputime_expires.virt_exp = 0;
11745         tsk->cputime_expires.sched_exp = 0;
11746 @@ -1552,6 +1591,7 @@ static __latent_entropy struct task_struct *copy_process(
11747         spin_lock_init(&p->alloc_lock);
11748  
11749         init_sigpending(&p->pending);
11750 +       p->sigqueue_cache = NULL;
11751  
11752         p->utime = p->stime = p->gtime = 0;
11753         p->utimescaled = p->stimescaled = 0;
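The CONFIG_PREEMPT_RT_BASE additions above (the now-static __put_task_struct(), __put_task_struct_cb() and __mmdrop_delayed()) exist so the final frees can be deferred through RCU instead of running from atomic context. Below is a sketch of the put side that would pair with __put_task_struct_cb(); the real definition lives in a header hunk elsewhere in this patch, and the helper name here is illustrative only.

/* Illustrative RT put-side: defer the free through RCU so it never runs
 * from a context that cannot sleep. Pairs with __put_task_struct_cb(). */
static inline void example_put_task_struct(struct task_struct *t)
{
        if (atomic_dec_and_test(&t->usage))
                call_rcu(&t->put_rcu, __put_task_struct_cb);
}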
11754 diff --git a/kernel/futex.c b/kernel/futex.c
11755 index 4c6b6e697b73..a01d203939cb 100644
11756 --- a/kernel/futex.c
11757 +++ b/kernel/futex.c
11758 @@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
11759         return 0;
11760  }
11761  
11762 -static struct futex_pi_state * alloc_pi_state(void)
11763 +static struct futex_pi_state *alloc_pi_state(void)
11764  {
11765         struct futex_pi_state *pi_state = current->pi_state_cache;
11766  
11767 @@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_state(void)
11768         return pi_state;
11769  }
11770  
11771 +static void get_pi_state(struct futex_pi_state *pi_state)
11772 +{
11773 +       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
11774 +}
11775 +
11776  /*
11777   * Drops a reference to the pi_state object and frees or caches it
11778   * when the last reference is gone.
11779 @@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
11780   * Look up the task based on what TID userspace gave us.
11781   * We dont trust it.
11782   */
11783 -static struct task_struct * futex_find_get_task(pid_t pid)
11784 +static struct task_struct *futex_find_get_task(pid_t pid)
11785  {
11786         struct task_struct *p;
11787  
11788 @@ -904,7 +909,9 @@ void exit_pi_state_list(struct task_struct *curr)
11789                  * task still owns the PI-state:
11790                  */
11791                 if (head->next != next) {
11792 +                       raw_spin_unlock_irq(&curr->pi_lock);
11793                         spin_unlock(&hb->lock);
11794 +                       raw_spin_lock_irq(&curr->pi_lock);
11795                         continue;
11796                 }
11797  
11798 @@ -914,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
11799                 pi_state->owner = NULL;
11800                 raw_spin_unlock_irq(&curr->pi_lock);
11801  
11802 -               rt_mutex_unlock(&pi_state->pi_mutex);
11803 -
11804 +               get_pi_state(pi_state);
11805                 spin_unlock(&hb->lock);
11806  
11807 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
11808 +               put_pi_state(pi_state);
11809 +
11810                 raw_spin_lock_irq(&curr->pi_lock);
11811         }
11812         raw_spin_unlock_irq(&curr->pi_lock);
11813 @@ -971,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
11814   *
11815   * [10] There is no transient state which leaves owner and user space
11816   *     TID out of sync.
11817 + *
11818 + *
11819 + * Serialization and lifetime rules:
11820 + *
11821 + * hb->lock:
11822 + *
11823 + *     hb -> futex_q, relation
11824 + *     futex_q -> pi_state, relation
11825 + *
11826 + *     (cannot be raw because hb can contain an arbitrary
11827 + *      number of futex_q's)
11828 + *
11829 + * pi_mutex->wait_lock:
11830 + *
11831 + *     {uval, pi_state}
11832 + *
11833 + *     (and pi_mutex 'obviously')
11834 + *
11835 + * p->pi_lock:
11836 + *
11837 + *     p->pi_state_list -> pi_state->list, relation
11838 + *
11839 + * pi_state->refcount:
11840 + *
11841 + *     pi_state lifetime
11842 + *
11843 + *
11844 + * Lock order:
11845 + *
11846 + *   hb->lock
11847 + *     pi_mutex->wait_lock
11848 + *       p->pi_lock
11849 + *
11850   */
11851  
11852  /*
11853 @@ -978,10 +1020,12 @@ void exit_pi_state_list(struct task_struct *curr)
11854   * the pi_state against the user space value. If correct, attach to
11855   * it.
11856   */
11857 -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11858 +static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
11859 +                             struct futex_pi_state *pi_state,
11860                               struct futex_pi_state **ps)
11861  {
11862         pid_t pid = uval & FUTEX_TID_MASK;
11863 +       int ret, uval2;
11864  
11865         /*
11866          * Userspace might have messed up non-PI and PI futexes [3]
11867 @@ -989,9 +1033,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11868         if (unlikely(!pi_state))
11869                 return -EINVAL;
11870  
11871 +       /*
11872 +        * We get here with hb->lock held, and having found a
11873 +        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
11874 +        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
11875 +        * which in turn means that futex_lock_pi() still has a reference on
11876 +        * our pi_state.
11877 +        *
11878 +        * The waiter holding a reference on @pi_state also protects against
11879 +        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
11880 +        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
11881 +        * free pi_state before we can take a reference ourselves.
11882 +        */
11883         WARN_ON(!atomic_read(&pi_state->refcount));
11884  
11885         /*
11886 +        * Now that we have a pi_state, we can acquire wait_lock
11887 +        * and do the state validation.
11888 +        */
11889 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
11890 +
11891 +       /*
11892 +        * Since {uval, pi_state} is serialized by wait_lock, and our current
11893 +        * uval was read without holding it, it can have changed. Verify it
11894 +        * still is what we expect it to be, otherwise retry the entire
11895 +        * operation.
11896 +        */
11897 +       if (get_futex_value_locked(&uval2, uaddr))
11898 +               goto out_efault;
11899 +
11900 +       if (uval != uval2)
11901 +               goto out_eagain;
11902 +
11903 +       /*
11904          * Handle the owner died case:
11905          */
11906         if (uval & FUTEX_OWNER_DIED) {
11907 @@ -1006,11 +1080,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11908                          * is not 0. Inconsistent state. [5]
11909                          */
11910                         if (pid)
11911 -                               return -EINVAL;
11912 +                               goto out_einval;
11913                         /*
11914                          * Take a ref on the state and return success. [4]
11915                          */
11916 -                       goto out_state;
11917 +                       goto out_attach;
11918                 }
11919  
11920                 /*
11921 @@ -1022,14 +1096,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11922                  * Take a ref on the state and return success. [6]
11923                  */
11924                 if (!pid)
11925 -                       goto out_state;
11926 +                       goto out_attach;
11927         } else {
11928                 /*
11929                  * If the owner died bit is not set, then the pi_state
11930                  * must have an owner. [7]
11931                  */
11932                 if (!pi_state->owner)
11933 -                       return -EINVAL;
11934 +                       goto out_einval;
11935         }
11936  
11937         /*
11938 @@ -1038,11 +1112,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11939          * user space TID. [9/10]
11940          */
11941         if (pid != task_pid_vnr(pi_state->owner))
11942 -               return -EINVAL;
11943 -out_state:
11944 -       atomic_inc(&pi_state->refcount);
11945 +               goto out_einval;
11946 +
11947 +out_attach:
11948 +       get_pi_state(pi_state);
11949 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11950         *ps = pi_state;
11951         return 0;
11952 +
11953 +out_einval:
11954 +       ret = -EINVAL;
11955 +       goto out_error;
11956 +
11957 +out_eagain:
11958 +       ret = -EAGAIN;
11959 +       goto out_error;
11960 +
11961 +out_efault:
11962 +       ret = -EFAULT;
11963 +       goto out_error;
11964 +
11965 +out_error:
11966 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11967 +       return ret;
11968  }
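The validation added above is also where the lock nesting from the serialization comment a few hunks up becomes visible: hb->lock outermost, then pi_mutex->wait_lock, then the owner's pi_lock. A compressed sketch of that nesting; the function is illustrative and the three pointers stand in for whatever the caller already holds references to.

/* Illustrative only: the nesting mandated by the lock-order comment above. */
static void example_lock_nesting(struct futex_hash_bucket *hb,
                                 struct futex_pi_state *pi_state,
                                 struct task_struct *owner)
{
        spin_lock(&hb->lock);                                   /* outermost */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);       /* {uval, pi_state} */
        raw_spin_lock(&owner->pi_lock);                         /* innermost */

        /* ... modify pi_state->owner / pi_state->list here ... */

        raw_spin_unlock(&owner->pi_lock);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(&hb->lock);
}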
11969  
11970  /*
11971 @@ -1093,6 +1185,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
11972  
11973         /*
11974          * No existing pi state. First waiter. [2]
11975 +        *
11976 +        * This creates pi_state, we have hb->lock held, this means nothing can
11977 +        * observe this state, wait_lock is irrelevant.
11978          */
11979         pi_state = alloc_pi_state();
11980  
11981 @@ -1117,17 +1212,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
11982         return 0;
11983  }
11984  
11985 -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
11986 +static int lookup_pi_state(u32 __user *uaddr, u32 uval,
11987 +                          struct futex_hash_bucket *hb,
11988                            union futex_key *key, struct futex_pi_state **ps)
11989  {
11990 -       struct futex_q *match = futex_top_waiter(hb, key);
11991 +       struct futex_q *top_waiter = futex_top_waiter(hb, key);
11992  
11993         /*
11994          * If there is a waiter on that futex, validate it and
11995          * attach to the pi_state when the validation succeeds.
11996          */
11997 -       if (match)
11998 -               return attach_to_pi_state(uval, match->pi_state, ps);
11999 +       if (top_waiter)
12000 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12001  
12002         /*
12003          * We are the first waiter - try to look up the owner based on
12004 @@ -1146,7 +1242,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
12005         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
12006                 return -EFAULT;
12007  
12008 -       /*If user space value changed, let the caller retry */
12009 +       /* If user space value changed, let the caller retry */
12010         return curval != uval ? -EAGAIN : 0;
12011  }
12012  
12013 @@ -1174,7 +1270,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12014                                 struct task_struct *task, int set_waiters)
12015  {
12016         u32 uval, newval, vpid = task_pid_vnr(task);
12017 -       struct futex_q *match;
12018 +       struct futex_q *top_waiter;
12019         int ret;
12020  
12021         /*
12022 @@ -1200,9 +1296,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12023          * Lookup existing state first. If it exists, try to attach to
12024          * its pi_state.
12025          */
12026 -       match = futex_top_waiter(hb, key);
12027 -       if (match)
12028 -               return attach_to_pi_state(uval, match->pi_state, ps);
12029 +       top_waiter = futex_top_waiter(hb, key);
12030 +       if (top_waiter)
12031 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12032  
12033         /*
12034          * No waiter and user TID is 0. We are here because the
12035 @@ -1288,45 +1384,39 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
12036          * memory barrier is required here to prevent the following
12037          * store to lock_ptr from getting ahead of the plist_del.
12038          */
12039 -       smp_wmb();
12040 -       q->lock_ptr = NULL;
12041 +       smp_store_release(&q->lock_ptr, NULL);
12042  }
12043  
12044 -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12045 -                        struct futex_hash_bucket *hb)
12046 +/*
12047 + * Caller must hold a reference on @pi_state.
12048 + */
12049 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
12050  {
12051 -       struct task_struct *new_owner;
12052 -       struct futex_pi_state *pi_state = this->pi_state;
12053         u32 uninitialized_var(curval), newval;
12054 +       struct task_struct *new_owner;
12055 +       bool deboost = false;
12056         WAKE_Q(wake_q);
12057 -       bool deboost;
12058 +       WAKE_Q(wake_sleeper_q);
12059         int ret = 0;
12060  
12061 -       if (!pi_state)
12062 -               return -EINVAL;
12063 -
12064 -       /*
12065 -        * If current does not own the pi_state then the futex is
12066 -        * inconsistent and user space fiddled with the futex value.
12067 -        */
12068 -       if (pi_state->owner != current)
12069 -               return -EINVAL;
12070 -
12071 -       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12072         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
12073 +       if (WARN_ON_ONCE(!new_owner)) {
12074 +               /*
12075 +                * As per the comment in futex_unlock_pi() this should not happen.
12076 +                *
12077 +                * When this happens, give up our locks and try again, giving
12078 +                * the futex_lock_pi() instance time to complete, either by
12079 +                * waiting on the rtmutex or removing itself from the futex
12080 +                * queue.
12081 +                */
12082 +               ret = -EAGAIN;
12083 +               goto out_unlock;
12084 +       }
12085  
12086         /*
12087 -        * It is possible that the next waiter (the one that brought
12088 -        * this owner to the kernel) timed out and is no longer
12089 -        * waiting on the lock.
12090 -        */
12091 -       if (!new_owner)
12092 -               new_owner = this->task;
12093 -
12094 -       /*
12095 -        * We pass it to the next owner. The WAITERS bit is always
12096 -        * kept enabled while there is PI state around. We cleanup the
12097 -        * owner died bit, because we are the owner.
12098 +        * We pass it to the next owner. The WAITERS bit is always kept
12099 +        * enabled while there is PI state around. We cleanup the owner
12100 +        * died bit, because we are the owner.
12101          */
12102         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
12103  
12104 @@ -1335,6 +1425,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12105  
12106         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
12107                 ret = -EFAULT;
12108 +
12109         } else if (curval != uval) {
12110                 /*
12111                  * If a unconditional UNLOCK_PI operation (user space did not
12112 @@ -1347,10 +1438,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12113                 else
12114                         ret = -EINVAL;
12115         }
12116 -       if (ret) {
12117 -               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12118 -               return ret;
12119 -       }
12120 +
12121 +       if (ret)
12122 +               goto out_unlock;
12123  
12124         raw_spin_lock(&pi_state->owner->pi_lock);
12125         WARN_ON(list_empty(&pi_state->list));
12126 @@ -1363,22 +1453,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12127         pi_state->owner = new_owner;
12128         raw_spin_unlock(&new_owner->pi_lock);
12129  
12130 +       /*
12131 +        * We've updated the user value; this unlock cannot fail.
12132 +        */
12133 +       deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
12134 +                                         &wake_sleeper_q);
12135 +
12136 +out_unlock:
12137         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12138  
12139 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
12140 -
12141 -       /*
12142 -        * First unlock HB so the waiter does not spin on it once he got woken
12143 -        * up. Second wake up the waiter before the priority is adjusted. If we
12144 -        * deboost first (and lose our higher priority), then the task might get
12145 -        * scheduled away before the wake up can take place.
12146 -        */
12147 -       spin_unlock(&hb->lock);
12148 -       wake_up_q(&wake_q);
12149 -       if (deboost)
12150 +       if (deboost) {
12151 +               wake_up_q(&wake_q);
12152 +               wake_up_q_sleeper(&wake_sleeper_q);
12153                 rt_mutex_adjust_prio(current);
12154 +       }
12155  
12156 -       return 0;
12157 +       return ret;
12158  }
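Much of the churn above is the deferred-wakeup pattern: queue the waiter while locks are held and wake it only after they are dropped, which is also what lets the RT side add a second queue for the "sleeper" wakeups. A generic sketch of that base pattern; my_lock and the function name are placeholders, and the RT-only wake_up_q_sleeper() is not shown.

#include <linux/sched.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

/* Illustrative: defer the wakeup until after the lock is released. */
static void example_deferred_wake(struct task_struct *waiter)
{
        WAKE_Q(wake_q);                 /* on-stack wake queue (4.9 naming) */

        spin_lock(&my_lock);
        wake_q_add(&wake_q, waiter);    /* queue only, no wakeup yet */
        spin_unlock(&my_lock);

        wake_up_q(&wake_q);             /* safe: no locks held any more */
}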
12159  
12160  /*
12161 @@ -1824,7 +1914,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12162                          * If that call succeeds then we have pi_state and an
12163                          * initial refcount on it.
12164                          */
12165 -                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
12166 +                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
12167                 }
12168  
12169                 switch (ret) {
12170 @@ -1907,7 +1997,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12171                          * refcount on the pi_state and store the pointer in
12172                          * the futex_q object of the waiter.
12173                          */
12174 -                       atomic_inc(&pi_state->refcount);
12175 +                       get_pi_state(pi_state);
12176                         this->pi_state = pi_state;
12177                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
12178                                                         this->rt_waiter,
12179 @@ -1924,6 +2014,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12180                                 requeue_pi_wake_futex(this, &key2, hb2);
12181                                 drop_count++;
12182                                 continue;
12183 +                       } else if (ret == -EAGAIN) {
12184 +                               /*
12185 +                                * Waiter was woken by timeout or
12186 +                                * signal and has set pi_blocked_on to
12187 +                                * PI_WAKEUP_INPROGRESS before we
12188 +                                * tried to enqueue it on the rtmutex.
12189 +                                */
12190 +                               this->pi_state = NULL;
12191 +                               put_pi_state(pi_state);
12192 +                               continue;
12193                         } else if (ret) {
12194                                 /*
12195                                  * rt_mutex_start_proxy_lock() detected a
12196 @@ -2007,20 +2107,7 @@ queue_unlock(struct futex_hash_bucket *hb)
12197         hb_waiters_dec(hb);
12198  }
12199  
12200 -/**
12201 - * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12202 - * @q: The futex_q to enqueue
12203 - * @hb:        The destination hash bucket
12204 - *
12205 - * The hb->lock must be held by the caller, and is released here. A call to
12206 - * queue_me() is typically paired with exactly one call to unqueue_me().  The
12207 - * exceptions involve the PI related operations, which may use unqueue_me_pi()
12208 - * or nothing if the unqueue is done as part of the wake process and the unqueue
12209 - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12210 - * an example).
12211 - */
12212 -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12213 -       __releases(&hb->lock)
12214 +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12215  {
12216         int prio;
12217  
12218 @@ -2037,6 +2124,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12219         plist_node_init(&q->list, prio);
12220         plist_add(&q->list, &hb->chain);
12221         q->task = current;
12222 +}
12223 +
12224 +/**
12225 + * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12226 + * @q: The futex_q to enqueue
12227 + * @hb:        The destination hash bucket
12228 + *
12229 + * The hb->lock must be held by the caller, and is released here. A call to
12230 + * queue_me() is typically paired with exactly one call to unqueue_me().  The
12231 + * exceptions involve the PI related operations, which may use unqueue_me_pi()
12232 + * or nothing if the unqueue is done as part of the wake process and the unqueue
12233 + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12234 + * an example).
12235 + */
12236 +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12237 +       __releases(&hb->lock)
12238 +{
12239 +       __queue_me(q, hb);
12240         spin_unlock(&hb->lock);
12241  }
12242  
12243 @@ -2123,10 +2228,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12244  {
12245         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
12246         struct futex_pi_state *pi_state = q->pi_state;
12247 -       struct task_struct *oldowner = pi_state->owner;
12248         u32 uval, uninitialized_var(curval), newval;
12249 +       struct task_struct *oldowner;
12250         int ret;
12251  
12252 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12253 +
12254 +       oldowner = pi_state->owner;
12255         /* Owner died? */
12256         if (!pi_state->owner)
12257                 newtid |= FUTEX_OWNER_DIED;
12258 @@ -2134,7 +2242,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12259         /*
12260          * We are here either because we stole the rtmutex from the
12261          * previous highest priority waiter or we are the highest priority
12262 -        * waiter but failed to get the rtmutex the first time.
12263 +        * waiter but have failed to get the rtmutex the first time.
12264 +        *
12265          * We have to replace the newowner TID in the user space variable.
12266          * This must be atomic as we have to preserve the owner died bit here.
12267          *
12268 @@ -2142,17 +2251,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12269          * because we can fault here. Imagine swapped out pages or a fork
12270          * that marked all the anonymous memory readonly for cow.
12271          *
12272 -        * Modifying pi_state _before_ the user space value would
12273 -        * leave the pi_state in an inconsistent state when we fault
12274 -        * here, because we need to drop the hash bucket lock to
12275 -        * handle the fault. This might be observed in the PID check
12276 -        * in lookup_pi_state.
12277 +        * Modifying pi_state _before_ the user space value would leave the
12278 +        * pi_state in an inconsistent state when we fault here, because we
12279 +        * need to drop the locks to handle the fault. This might be observed
12280 +        * in the PID check in lookup_pi_state.
12281          */
12282  retry:
12283         if (get_futex_value_locked(&uval, uaddr))
12284                 goto handle_fault;
12285  
12286 -       while (1) {
12287 +       for (;;) {
12288                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
12289  
12290                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
12291 @@ -2167,47 +2275,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12292          * itself.
12293          */
12294         if (pi_state->owner != NULL) {
12295 -               raw_spin_lock_irq(&pi_state->owner->pi_lock);
12296 +               raw_spin_lock(&pi_state->owner->pi_lock);
12297                 WARN_ON(list_empty(&pi_state->list));
12298                 list_del_init(&pi_state->list);
12299 -               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
12300 +               raw_spin_unlock(&pi_state->owner->pi_lock);
12301         }
12302  
12303         pi_state->owner = newowner;
12304  
12305 -       raw_spin_lock_irq(&newowner->pi_lock);
12306 +       raw_spin_lock(&newowner->pi_lock);
12307         WARN_ON(!list_empty(&pi_state->list));
12308         list_add(&pi_state->list, &newowner->pi_state_list);
12309 -       raw_spin_unlock_irq(&newowner->pi_lock);
12310 +       raw_spin_unlock(&newowner->pi_lock);
12311 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12312 +
12313         return 0;
12314  
12315         /*
12316 -        * To handle the page fault we need to drop the hash bucket
12317 -        * lock here. That gives the other task (either the highest priority
12318 -        * waiter itself or the task which stole the rtmutex) the
12319 -        * chance to try the fixup of the pi_state. So once we are
12320 -        * back from handling the fault we need to check the pi_state
12321 -        * after reacquiring the hash bucket lock and before trying to
12322 -        * do another fixup. When the fixup has been done already we
12323 -        * simply return.
12324 +        * To handle the page fault we need to drop the locks here. That gives
12325 +        * the other task (either the highest priority waiter itself or the
12326 +        * task which stole the rtmutex) the chance to try the fixup of the
12327 +        * pi_state. So once we are back from handling the fault we need to
12328 +        * check the pi_state after reacquiring the locks and before trying to
12329 +        * do another fixup. When the fixup has been done already we simply
12330 +        * return.
12331 +        *
12332 +        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
12333 +        * drop hb->lock since the caller owns the hb -> futex_q relation.
12334 +        * Dropping the pi_mutex->wait_lock requires the state revalidate.
12335          */
12336  handle_fault:
12337 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12338         spin_unlock(q->lock_ptr);
12339  
12340         ret = fault_in_user_writeable(uaddr);
12341  
12342         spin_lock(q->lock_ptr);
12343 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12344  
12345         /*
12346          * Check if someone else fixed it for us:
12347          */
12348 -       if (pi_state->owner != oldowner)
12349 -               return 0;
12350 +       if (pi_state->owner != oldowner) {
12351 +               ret = 0;
12352 +               goto out_unlock;
12353 +       }
12354  
12355         if (ret)
12356 -               return ret;
12357 +               goto out_unlock;
12358  
12359         goto retry;
12360 +
12361 +out_unlock:
12362 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12363 +       return ret;
12364  }
12365  
12366  static long futex_wait_restart(struct restart_block *restart);
12367 @@ -2229,13 +2350,16 @@ static long futex_wait_restart(struct restart_block *restart);
12368   */
12369  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12370  {
12371 -       struct task_struct *owner;
12372         int ret = 0;
12373  
12374         if (locked) {
12375                 /*
12376                  * Got the lock. We might not be the anticipated owner if we
12377                  * did a lock-steal - fix up the PI-state in that case:
12378 +                *
12379 +                * We can safely read pi_state->owner without holding wait_lock
12380 +                * because we now own the rt_mutex, only the owner will attempt
12381 +                * to change it.
12382                  */
12383                 if (q->pi_state->owner != current)
12384                         ret = fixup_pi_state_owner(uaddr, q, current);
12385 @@ -2243,43 +2367,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12386         }
12387  
12388         /*
12389 -        * Catch the rare case, where the lock was released when we were on the
12390 -        * way back before we locked the hash bucket.
12391 -        */
12392 -       if (q->pi_state->owner == current) {
12393 -               /*
12394 -                * Try to get the rt_mutex now. This might fail as some other
12395 -                * task acquired the rt_mutex after we removed ourself from the
12396 -                * rt_mutex waiters list.
12397 -                */
12398 -               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
12399 -                       locked = 1;
12400 -                       goto out;
12401 -               }
12402 -
12403 -               /*
12404 -                * pi_state is incorrect, some other task did a lock steal and
12405 -                * we returned due to timeout or signal without taking the
12406 -                * rt_mutex. Too late.
12407 -                */
12408 -               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
12409 -               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
12410 -               if (!owner)
12411 -                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
12412 -               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
12413 -               ret = fixup_pi_state_owner(uaddr, q, owner);
12414 -               goto out;
12415 -       }
12416 -
12417 -       /*
12418          * Paranoia check. If we did not take the lock, then we should not be
12419          * the owner of the rt_mutex.
12420          */
12421 -       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
12422 +       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
12423                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
12424                                 "pi-state %p\n", ret,
12425                                 q->pi_state->pi_mutex.owner,
12426                                 q->pi_state->owner);
12427 +       }
12428  
12429  out:
12430         return ret ? ret : locked;
12431 @@ -2503,6 +2599,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12432                          ktime_t *time, int trylock)
12433  {
12434         struct hrtimer_sleeper timeout, *to = NULL;
12435 +       struct futex_pi_state *pi_state = NULL;
12436 +       struct rt_mutex_waiter rt_waiter;
12437         struct futex_hash_bucket *hb;
12438         struct futex_q q = futex_q_init;
12439         int res, ret;
12440 @@ -2555,25 +2653,77 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12441                 }
12442         }
12443  
12444 +       WARN_ON(!q.pi_state);
12445 +
12446         /*
12447          * Only actually queue now that the atomic ops are done:
12448          */
12449 -       queue_me(&q, hb);
12450 +       __queue_me(&q, hb);
12451  
12452 -       WARN_ON(!q.pi_state);
12453 -       /*
12454 -        * Block on the PI mutex:
12455 -        */
12456 -       if (!trylock) {
12457 -               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
12458 -       } else {
12459 -               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
12460 +       if (trylock) {
12461 +               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
12462                 /* Fixup the trylock return value: */
12463                 ret = ret ? 0 : -EWOULDBLOCK;
12464 +               goto no_block;
12465         }
12466  
12467 +       rt_mutex_init_waiter(&rt_waiter, false);
12468 +
12469 +       /*
12470 +        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
12471 +        * hold it while doing rt_mutex_start_proxy(), because then it will
12472 +        * include hb->lock in the blocking chain, even though we'll not in
12473 +        * fact hold it while blocking. This will lead it to report -EDEADLK
12474 +        * and BUG when futex_unlock_pi() interleaves with this.
12475 +        *
12476 +        * Therefore acquire wait_lock while holding hb->lock, but drop the
12477 +        * latter before calling rt_mutex_start_proxy_lock(). This still fully
12478 +        * serializes against futex_unlock_pi() as that does the exact same
12479 +        * lock handoff sequence.
12480 +        */
12481 +       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
12482 +       /*
12483 +        * The migrate_disable() here disables migration in the in_atomic() fast
12484 +        * path; migration is enabled again in the following spin_unlock(). We
12485 +        * have one migrate_disable() pending in the slow path, which is reversed
12486 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
12487 +        */
12488 +       migrate_disable();
12489 +
12490 +       spin_unlock(q.lock_ptr);
12491 +       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
12492 +       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
12493 +       migrate_enable();
12494 +
12495 +       if (ret) {
12496 +               if (ret == 1)
12497 +                       ret = 0;
12498 +
12499 +               spin_lock(q.lock_ptr);
12500 +               goto no_block;
12501 +       }
12502 +
12503 +
12505 +               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
12506 +
12507 +       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
12508 +
12509         spin_lock(q.lock_ptr);
12510         /*
12511 +        * If we failed to acquire the lock (signal/timeout), we must
12512 +        * first acquire the hb->lock before removing the lock from the
12513 +        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
12514 +        * wait lists consistent.
12515 +        *
12516 +        * In particular, it is important that futex_unlock_pi() cannot
12517 +        * observe this inconsistency.
12518 +        */
12519 +       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
12520 +               ret = 0;
12521 +
12522 +no_block:
12523 +       /*
12524          * Fixup the pi_state owner and possibly acquire the lock if we
12525          * haven't already.
12526          */
12527 @@ -2589,12 +2739,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12528          * If fixup_owner() faulted and was unable to handle the fault, unlock
12529          * it and return the fault to userspace.
12530          */
12531 -       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
12532 -               rt_mutex_unlock(&q.pi_state->pi_mutex);
12533 +       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
12534 +               pi_state = q.pi_state;
12535 +               get_pi_state(pi_state);
12536 +       }
12537  
12538         /* Unqueue and drop the lock */
12539         unqueue_me_pi(&q);
12540  
12541 +       if (pi_state) {
12542 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12543 +               put_pi_state(pi_state);
12544 +       }
12545 +
12546         goto out_put_key;
12547  
12548  out_unlock_put_key:
12549 @@ -2631,7 +2788,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12550         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
12551         union futex_key key = FUTEX_KEY_INIT;
12552         struct futex_hash_bucket *hb;
12553 -       struct futex_q *match;
12554 +       struct futex_q *top_waiter;
12555         int ret;
12556  
12557  retry:
12558 @@ -2655,12 +2812,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12559          * all and we at least want to know if user space fiddled
12560          * with the futex value instead of blindly unlocking.
12561          */
12562 -       match = futex_top_waiter(hb, &key);
12563 -       if (match) {
12564 -               ret = wake_futex_pi(uaddr, uval, match, hb);
12565 +       top_waiter = futex_top_waiter(hb, &key);
12566 +       if (top_waiter) {
12567 +               struct futex_pi_state *pi_state = top_waiter->pi_state;
12568 +
12569 +               ret = -EINVAL;
12570 +               if (!pi_state)
12571 +                       goto out_unlock;
12572 +
12573                 /*
12574 -                * In case of success wake_futex_pi dropped the hash
12575 -                * bucket lock.
12576 +                * If current does not own the pi_state then the futex is
12577 +                * inconsistent and user space fiddled with the futex value.
12578 +                */
12579 +               if (pi_state->owner != current)
12580 +                       goto out_unlock;
12581 +
12582 +               get_pi_state(pi_state);
12583 +               /*
12584 +                * By taking wait_lock while still holding hb->lock, we ensure
12585 +                * there is no point where we hold neither; and therefore
12586 +                * wake_futex_pi() must observe a state consistent with what we
12587 +                * observed.
12588 +                */
12589 +               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12590 +               /*
12591 +                * Magic trickery for now to make the RT migrate disable
12592 +                * logic happy. The following spin_unlock() happens with
12593 +                * interrupts disabled so the internal migrate_enable()
12594 +                * won't undo the migrate_disable() which was issued when
12595 +                * locking hb->lock.
12596 +                */
12597 +               migrate_disable();
12598 +               spin_unlock(&hb->lock);
12599 +
12600 +               /* Drops pi_state->pi_mutex.wait_lock */
12601 +               ret = wake_futex_pi(uaddr, uval, pi_state);
12602 +
12603 +               migrate_enable();
12604 +
12605 +               put_pi_state(pi_state);
12606 +
12607 +               /*
12608 +                * Success, we're done! No tricky corner cases.
12609                  */
12610                 if (!ret)
12611                         goto out_putkey;
12612 @@ -2675,7 +2868,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12613                  * setting the FUTEX_WAITERS bit. Try again.
12614                  */
12615                 if (ret == -EAGAIN) {
12616 -                       spin_unlock(&hb->lock);
12617                         put_futex_key(&key);
12618                         goto retry;
12619                 }
12620 @@ -2683,7 +2875,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12621                  * wake_futex_pi has detected invalid state. Tell user
12622                  * space.
12623                  */
12624 -               goto out_unlock;
12625 +               goto out_putkey;
12626         }
12627  
12628         /*
12629 @@ -2693,8 +2885,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12630          * preserve the WAITERS bit not the OWNER_DIED one. We are the
12631          * owner.
12632          */
12633 -       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
12634 +       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
12635 +               spin_unlock(&hb->lock);
12636                 goto pi_faulted;
12637 +       }
12638  
12639         /*
12640          * If uval has changed, let user space handle it.
12641 @@ -2708,7 +2902,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12642         return ret;
12643  
12644  pi_faulted:
12645 -       spin_unlock(&hb->lock);
12646         put_futex_key(&key);
12647  
12648         ret = fault_in_user_writeable(uaddr);
12649 @@ -2812,8 +3005,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12650                                  u32 __user *uaddr2)
12651  {
12652         struct hrtimer_sleeper timeout, *to = NULL;
12653 +       struct futex_pi_state *pi_state = NULL;
12654         struct rt_mutex_waiter rt_waiter;
12655 -       struct futex_hash_bucket *hb;
12656 +       struct futex_hash_bucket *hb, *hb2;
12657         union futex_key key2 = FUTEX_KEY_INIT;
12658         struct futex_q q = futex_q_init;
12659         int res, ret;
12660 @@ -2838,10 +3032,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12661          * The waiter is allocated on our stack, manipulated by the requeue
12662          * code while we sleep on uaddr.
12663          */
12664 -       debug_rt_mutex_init_waiter(&rt_waiter);
12665 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
12666 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
12667 -       rt_waiter.task = NULL;
12668 +       rt_mutex_init_waiter(&rt_waiter, false);
12669  
12670         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
12671         if (unlikely(ret != 0))
12672 @@ -2872,20 +3063,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12673         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
12674         futex_wait_queue_me(hb, &q, to);
12675  
12676 -       spin_lock(&hb->lock);
12677 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12678 -       spin_unlock(&hb->lock);
12679 -       if (ret)
12680 -               goto out_put_keys;
12681 +       /*
12682 +        * On RT we must avoid races with requeue and trying to block
12683 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
12684 +        * serializing access to pi_blocked_on with pi_lock.
12685 +        */
12686 +       raw_spin_lock_irq(&current->pi_lock);
12687 +       if (current->pi_blocked_on) {
12688 +               /*
12689 +                * We have been requeued or are in the process of
12690 +                * being requeued.
12691 +                */
12692 +               raw_spin_unlock_irq(&current->pi_lock);
12693 +       } else {
12694 +               /*
12695 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
12696 +                * prevents a concurrent requeue from moving us to the
12697 +                * uaddr2 rtmutex. After that we can safely acquire
12698 +                * (and possibly block on) hb->lock.
12699 +                */
12700 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
12701 +               raw_spin_unlock_irq(&current->pi_lock);
12702 +
12703 +               spin_lock(&hb->lock);
12704 +
12705 +               /*
12706 +                * Clean up pi_blocked_on. We might otherwise leak it
12707 +                * if we acquired hb->lock via the fast path without
12708 +                * blocking.
12709 +                */
12710 +               raw_spin_lock_irq(&current->pi_lock);
12711 +               current->pi_blocked_on = NULL;
12712 +               raw_spin_unlock_irq(&current->pi_lock);
12713 +
12714 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12715 +               spin_unlock(&hb->lock);
12716 +               if (ret)
12717 +                       goto out_put_keys;
12718 +       }
12719  
12720         /*
12721 -        * In order for us to be here, we know our q.key == key2, and since
12722 -        * we took the hb->lock above, we also know that futex_requeue() has
12723 -        * completed and we no longer have to concern ourselves with a wakeup
12724 -        * race with the atomic proxy lock acquisition by the requeue code. The
12725 -        * futex_requeue dropped our key1 reference and incremented our key2
12726 -        * reference count.
12727 +        * In order to be here, we have either been requeued, are in
12728 +        * the process of being requeued, or the requeue code has
12729 +        * successfully acquired uaddr2 on our behalf.  If pi_blocked_on
12730 +        * was non-null above, we may be racing with a requeue.  Do not
12731 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
12732 +        * hb->lock or hb2->lock. futex_requeue() dropped our key1
12733 +        * reference and incremented our key2 reference count.
12734          */
12735 +       hb2 = hash_futex(&key2);
12736  
12737         /* Check if the requeue code acquired the second futex for us. */
12738         if (!q.rt_waiter) {
12739 @@ -2894,16 +3120,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12740                  * did a lock-steal - fix up the PI-state in that case.
12741                  */
12742                 if (q.pi_state && (q.pi_state->owner != current)) {
12743 -                       spin_lock(q.lock_ptr);
12744 +                       spin_lock(&hb2->lock);
12745 +                       BUG_ON(&hb2->lock != q.lock_ptr);
12746                         ret = fixup_pi_state_owner(uaddr2, &q, current);
12747 -                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
12748 -                               rt_mutex_unlock(&q.pi_state->pi_mutex);
12749 +                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
12750 +                               pi_state = q.pi_state;
12751 +                               get_pi_state(pi_state);
12752 +                       }
12753                         /*
12754                          * Drop the reference to the pi state which
12755                          * the requeue_pi() code acquired for us.
12756                          */
12757                         put_pi_state(q.pi_state);
12758 -                       spin_unlock(q.lock_ptr);
12759 +                       spin_unlock(&hb2->lock);
12760                 }
12761         } else {
12762                 struct rt_mutex *pi_mutex;
12763 @@ -2915,10 +3144,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12764                  */
12765                 WARN_ON(!q.pi_state);
12766                 pi_mutex = &q.pi_state->pi_mutex;
12767 -               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
12768 -               debug_rt_mutex_free_waiter(&rt_waiter);
12769 +               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
12770  
12771 -               spin_lock(q.lock_ptr);
12772 +               spin_lock(&hb2->lock);
12773 +               BUG_ON(&hb2->lock != q.lock_ptr);
12774 +               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
12775 +                       ret = 0;
12776 +
12777 +               debug_rt_mutex_free_waiter(&rt_waiter);
12778                 /*
12779                  * Fixup the pi_state owner and possibly acquire the lock if we
12780                  * haven't already.
12781 @@ -2936,13 +3169,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12782                  * the fault, unlock the rt_mutex and return the fault to
12783                  * userspace.
12784                  */
12785 -               if (ret && rt_mutex_owner(pi_mutex) == current)
12786 -                       rt_mutex_unlock(pi_mutex);
12787 +               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
12788 +                       pi_state = q.pi_state;
12789 +                       get_pi_state(pi_state);
12790 +               }
12791  
12792                 /* Unqueue and drop the lock. */
12793                 unqueue_me_pi(&q);
12794         }
12795  
12796 +       if (pi_state) {
12797 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12798 +               put_pi_state(pi_state);
12799 +       }
12800 +
12801         if (ret == -EINTR) {
12802                 /*
12803                  * We've already been requeued, but cannot restart by calling
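
The futex_lock_pi()/futex_unlock_pi() slow paths reworked above are only entered on contention; the uncontended cases are handled entirely in userspace with a compare-and-swap on the futex word (owner TID when locked, 0 when free). Below is a minimal, hedged userspace sketch of that FUTEX_LOCK_PI/FUTEX_UNLOCK_PI usage; all names are invented for illustration, and real code should normally use PI-aware pthread mutexes (PTHREAD_PRIO_INHERIT) instead.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

/* 0 == unlocked, otherwise the owner TID (plus FUTEX_WAITERS once the
 * kernel has queued waiters on it). */
static _Atomic unsigned int futex_word;

static long sys_futex(unsigned int *uaddr, int op, unsigned int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(void)
{
        unsigned int expected = 0;
        unsigned int tid = (unsigned int)syscall(SYS_gettid);

        /* Uncontended fast path: install our TID as the owner. */
        if (atomic_compare_exchange_strong(&futex_word, &expected, tid))
                return;
        /* Contended: futex_lock_pi() blocks us and boosts the owner. */
        sys_futex((unsigned int *)&futex_word, FUTEX_LOCK_PI, 0);
}

static void pi_unlock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);

        /* The fast path fails once FUTEX_WAITERS is set; the kernel then
         * hands the lock to the top waiter in futex_unlock_pi(). */
        if (!atomic_compare_exchange_strong(&futex_word, &tid, 0))
                sys_futex((unsigned int *)&futex_word, FUTEX_UNLOCK_PI, 0);
}
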
12804 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
12805 index d3f24905852c..f87aa8fdcc51 100644
12806 --- a/kernel/irq/handle.c
12807 +++ b/kernel/irq/handle.c
12808 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
12809  {
12810         irqreturn_t retval;
12811         unsigned int flags = 0;
12812 +       struct pt_regs *regs = get_irq_regs();
12813 +       u64 ip = regs ? instruction_pointer(regs) : 0;
12814  
12815         retval = __handle_irq_event_percpu(desc, &flags);
12816  
12817 -       add_interrupt_randomness(desc->irq_data.irq, flags);
12818 +#ifdef CONFIG_PREEMPT_RT_FULL
12819 +       desc->random_ip = ip;
12820 +#else
12821 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
12822 +#endif
12823  
12824         if (!noirqdebug)
12825                 note_interrupt(desc, retval);
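
On PREEMPT_RT_FULL the hunk above only records the interrupted instruction pointer in desc->random_ip; the entropy is mixed in later from the irq thread (see the irq_thread() hunk in kernel/irq/manage.c further down), where taking sleeping locks is permitted. A hedged sketch of that capture-now/mix-later split, with invented names:

/* Not the kernel code: a condensed illustration of the pattern. */
struct deferred_rand {
        u64 ip;                 /* cheap to capture in hard-irq context */
};

static void capture_in_hardirq(struct deferred_rand *dr, struct pt_regs *regs)
{
        dr->ip = regs ? instruction_pointer(regs) : 0;
}

/* Called later from the threaded handler, where sleeping is allowed. */
static void mix_from_irq_thread(struct deferred_rand *dr, int irq)
{
        add_interrupt_randomness(irq, 0, dr->ip);
}
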
12826 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
12827 index 6b669593e7eb..e357bf6c59d5 100644
12828 --- a/kernel/irq/manage.c
12829 +++ b/kernel/irq/manage.c
12830 @@ -22,6 +22,7 @@
12831  #include "internals.h"
12832  
12833  #ifdef CONFIG_IRQ_FORCED_THREADING
12834 +# ifndef CONFIG_PREEMPT_RT_BASE
12835  __read_mostly bool force_irqthreads;
12836  
12837  static int __init setup_forced_irqthreads(char *arg)
12838 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
12839         return 0;
12840  }
12841  early_param("threadirqs", setup_forced_irqthreads);
12842 +# endif
12843  #endif
12844  
12845  static void __synchronize_hardirq(struct irq_desc *desc)
12846 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
12847  
12848         if (desc->affinity_notify) {
12849                 kref_get(&desc->affinity_notify->kref);
12850 +
12851 +#ifdef CONFIG_PREEMPT_RT_BASE
12852 +               swork_queue(&desc->affinity_notify->swork);
12853 +#else
12854                 schedule_work(&desc->affinity_notify->work);
12855 +#endif
12856         }
12857         irqd_set(data, IRQD_AFFINITY_SET);
12858  
12859 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
12860  }
12861  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
12862  
12863 -static void irq_affinity_notify(struct work_struct *work)
12864 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
12865  {
12866 -       struct irq_affinity_notify *notify =
12867 -               container_of(work, struct irq_affinity_notify, work);
12868         struct irq_desc *desc = irq_to_desc(notify->irq);
12869         cpumask_var_t cpumask;
12870         unsigned long flags;
12871 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
12872         kref_put(&notify->kref, notify->release);
12873  }
12874  
12875 +#ifdef CONFIG_PREEMPT_RT_BASE
12876 +static void init_helper_thread(void)
12877 +{
12878 +       static int init_sworker_once;
12879 +
12880 +       if (init_sworker_once)
12881 +               return;
12882 +       if (WARN_ON(swork_get()))
12883 +               return;
12884 +       init_sworker_once = 1;
12885 +}
12886 +
12887 +static void irq_affinity_notify(struct swork_event *swork)
12888 +{
12889 +       struct irq_affinity_notify *notify =
12890 +               container_of(swork, struct irq_affinity_notify, swork);
12891 +       _irq_affinity_notify(notify);
12892 +}
12893 +
12894 +#else
12895 +
12896 +static void irq_affinity_notify(struct work_struct *work)
12897 +{
12898 +       struct irq_affinity_notify *notify =
12899 +               container_of(work, struct irq_affinity_notify, work);
12900 +       _irq_affinity_notify(notify);
12901 +}
12902 +#endif
12903 +
12904  /**
12905   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
12906   *     @irq:           Interrupt for which to enable/disable notification
12907 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
12908         if (notify) {
12909                 notify->irq = irq;
12910                 kref_init(&notify->kref);
12911 +#ifdef CONFIG_PREEMPT_RT_BASE
12912 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
12913 +               init_helper_thread();
12914 +#else
12915                 INIT_WORK(&notify->work, irq_affinity_notify);
12916 +#endif
12917         }
12918  
12919         raw_spin_lock_irqsave(&desc->lock, flags);
12920 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
12921         local_bh_disable();
12922         ret = action->thread_fn(action->irq, action->dev_id);
12923         irq_finalize_oneshot(desc, action);
12924 -       local_bh_enable();
12925 +       /*
12926 +        * Interrupts which have real time requirements can be set up
12927 +        * to avoid softirq processing in the thread handler. This is
12928 +        * safe as these interrupts do not raise soft interrupts.
12929 +        */
12930 +       if (irq_settings_no_softirq_call(desc))
12931 +               _local_bh_enable();
12932 +       else
12933 +               local_bh_enable();
12934         return ret;
12935  }
12936  
12937 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
12938                 if (action_ret == IRQ_WAKE_THREAD)
12939                         irq_wake_secondary(desc, action);
12940  
12941 +#ifdef CONFIG_PREEMPT_RT_FULL
12942 +               migrate_disable();
12943 +               add_interrupt_randomness(action->irq, 0,
12944 +                                desc->random_ip ^ (unsigned long) action);
12945 +               migrate_enable();
12946 +#endif
12947                 wake_threads_waitq(desc);
12948         }
12949  
12950 @@ -1336,6 +1389,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
12951                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
12952                 }
12953  
12954 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
12955 +                       irq_settings_set_no_softirq_call(desc);
12956 +
12957                 /* Set default affinity mask once everything is setup */
12958                 setup_affinity(desc, mask);
12959  
12960 @@ -2061,7 +2117,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
12961   *     This call sets the internal irqchip state of an interrupt,
12962   *     depending on the value of @which.
12963   *
12964 - *     This function should be called with preemption disabled if the
12965 + *     This function should be called with migration disabled if the
12966   *     interrupt controller has per-cpu registers.
12967   */
12968  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
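
A driver opts into the IRQF_NO_SOFTIRQ_CALL behaviour added above when it requests its (forced-)threaded handler. A hypothetical fragment with invented device names; the thread function must then never raise softirqs itself:

#include <linux/interrupt.h>

static irqreturn_t mydev_hardirq(int irq, void *dev_id)
{
        /* acknowledge the device, then defer to the thread */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t mydev_thread(int irq, void *dev_id)
{
        /* latency-critical work; must not raise soft interrupts itself */
        return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned int irq, void *dev)
{
        return request_threaded_irq(irq, mydev_hardirq, mydev_thread,
                                    IRQF_NO_SOFTIRQ_CALL, "mydev", dev);
}
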
12969 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
12970 index 320579d89091..2df2d4445b1e 100644
12971 --- a/kernel/irq/settings.h
12972 +++ b/kernel/irq/settings.h
12973 @@ -16,6 +16,7 @@ enum {
12974         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
12975         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
12976         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
12977 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
12978         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
12979  };
12980  
12981 @@ -30,6 +31,7 @@ enum {
12982  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
12983  #define IRQ_IS_POLLED          GOT_YOU_MORON
12984  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
12985 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
12986  #undef IRQF_MODIFY_MASK
12987  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
12988  
12989 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
12990         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
12991  }
12992  
12993 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
12994 +{
12995 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
12996 +}
12997 +
12998 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
12999 +{
13000 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
13001 +}
13002 +
13003  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
13004  {
13005         return desc->status_use_accessors & _IRQ_PER_CPU;
13006 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
13007 index 5707f97a3e6a..73f38dc7a7fb 100644
13008 --- a/kernel/irq/spurious.c
13009 +++ b/kernel/irq/spurious.c
13010 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
13011  
13012  static int __init irqfixup_setup(char *str)
13013  {
13014 +#ifdef CONFIG_PREEMPT_RT_BASE
13015 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13016 +       return 1;
13017 +#endif
13018         irqfixup = 1;
13019         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
13020         printk(KERN_WARNING "This may impact system performance.\n");
13021 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
13022  
13023  static int __init irqpoll_setup(char *str)
13024  {
13025 +#ifdef CONFIG_PREEMPT_RT_BASE
13026 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13027 +       return 1;
13028 +#endif
13029         irqfixup = 2;
13030         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
13031                                 "enabled\n");
13032 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
13033 index bcf107ce0854..2899ba0d23d1 100644
13034 --- a/kernel/irq_work.c
13035 +++ b/kernel/irq_work.c
13036 @@ -17,6 +17,7 @@
13037  #include <linux/cpu.h>
13038  #include <linux/notifier.h>
13039  #include <linux/smp.h>
13040 +#include <linux/interrupt.h>
13041  #include <asm/processor.h>
13042  
13043  
13044 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
13045   */
13046  bool irq_work_queue_on(struct irq_work *work, int cpu)
13047  {
13048 +       struct llist_head *list;
13049 +
13050         /* All work should have been flushed before going offline */
13051         WARN_ON_ONCE(cpu_is_offline(cpu));
13052  
13053 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
13054         if (!irq_work_claim(work))
13055                 return false;
13056  
13057 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
13058 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
13059 +               list = &per_cpu(lazy_list, cpu);
13060 +       else
13061 +               list = &per_cpu(raised_list, cpu);
13062 +
13063 +       if (llist_add(&work->llnode, list))
13064                 arch_send_call_function_single_ipi(cpu);
13065  
13066         return true;
13067 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
13068  /* Enqueue the irq work @work on the current CPU */
13069  bool irq_work_queue(struct irq_work *work)
13070  {
13071 +       struct llist_head *list;
13072 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
13073 +
13074         /* Only queue if not already pending */
13075         if (!irq_work_claim(work))
13076                 return false;
13077 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
13078         /* Queue the entry and raise the IPI if needed. */
13079         preempt_disable();
13080  
13081 -       /* If the work is "lazy", handle it from next tick if any */
13082 -       if (work->flags & IRQ_WORK_LAZY) {
13083 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
13084 -                   tick_nohz_tick_stopped())
13085 -                       arch_irq_work_raise();
13086 -       } else {
13087 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
13088 +       lazy_work = work->flags & IRQ_WORK_LAZY;
13089 +
13090 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
13091 +               list = this_cpu_ptr(&lazy_list);
13092 +       else
13093 +               list = this_cpu_ptr(&raised_list);
13094 +
13095 +       if (llist_add(&work->llnode, list)) {
13096 +               if (!lazy_work || tick_nohz_tick_stopped())
13097                         arch_irq_work_raise();
13098         }
13099  
13100 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
13101         raised = this_cpu_ptr(&raised_list);
13102         lazy = this_cpu_ptr(&lazy_list);
13103  
13104 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
13105 -               if (llist_empty(lazy))
13106 -                       return false;
13107 +       if (llist_empty(raised) && llist_empty(lazy))
13108 +               return false;
13109  
13110         /* All work should have been flushed before going offline */
13111         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
13112 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
13113         struct irq_work *work;
13114         struct llist_node *llnode;
13115  
13116 -       BUG_ON(!irqs_disabled());
13117 +       BUG_ON_NONRT(!irqs_disabled());
13118  
13119         if (llist_empty(list))
13120                 return;
13121 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
13122  void irq_work_run(void)
13123  {
13124         irq_work_run_list(this_cpu_ptr(&raised_list));
13125 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
13126 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
13127 +               /*
13128 +                * NOTE: we raise softirq via IPI for safety, and run
13129 +                * the lazy list from the timer softirq (irq_work_tick_soft())
13130 +                * to move the overhead from hard to soft irq context.
13131 +                */
13132 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
13133 +                       raise_softirq(TIMER_SOFTIRQ);
13134 +       } else
13135 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13136  }
13137  EXPORT_SYMBOL_GPL(irq_work_run);
13138  
13139 @@ -179,8 +200,17 @@ void irq_work_tick(void)
13140  
13141         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
13142                 irq_work_run_list(raised);
13143 +
13144 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
13145 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13146 +}
13147 +
13148 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
13149 +void irq_work_tick_soft(void)
13150 +{
13151         irq_work_run_list(this_cpu_ptr(&lazy_list));
13152  }
13153 +#endif
13154  
13155  /*
13156   * Synchronize against the irq_work @entry, ensures the entry is not
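
With the changes above, ordinary irq_work items are diverted to the lazy list on PREEMPT_RT_FULL and processed from the timer softirq, while work flagged IRQ_WORK_HARD_IRQ stays on the raised list and still runs in hard interrupt context. A hedged sketch of flagging such an item (names invented; assumes the flag is set before the work is ever queued):

#include <linux/irq_work.h>

static void my_hard_work_fn(struct irq_work *work)
{
        /* runs from the interrupt/IPI itself, even on RT; keep it short */
}

static struct irq_work my_hard_work;

static void my_init(void)
{
        init_irq_work(&my_hard_work, my_hard_work_fn);
        /* assumption: set before the work is first queued */
        my_hard_work.flags = IRQ_WORK_HARD_IRQ;
}

static void my_poke(void)
{
        irq_work_queue(&my_hard_work);
}
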
13157 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
13158 index ee1bc1bb8feb..ddef07958840 100644
13159 --- a/kernel/ksysfs.c
13160 +++ b/kernel/ksysfs.c
13161 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
13162  
13163  #endif /* CONFIG_KEXEC_CORE */
13164  
13165 +#if defined(CONFIG_PREEMPT_RT_FULL)
13166 +static ssize_t  realtime_show(struct kobject *kobj,
13167 +                             struct kobj_attribute *attr, char *buf)
13168 +{
13169 +       return sprintf(buf, "%d\n", 1);
13170 +}
13171 +KERNEL_ATTR_RO(realtime);
13172 +#endif
13173 +
13174  /* whether file capabilities are enabled */
13175  static ssize_t fscaps_show(struct kobject *kobj,
13176                                   struct kobj_attribute *attr, char *buf)
13177 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
13178         &rcu_expedited_attr.attr,
13179         &rcu_normal_attr.attr,
13180  #endif
13181 +#ifdef CONFIG_PREEMPT_RT_FULL
13182 +       &realtime_attr.attr,
13183 +#endif
13184         NULL
13185  };
13186  
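
The /sys/kernel/realtime attribute added above exists only on PREEMPT_RT_FULL kernels and always reads as 1, which gives userspace a simple RT-kernel probe. A small userspace check:

#include <stdio.h>

static int kernel_is_preempt_rt(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int val = 0;

        if (!f)
                return 0;       /* attribute absent: not an RT kernel */
        if (fscanf(f, "%d", &val) != 1)
                val = 0;
        fclose(f);
        return val == 1;
}
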
13187 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
13188 index 6f88e352cd4f..6ff9e8011dd0 100644
13189 --- a/kernel/locking/Makefile
13190 +++ b/kernel/locking/Makefile
13191 @@ -2,7 +2,7 @@
13192  # and is generally not a function of system call inputs.
13193  KCOV_INSTRUMENT                := n
13194  
13195 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
13196 +obj-y += semaphore.o percpu-rwsem.o
13197  
13198  ifdef CONFIG_FUNCTION_TRACER
13199  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
13200 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
13201  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
13202  endif
13203  
13204 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13205 +obj-y += mutex.o
13206  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
13207 +endif
13208 +obj-y += rwsem.o
13209  obj-$(CONFIG_LOCKDEP) += lockdep.o
13210  ifeq ($(CONFIG_PROC_FS),y)
13211  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
13212 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
13213  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
13214  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
13215  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
13216 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13217  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
13218  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
13219 +endif
13220 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o rwsem-rt.o
13221  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
13222  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
13223 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
13224 index 4d7ffc0a0d00..3d157b3128eb 100644
13225 --- a/kernel/locking/lockdep.c
13226 +++ b/kernel/locking/lockdep.c
13227 @@ -658,6 +658,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13228         struct lockdep_subclass_key *key;
13229         struct hlist_head *hash_head;
13230         struct lock_class *class;
13231 +       bool is_static = false;
13232  
13233         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
13234                 debug_locks_off();
13235 @@ -671,10 +672,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13236  
13237         /*
13238          * Static locks do not have their class-keys yet - for them the key
13239 -        * is the lock object itself:
13240 +        * is the lock object itself. If the lock is in the per cpu area,
13241 +        * the canonical address of the lock (per cpu offset removed) is
13242 +        * used.
13243          */
13244 -       if (unlikely(!lock->key))
13245 -               lock->key = (void *)lock;
13246 +       if (unlikely(!lock->key)) {
13247 +               unsigned long can_addr, addr = (unsigned long)lock;
13248 +
13249 +               if (__is_kernel_percpu_address(addr, &can_addr))
13250 +                       lock->key = (void *)can_addr;
13251 +               else if (__is_module_percpu_address(addr, &can_addr))
13252 +                       lock->key = (void *)can_addr;
13253 +               else if (static_obj(lock))
13254 +                       lock->key = (void *)lock;
13255 +               else
13256 +                       return ERR_PTR(-EINVAL);
13257 +               is_static = true;
13258 +       }
13259  
13260         /*
13261          * NOTE: the class-key must be unique. For dynamic locks, a static
13262 @@ -706,7 +720,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13263                 }
13264         }
13265  
13266 -       return NULL;
13267 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
13268  }
13269  
13270  /*
13271 @@ -724,19 +738,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
13272         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
13273  
13274         class = look_up_lock_class(lock, subclass);
13275 -       if (likely(class))
13276 +       if (likely(!IS_ERR_OR_NULL(class)))
13277                 goto out_set_class_cache;
13278  
13279         /*
13280          * Debug-check: all keys must be persistent!
13281 -        */
13282 -       if (!static_obj(lock->key)) {
13283 +        */
13284 +       if (IS_ERR(class)) {
13285                 debug_locks_off();
13286                 printk("INFO: trying to register non-static key.\n");
13287                 printk("the code is fine but needs lockdep annotation.\n");
13288                 printk("turning off the locking correctness validator.\n");
13289                 dump_stack();
13290 -
13291                 return NULL;
13292         }
13293  
13294 @@ -3410,7 +3423,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
13295                  * Clearly if the lock hasn't been acquired _ever_, we're not
13296                  * holding it either, so report failure.
13297                  */
13298 -               if (!class)
13299 +               if (IS_ERR_OR_NULL(class))
13300                         return 0;
13301  
13302                 /*
13303 @@ -3689,6 +3702,7 @@ static void check_flags(unsigned long flags)
13304                 }
13305         }
13306  
13307 +#ifndef CONFIG_PREEMPT_RT_FULL
13308         /*
13309          * We dont accurately track softirq state in e.g.
13310          * hardirq contexts (such as on 4KSTACKS), so only
13311 @@ -3703,6 +3717,7 @@ static void check_flags(unsigned long flags)
13312                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13313                 }
13314         }
13315 +#endif
13316  
13317         if (!debug_locks)
13318                 print_irqtrace_events(current);
13319 @@ -4159,7 +4174,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
13320                  * If the class exists we look it up and zap it:
13321                  */
13322                 class = look_up_lock_class(lock, j);
13323 -               if (class)
13324 +               if (!IS_ERR_OR_NULL(class))
13325                         zap_class(class);
13326         }
13327         /*
13328 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
13329 index f8c5af52a131..788068773e61 100644
13330 --- a/kernel/locking/locktorture.c
13331 +++ b/kernel/locking/locktorture.c
13332 @@ -26,7 +26,6 @@
13333  #include <linux/kthread.h>
13334  #include <linux/sched/rt.h>
13335  #include <linux/spinlock.h>
13336 -#include <linux/rwlock.h>
13337  #include <linux/mutex.h>
13338  #include <linux/rwsem.h>
13339  #include <linux/smp.h>
13340 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
13341 index ce182599cf2e..2ad3a1e8344c 100644
13342 --- a/kernel/locking/percpu-rwsem.c
13343 +++ b/kernel/locking/percpu-rwsem.c
13344 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
13345         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
13346         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
13347         __init_rwsem(&sem->rw_sem, name, rwsem_key);
13348 -       init_waitqueue_head(&sem->writer);
13349 +       init_swait_queue_head(&sem->writer);
13350         sem->readers_block = 0;
13351         return 0;
13352  }
13353 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
13354         __this_cpu_dec(*sem->read_count);
13355  
13356         /* Prod writer to recheck readers_active */
13357 -       wake_up(&sem->writer);
13358 +       swake_up(&sem->writer);
13359  }
13360  EXPORT_SYMBOL_GPL(__percpu_up_read);
13361  
13362 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
13363          */
13364  
13365         /* Wait for all now active readers to complete. */
13366 -       wait_event(sem->writer, readers_active_check(sem));
13367 +       swait_event(sem->writer, readers_active_check(sem));
13368  }
13369  EXPORT_SYMBOL_GPL(percpu_down_write);
13370  
13371 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
13372 new file mode 100644
13373 index 000000000000..6284e3b15091
13374 --- /dev/null
13375 +++ b/kernel/locking/rt.c
13376 @@ -0,0 +1,331 @@
13377 +/*
13378 + * kernel/rt.c
13379 + *
13380 + * Real-Time Preemption Support
13381 + *
13382 + * started by Ingo Molnar:
13383 + *
13384 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
13385 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13386 + *
13387 + * historic credit for proving that Linux spinlocks can be implemented via
13388 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
13389 + * and others) who prototyped it on 2.4 and did lots of comparative
13390 + * research and analysis; TimeSys, for proving that you can implement a
13391 + * fully preemptible kernel via the use of IRQ threading and mutexes;
13392 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
13393 + * right one; and to MontaVista, who ported pmutexes to 2.6.
13394 + *
13395 + * This code is a from-scratch implementation and is not based on pmutexes,
13396 + * but the idea of converting spinlocks to mutexes is used here too.
13397 + *
13398 + * lock debugging, locking tree, deadlock detection:
13399 + *
13400 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13401 + *  Released under the General Public License (GPL).
13402 + *
13403 + * Includes portions of the generic R/W semaphore implementation from:
13404 + *
13405 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
13406 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
13407 + *  - Derived also from comments by Linus
13408 + *
13409 + * Pending ownership of locks and ownership stealing:
13410 + *
13411 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
13412 + *
13413 + *   (also by Steven Rostedt)
13414 + *    - Converted single pi_lock to individual task locks.
13415 + *
13416 + * By Esben Nielsen:
13417 + *    Doing priority inheritance with help of the scheduler.
13418 + *
13419 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13420 + *  - major rework based on Esben Nielsens initial patch
13421 + *  - replaced thread_info references by task_struct refs
13422 + *  - removed task->pending_owner dependency
13423 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
13424 + *    in the scheduler return path as discussed with Steven Rostedt
13425 + *
13426 + *  Copyright (C) 2006, Kihon Technologies Inc.
13427 + *    Steven Rostedt <rostedt@goodmis.org>
13428 + *  - debugged and patched Thomas Gleixner's rework.
13429 + *  - added back the cmpxchg to the rework.
13430 + *  - turned atomic require back on for SMP.
13431 + */
13432 +
13433 +#include <linux/spinlock.h>
13434 +#include <linux/rtmutex.h>
13435 +#include <linux/sched.h>
13436 +#include <linux/delay.h>
13437 +#include <linux/module.h>
13438 +#include <linux/kallsyms.h>
13439 +#include <linux/syscalls.h>
13440 +#include <linux/interrupt.h>
13441 +#include <linux/plist.h>
13442 +#include <linux/fs.h>
13443 +#include <linux/futex.h>
13444 +#include <linux/hrtimer.h>
13445 +
13446 +#include "rtmutex_common.h"
13447 +
13448 +/*
13449 + * struct mutex functions
13450 + */
13451 +void __mutex_do_init(struct mutex *mutex, const char *name,
13452 +                    struct lock_class_key *key)
13453 +{
13454 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13455 +       /*
13456 +        * Make sure we are not reinitializing a held lock:
13457 +        */
13458 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
13459 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
13460 +#endif
13461 +       mutex->lock.save_state = 0;
13462 +}
13463 +EXPORT_SYMBOL(__mutex_do_init);
13464 +
13465 +void __lockfunc _mutex_lock(struct mutex *lock)
13466 +{
13467 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13468 +       rt_mutex_lock(&lock->lock);
13469 +}
13470 +EXPORT_SYMBOL(_mutex_lock);
13471 +
13472 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
13473 +{
13474 +       int ret;
13475 +
13476 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13477 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13478 +       if (ret)
13479 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13480 +       return ret;
13481 +}
13482 +EXPORT_SYMBOL(_mutex_lock_interruptible);
13483 +
13484 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
13485 +{
13486 +       int ret;
13487 +
13488 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13489 +       ret = rt_mutex_lock_killable(&lock->lock);
13490 +       if (ret)
13491 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13492 +       return ret;
13493 +}
13494 +EXPORT_SYMBOL(_mutex_lock_killable);
13495 +
13496 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13497 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
13498 +{
13499 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13500 +       rt_mutex_lock(&lock->lock);
13501 +}
13502 +EXPORT_SYMBOL(_mutex_lock_nested);
13503 +
13504 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
13505 +{
13506 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
13507 +       rt_mutex_lock(&lock->lock);
13508 +}
13509 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
13510 +
13511 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
13512 +{
13513 +       int ret;
13514 +
13515 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13516 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13517 +       if (ret)
13518 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13519 +       return ret;
13520 +}
13521 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
13522 +
13523 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
13524 +{
13525 +       int ret;
13526 +
13527 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13528 +       ret = rt_mutex_lock_killable(&lock->lock);
13529 +       if (ret)
13530 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13531 +       return ret;
13532 +}
13533 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
13534 +#endif
13535 +
13536 +int __lockfunc _mutex_trylock(struct mutex *lock)
13537 +{
13538 +       int ret = rt_mutex_trylock(&lock->lock);
13539 +
13540 +       if (ret)
13541 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13542 +
13543 +       return ret;
13544 +}
13545 +EXPORT_SYMBOL(_mutex_trylock);
13546 +
13547 +void __lockfunc _mutex_unlock(struct mutex *lock)
13548 +{
13549 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
13550 +       rt_mutex_unlock(&lock->lock);
13551 +}
13552 +EXPORT_SYMBOL(_mutex_unlock);
13553 +
13554 +/*
13555 + * rwlock_t functions
13556 + */
13557 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
13558 +{
13559 +       int ret;
13560 +
13561 +       migrate_disable();
13562 +       ret = rt_mutex_trylock(&rwlock->lock);
13563 +       if (ret)
13564 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13565 +       else
13566 +               migrate_enable();
13567 +
13568 +       return ret;
13569 +}
13570 +EXPORT_SYMBOL(rt_write_trylock);
13571 +
13572 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
13573 +{
13574 +       int ret;
13575 +
13576 +       *flags = 0;
13577 +       ret = rt_write_trylock(rwlock);
13578 +       return ret;
13579 +}
13580 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
13581 +
13582 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
13583 +{
13584 +       struct rt_mutex *lock = &rwlock->lock;
13585 +       int ret = 1;
13586 +
13587 +       /*
13588 +        * recursive read locks succeed when current owns the lock,
13589 +        * Recursive read locks succeed when current owns the lock,
13590 +        * but not when read_depth == 0, which means that current
13591 +        * holds the lock as a writer.
13592 +       if (rt_mutex_owner(lock) != current) {
13593 +               migrate_disable();
13594 +               ret = rt_mutex_trylock(lock);
13595 +               if (ret)
13596 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13597 +               else
13598 +                       migrate_enable();
13599 +
13600 +       } else if (!rwlock->read_depth) {
13601 +               ret = 0;
13602 +       }
13603 +
13604 +       if (ret)
13605 +               rwlock->read_depth++;
13606 +
13607 +       return ret;
13608 +}
13609 +EXPORT_SYMBOL(rt_read_trylock);
13610 +
13611 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
13612 +{
13613 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13614 +       __rt_spin_lock(&rwlock->lock);
13615 +}
13616 +EXPORT_SYMBOL(rt_write_lock);
13617 +
13618 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
13619 +{
13620 +       struct rt_mutex *lock = &rwlock->lock;
13621 +
13623 +       /*
13624 +        * recursive read locks succeed when current owns the lock
13625 +        */
13626 +       if (rt_mutex_owner(lock) != current) {
13627 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13628 +               __rt_spin_lock(lock);
13629 +       }
13630 +       rwlock->read_depth++;
13631 +}
13632 +
13633 +EXPORT_SYMBOL(rt_read_lock);
13634 +
13635 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
13636 +{
13637 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13638 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13639 +       __rt_spin_unlock(&rwlock->lock);
13640 +       migrate_enable();
13641 +}
13642 +EXPORT_SYMBOL(rt_write_unlock);
13643 +
13644 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
13645 +{
13646 +       /* Release the lock only when read_depth is down to 0 */
13647 +       if (--rwlock->read_depth == 0) {
13648 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13649 +               __rt_spin_unlock(&rwlock->lock);
13650 +               migrate_enable();
13651 +       }
13652 +}
13653 +EXPORT_SYMBOL(rt_read_unlock);
13654 +
13655 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
13656 +{
13657 +       rt_write_lock(rwlock);
13658 +
13659 +       return 0;
13660 +}
13661 +EXPORT_SYMBOL(rt_write_lock_irqsave);
13662 +
13663 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
13664 +{
13665 +       rt_read_lock(rwlock);
13666 +
13667 +       return 0;
13668 +}
13669 +EXPORT_SYMBOL(rt_read_lock_irqsave);
13670 +
13671 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
13672 +{
13673 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13674 +       /*
13675 +        * Make sure we are not reinitializing a held lock:
13676 +        */
13677 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
13678 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
13679 +#endif
13680 +       rwlock->lock.save_state = 1;
13681 +       rwlock->read_depth = 0;
13682 +}
13683 +EXPORT_SYMBOL(__rt_rwlock_init);
13684 +
13685 +/**
13686 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
13687 + * @cnt: the atomic which we are to dec
13688 + * @lock: the mutex to return holding if we dec to 0
13689 + *
13690 + * return true and hold lock if we dec to 0, return false otherwise
13691 + */
13692 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
13693 +{
13694 +       /* dec if we can't possibly hit 0 */
13695 +       if (atomic_add_unless(cnt, -1, 1))
13696 +               return 0;
13697 +       /* we might hit 0, so take the lock */
13698 +       mutex_lock(lock);
13699 +       if (!atomic_dec_and_test(cnt)) {
13700 +               /* when we actually did the dec, we didn't hit 0 */
13701 +               mutex_unlock(lock);
13702 +               return 0;
13703 +       }
13704 +       /* we hit 0, and we hold the lock */
13705 +       return 1;
13706 +}
13707 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
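
atomic_dec_and_mutex_lock() above combines a reference-count drop with a sleeping lock: it returns non-zero, with the mutex held, only for the decrement that reaches zero. A hypothetical usage sketch:

struct my_obj {
        atomic_t refcnt;
        struct mutex teardown_lock;
};

static void my_obj_put(struct my_obj *obj)
{
        if (!atomic_dec_and_mutex_lock(&obj->refcnt, &obj->teardown_lock))
                return;         /* not the last reference */

        /* last reference dropped: teardown runs under teardown_lock */
        /* ... release resources ... */
        mutex_unlock(&obj->teardown_lock);
}
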
13708 diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
13709 index 62b6cee8ea7f..0613c4b1d059 100644
13710 --- a/kernel/locking/rtmutex-debug.c
13711 +++ b/kernel/locking/rtmutex-debug.c
13712 @@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
13713         lock->name = name;
13714  }
13715  
13716 -void
13717 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
13718 -{
13719 -}
13720 -
13721 -void rt_mutex_deadlock_account_unlock(struct task_struct *task)
13722 -{
13723 -}
13724 -
13725 diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
13726 index d0519c3432b6..b585af9a1b50 100644
13727 --- a/kernel/locking/rtmutex-debug.h
13728 +++ b/kernel/locking/rtmutex-debug.h
13729 @@ -9,9 +9,6 @@
13730   * This file contains macros used solely by rtmutex.c. Debug version.
13731   */
13732  
13733 -extern void
13734 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
13735 -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
13736  extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
13737  extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
13738  extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
13739 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
13740 index 2c49d76f96c3..674ad9087eb5 100644
13741 --- a/kernel/locking/rtmutex.c
13742 +++ b/kernel/locking/rtmutex.c
13743 @@ -7,6 +7,11 @@
13744   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13745   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
13746   *  Copyright (C) 2006 Esben Nielsen
13747 + *  Adaptive Spinlocks:
13748 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
13749 + *                                  and Peter Morreale,
13750 + * Adaptive Spinlocks simplification:
13751 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
13752   *
13753   *  See Documentation/locking/rt-mutex-design.txt for details.
13754   */
13755 @@ -16,6 +21,7 @@
13756  #include <linux/sched/rt.h>
13757  #include <linux/sched/deadline.h>
13758  #include <linux/timer.h>
13759 +#include <linux/ww_mutex.h>
13760  
13761  #include "rtmutex_common.h"
13762  
13763 @@ -133,6 +139,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
13764                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
13765  }
13766  
13767 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
13768 +{
13769 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
13770 +               waiter != PI_REQUEUE_INPROGRESS;
13771 +}
13772 +
13773  /*
13774   * We can speed up the acquire/release, if there's no debugging state to be
13775   * set up.
13776 @@ -414,6 +426,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
13777         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
13778  }
13779  
13780 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
13781 +{
13782 +       if (waiter->savestate)
13783 +               wake_up_lock_sleeper(waiter->task);
13784 +       else
13785 +               wake_up_process(waiter->task);
13786 +}
13787 +
13788  /*
13789   * Max number of times we'll walk the boosting chain:
13790   */
13791 @@ -421,7 +441,8 @@ int max_lock_depth = 1024;
13792  
13793  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
13794  {
13795 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
13796 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
13797 +               p->pi_blocked_on->lock : NULL;
13798  }
13799  
13800  /*
13801 @@ -557,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13802          * reached or the state of the chain has changed while we
13803          * dropped the locks.
13804          */
13805 -       if (!waiter)
13806 +       if (!rt_mutex_real_waiter(waiter))
13807                 goto out_unlock_pi;
13808  
13809         /*
13810 @@ -719,13 +740,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13811          * follow here. This is the end of the chain we are walking.
13812          */
13813         if (!rt_mutex_owner(lock)) {
13814 +               struct rt_mutex_waiter *lock_top_waiter;
13815 +
13816                 /*
13817                  * If the requeue [7] above changed the top waiter,
13818                  * then we need to wake the new top waiter up to try
13819                  * to get the lock.
13820                  */
13821 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
13822 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
13823 +               lock_top_waiter = rt_mutex_top_waiter(lock);
13824 +               if (prerequeue_top_waiter != lock_top_waiter)
13825 +                       rt_mutex_wake_waiter(lock_top_waiter);
13826                 raw_spin_unlock_irq(&lock->wait_lock);
13827                 return 0;
13828         }
13829 @@ -818,6 +842,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13830         return ret;
13831  }
13832  
13833 +
13834 +#define STEAL_NORMAL  0
13835 +#define STEAL_LATERAL 1
13836 +
13837 +/*
13838 + * Note that RT tasks are excluded from lateral-steals to prevent the
13839 + * introduction of an unbounded latency.
13840 + */
13841 +static inline int lock_is_stealable(struct task_struct *task,
13842 +                                   struct task_struct *pendowner, int mode)
13843 +{
13844 +       if (mode == STEAL_NORMAL || rt_task(task)) {
13845 +               if (task->prio >= pendowner->prio)
13846 +                       return 0;
13847 +       } else if (task->prio > pendowner->prio)
13848 +               return 0;
13849 +       return 1;
13850 +}
13851 +
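To see what lateral stealing permits, here is a minimal standalone sketch of the same decision. fake_task and is_rt are simplified stand-ins for the kernel's task_struct and rt_task(), and the kernel convention that a lower ->prio value means higher priority is kept.

#include <stdio.h>
#include <stdbool.h>

#define STEAL_NORMAL  0
#define STEAL_LATERAL 1

struct fake_task { int prio; bool is_rt; };

static int lock_is_stealable(struct fake_task *task,
			     struct fake_task *pendowner, int mode)
{
	if (mode == STEAL_NORMAL || task->is_rt) {
		if (task->prio >= pendowner->prio)
			return 0;
	} else if (task->prio > pendowner->prio)
		return 0;
	return 1;
}

int main(void)
{
	struct fake_task waiter = { .prio = 10, .is_rt = false };
	struct fake_task top    = { .prio = 10, .is_rt = false };

	/* Equal priority: only a lateral steal is allowed. */
	printf("STEAL_NORMAL:  %d\n",
	       lock_is_stealable(&waiter, &top, STEAL_NORMAL));	/* 0 */
	printf("STEAL_LATERAL: %d\n",
	       lock_is_stealable(&waiter, &top, STEAL_LATERAL));	/* 1 */
	return 0;
}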
13852  /*
13853   * Try to take an rt-mutex
13854   *
13855 @@ -828,8 +871,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13856   * @waiter: The waiter that is queued to the lock's wait tree if the
13857   *         callsite called task_blocked_on_lock(), otherwise NULL
13858   */
13859 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13860 -                               struct rt_mutex_waiter *waiter)
13861 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
13862 +                                 struct task_struct *task,
13863 +                                 struct rt_mutex_waiter *waiter, int mode)
13864  {
13865         /*
13866          * Before testing whether we can acquire @lock, we set the
13867 @@ -866,8 +910,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13868                  * If waiter is not the highest priority waiter of
13869                  * @lock, give up.
13870                  */
13871 -               if (waiter != rt_mutex_top_waiter(lock))
13872 +               if (waiter != rt_mutex_top_waiter(lock)) {
13873 +                       /* XXX lock_is_stealable() ? */
13874                         return 0;
13875 +               }
13876  
13877                 /*
13878                  * We can acquire the lock. Remove the waiter from the
13879 @@ -885,14 +931,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13880                  * not need to be dequeued.
13881                  */
13882                 if (rt_mutex_has_waiters(lock)) {
13883 -                       /*
13884 -                        * If @task->prio is greater than or equal to
13885 -                        * the top waiter priority (kernel view),
13886 -                        * @task lost.
13887 -                        */
13888 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
13889 -                               return 0;
13890 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
13891  
13892 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
13893 +                               return 0;
13894                         /*
13895                          * The current top waiter stays enqueued. We
13896                          * don't have to change anything in the lock
13897 @@ -936,11 +978,395 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13898          */
13899         rt_mutex_set_owner(lock, task);
13900  
13901 -       rt_mutex_deadlock_account_lock(lock, task);
13902 -
13903         return 1;
13904  }
13905  
13906 +#ifdef CONFIG_PREEMPT_RT_FULL
13907 +/*
13908 + * preemptible spin_lock functions:
13909 + */
13910 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
13911 +                                        void  (*slowfn)(struct rt_mutex *lock,
13912 +                                                        bool mg_off),
13913 +                                        bool do_mig_dis)
13914 +{
13915 +       might_sleep_no_state_check();
13916 +
13917 +       if (do_mig_dis)
13918 +               migrate_disable();
13919 +
13920 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
13921 +               return;
13922 +       else
13923 +               slowfn(lock, do_mig_dis);
13924 +}
13925 +
13926 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
13927 +                                          void  (*slowfn)(struct rt_mutex *lock))
13928 +{
13929 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
13930 +               return;
13931 +       else
13932 +               slowfn(lock);
13933 +}
13934 +#ifdef CONFIG_SMP
13935 +/*
13936 + * Note that owner is a speculative pointer and dereferencing relies
13937 + * on rcu_read_lock() and the check against the lock owner.
13938 + */
13939 +static int adaptive_wait(struct rt_mutex *lock,
13940 +                        struct task_struct *owner)
13941 +{
13942 +       int res = 0;
13943 +
13944 +       rcu_read_lock();
13945 +       for (;;) {
13946 +               if (owner != rt_mutex_owner(lock))
13947 +                       break;
13948 +               /*
13949 +                * Ensure that owner->on_cpu is dereferenced _after_
13950 +                * checking the above to be valid.
13951 +                */
13952 +               barrier();
13953 +               if (!owner->on_cpu) {
13954 +                       res = 1;
13955 +                       break;
13956 +               }
13957 +               cpu_relax();
13958 +       }
13959 +       rcu_read_unlock();
13960 +       return res;
13961 +}
13962 +#else
13963 +static int adaptive_wait(struct rt_mutex *lock,
13964 +                        struct task_struct *orig_owner)
13965 +{
13966 +       return 1;
13967 +}
13968 +#endif
13969 +
13970 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13971 +                                  struct rt_mutex_waiter *waiter,
13972 +                                  struct task_struct *task,
13973 +                                  enum rtmutex_chainwalk chwalk);
13974 +/*
13975 + * Slow path lock function spin_lock style: this variant is very
13976 + * careful not to miss any non-lock wakeups.
13977 + *
13978 + * We store the current state under p->pi_lock in p->saved_state and
13979 + * the try_to_wake_up() code handles this accordingly.
13980 + */
13981 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
13982 +                                                   bool mg_off)
13983 +{
13984 +       struct task_struct *lock_owner, *self = current;
13985 +       struct rt_mutex_waiter waiter, *top_waiter;
13986 +       unsigned long flags;
13987 +       int ret;
13988 +
13989 +       rt_mutex_init_waiter(&waiter, true);
13990 +
13991 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13992 +
13993 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
13994 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13995 +               return;
13996 +       }
13997 +
13998 +       BUG_ON(rt_mutex_owner(lock) == self);
13999 +
14000 +       /*
14001 +        * We save whatever state the task is in and we'll restore it
14002 +        * after acquiring the lock taking real wakeups into account
14003 +        * as well. We are serialized via pi_lock against wakeups. See
14004 +        * try_to_wake_up().
14005 +        */
14006 +       raw_spin_lock(&self->pi_lock);
14007 +       self->saved_state = self->state;
14008 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14009 +       raw_spin_unlock(&self->pi_lock);
14010 +
14011 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
14012 +       BUG_ON(ret);
14013 +
14014 +       for (;;) {
14015 +               /* Try to acquire the lock again. */
14016 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
14017 +                       break;
14018 +
14019 +               top_waiter = rt_mutex_top_waiter(lock);
14020 +               lock_owner = rt_mutex_owner(lock);
14021 +
14022 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14023 +
14024 +               debug_rt_mutex_print_deadlock(&waiter);
14025 +
14026 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
14027 +                       if (mg_off)
14028 +                               migrate_enable();
14029 +                       schedule();
14030 +                       if (mg_off)
14031 +                               migrate_disable();
14032 +               }
14033 +
14034 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
14035 +
14036 +               raw_spin_lock(&self->pi_lock);
14037 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14038 +               raw_spin_unlock(&self->pi_lock);
14039 +       }
14040 +
14041 +       /*
14042 +        * Restore the task state to current->saved_state. We set it
14043 +        * to the original state above and the try_to_wake_up() code
14044 +        * has possibly updated it when a real (non-rtmutex) wakeup
14045 +        * happened while we were blocked. Clear saved_state so
14046 +        * try_to_wakeup() does not get confused.
14047 +        */
14048 +       raw_spin_lock(&self->pi_lock);
14049 +       __set_current_state_no_track(self->saved_state);
14050 +       self->saved_state = TASK_RUNNING;
14051 +       raw_spin_unlock(&self->pi_lock);
14052 +
14053 +       /*
14054 +        * try_to_take_rt_mutex() sets the waiter bit
14055 +        * unconditionally. We might have to fix that up:
14056 +        */
14057 +       fixup_rt_mutex_waiters(lock);
14058 +
14059 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
14060 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
14061 +
14062 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14063 +
14064 +       debug_rt_mutex_free_waiter(&waiter);
14065 +}
14066 +
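A hypothetical caller-side fragment (struct my_dev and my_dev_wait_for_data() are invented for illustration) showing why the saved_state handling above matters: the sleep state set before taking a spinlock_t survives blocking on the lock in the slow path, so a real wakeup delivered while blocked is not lost.

#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_dev {
	spinlock_t	lock;
	bool		data_ready;
};

static void my_dev_wait_for_data(struct my_dev *dev)
{
	set_current_state(TASK_INTERRUPTIBLE);

	spin_lock(&dev->lock);		/* may block on PREEMPT_RT_FULL; the
					   slow path parks TASK_INTERRUPTIBLE
					   in saved_state meanwhile */
	if (dev->data_ready)
		__set_current_state(TASK_RUNNING);
	spin_unlock(&dev->lock);	/* original (or woken) state restored */

	schedule();			/* does not wait if the state was
					   reset to TASK_RUNNING above */
}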
14067 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14068 +                                   struct wake_q_head *wake_sleeper_q,
14069 +                                   struct rt_mutex *lock);
14070 +/*
14071 + * Slow path to release a rt_mutex spin_lock style
14072 + */
14073 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
14074 +{
14075 +       unsigned long flags;
14076 +       WAKE_Q(wake_q);
14077 +       WAKE_Q(wake_sleeper_q);
14078 +
14079 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14080 +
14081 +       debug_rt_mutex_unlock(lock);
14082 +
14083 +       if (!rt_mutex_has_waiters(lock)) {
14084 +               lock->owner = NULL;
14085 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14086 +               return;
14087 +       }
14088 +
14089 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
14090 +
14091 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14092 +       wake_up_q(&wake_q);
14093 +       wake_up_q_sleeper(&wake_sleeper_q);
14094 +
14095 +       /* Undo pi boosting when necessary */
14096 +       rt_mutex_adjust_prio(current);
14097 +}
14098 +
14099 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
14100 +{
14101 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
14102 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14103 +}
14104 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
14105 +
14106 +void __lockfunc rt_spin_lock(spinlock_t *lock)
14107 +{
14108 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14109 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14110 +}
14111 +EXPORT_SYMBOL(rt_spin_lock);
14112 +
14113 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
14114 +{
14115 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
14116 +}
14117 +EXPORT_SYMBOL(__rt_spin_lock);
14118 +
14119 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
14120 +{
14121 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
14122 +}
14123 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
14124 +
14125 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14126 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
14127 +{
14128 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
14129 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14130 +}
14131 +EXPORT_SYMBOL(rt_spin_lock_nested);
14132 +#endif
14133 +
14134 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
14135 +{
14136 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14137 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14138 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14139 +}
14140 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
14141 +
14142 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
14143 +{
14144 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14145 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14146 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14147 +       migrate_enable();
14148 +}
14149 +EXPORT_SYMBOL(rt_spin_unlock);
14150 +
14151 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
14152 +{
14153 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
14154 +}
14155 +EXPORT_SYMBOL(__rt_spin_unlock);
14156 +
14157 +/*
14158 + * Wait for the lock to get unlocked: instead of polling for an unlock
14159 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
14160 + * schedule if there's contention:
14161 + */
14162 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
14163 +{
14164 +       spin_lock(lock);
14165 +       spin_unlock(lock);
14166 +}
14167 +EXPORT_SYMBOL(rt_spin_unlock_wait);
14168 +
14169 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
14170 +{
14171 +       int ret;
14172 +
14173 +       ret = rt_mutex_trylock(&lock->lock);
14174 +       if (ret)
14175 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14176 +       return ret;
14177 +}
14178 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
14179 +
14180 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
14181 +{
14182 +       int ret;
14183 +
14184 +       migrate_disable();
14185 +       ret = rt_mutex_trylock(&lock->lock);
14186 +       if (ret)
14187 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14188 +       else
14189 +               migrate_enable();
14190 +       return ret;
14191 +}
14192 +EXPORT_SYMBOL(rt_spin_trylock);
14193 +
14194 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
14195 +{
14196 +       int ret;
14197 +
14198 +       local_bh_disable();
14199 +       ret = rt_mutex_trylock(&lock->lock);
14200 +       if (ret) {
14201 +               migrate_disable();
14202 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14203 +       } else
14204 +               local_bh_enable();
14205 +       return ret;
14206 +}
14207 +EXPORT_SYMBOL(rt_spin_trylock_bh);
14208 +
14209 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
14210 +{
14211 +       int ret;
14212 +
14213 +       *flags = 0;
14214 +       ret = rt_mutex_trylock(&lock->lock);
14215 +       if (ret) {
14216 +               migrate_disable();
14217 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14218 +       }
14219 +       return ret;
14220 +}
14221 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
14222 +
14223 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
14224 +{
14225 +       /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
14226 +       if (atomic_add_unless(atomic, -1, 1))
14227 +               return 0;
14228 +       rt_spin_lock(lock);
14229 +       if (atomic_dec_and_test(atomic))
14230 +               return 1;
14231 +       rt_spin_unlock(lock);
14232 +       return 0;
14233 +}
14234 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
14235 +
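A hypothetical usage sketch of atomic_dec_and_spin_lock() (struct cache_entry, cache_lock and cache_entry_put() are invented names): the usual drop-a-reference pattern that takes the lock only when the count reaches zero, written against the RT helper above, which returns 1 with the lock held only in that case.

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct cache_entry {
	atomic_t		refcnt;
	struct list_head	node;
};

static DEFINE_SPINLOCK(cache_lock);	/* protects the cache list */

static void cache_entry_put(struct cache_entry *e)
{
	/* Returns 1 with cache_lock held iff the count dropped to zero. */
	if (!atomic_dec_and_spin_lock(&e->refcnt, &cache_lock))
		return;

	list_del(&e->node);
	rt_spin_unlock(&cache_lock);
	kfree(e);
}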
14236 +void
14237 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
14238 +{
14239 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14240 +       /*
14241 +        * Make sure we are not reinitializing a held lock:
14242 +        */
14243 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
14244 +       lockdep_init_map(&lock->dep_map, name, key, 0);
14245 +#endif
14246 +}
14247 +EXPORT_SYMBOL(__rt_spin_lock_init);
14248 +
14249 +#endif /* PREEMPT_RT_FULL */
14250 +
14251 +#ifdef CONFIG_PREEMPT_RT_FULL
14252 +static inline int __sched
14253 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14254 +{
14255 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14256 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
14257 +
14258 +       if (!hold_ctx)
14259 +               return 0;
14260 +
14261 +       if (unlikely(ctx == hold_ctx))
14262 +               return -EALREADY;
14263 +
14264 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
14265 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
14266 +#ifdef CONFIG_DEBUG_MUTEXES
14267 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
14268 +               ctx->contending_lock = ww;
14269 +#endif
14270 +               return -EDEADLK;
14271 +       }
14272 +
14273 +       return 0;
14274 +}
14275 +#else
14276 +static inline int __sched
14277 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14278 +{
14279 +       BUG();
14280 +       return 0;
14281 +}
14282 +
14283 +#endif
14284 +
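For context, a caller-side sketch of the wound/wait protocol that the stamp check above enforces. my_ww_class, lock_pair() and example() are invented names; the ww_mutex calls themselves are the standard <linux/ww_mutex.h> API.

#include <linux/kernel.h>
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(my_ww_class);

/* Acquire two ww_mutexes in either order, backing off when wounded. */
static void lock_pair(struct ww_mutex *a, struct ww_mutex *b,
		      struct ww_acquire_ctx *ctx)
{
	struct ww_mutex *first = a, *second = b;

	ww_mutex_lock(first, ctx);	/* first lock in a fresh ctx cannot deadlock */
	while (ww_mutex_lock(second, ctx) == -EDEADLK) {
		/*
		 * Wounded by an older context: release what we hold,
		 * sleep on the contended lock, then retry in that order.
		 */
		ww_mutex_unlock(first);
		ww_mutex_lock_slow(second, ctx);
		swap(first, second);
	}
}

static void example(struct ww_mutex *buf_a, struct ww_mutex *buf_b)
{
	struct ww_acquire_ctx ctx;

	ww_acquire_init(&ctx, &my_ww_class);
	lock_pair(buf_a, buf_b, &ctx);
	ww_acquire_done(&ctx);

	/* ... both buffers are locked here ... */

	ww_mutex_unlock(buf_a);
	ww_mutex_unlock(buf_b);
	ww_acquire_fini(&ctx);
}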
14285 +static inline int
14286 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14287 +                    struct rt_mutex_waiter *waiter)
14288 +{
14289 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
14290 +}
14291 +
14292  /*
14293   * Task blocks on lock.
14294   *
14295 @@ -971,6 +1397,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14296                 return -EDEADLK;
14297  
14298         raw_spin_lock(&task->pi_lock);
14299 +
14300 +       /*
14301 +        * In the case of futex requeue PI, this will be a proxy
14302 +        * lock. The task will wake unaware that it is enqueued on
14303 +        * this lock. Avoid blocking on two locks and corrupting
14304 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
14305 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
14306 +        * before requeue (due to a signal or timeout). Do not enqueue
14307 +        * the task if PI_WAKEUP_INPROGRESS is set.
14308 +        */
14309 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
14310 +               raw_spin_unlock(&task->pi_lock);
14311 +               return -EAGAIN;
14312 +       }
14313 +
14314 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
14315 +
14316         __rt_mutex_adjust_prio(task);
14317         waiter->task = task;
14318         waiter->lock = lock;
14319 @@ -994,7 +1437,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14320                 rt_mutex_enqueue_pi(owner, waiter);
14321  
14322                 __rt_mutex_adjust_prio(owner);
14323 -               if (owner->pi_blocked_on)
14324 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
14325                         chain_walk = 1;
14326         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14327                 chain_walk = 1;
14328 @@ -1036,6 +1479,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14329   * Called with lock->wait_lock held and interrupts disabled.
14330   */
14331  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14332 +                                   struct wake_q_head *wake_sleeper_q,
14333                                     struct rt_mutex *lock)
14334  {
14335         struct rt_mutex_waiter *waiter;
14336 @@ -1064,7 +1508,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14337  
14338         raw_spin_unlock(&current->pi_lock);
14339  
14340 -       wake_q_add(wake_q, waiter->task);
14341 +       if (waiter->savestate)
14342 +               wake_q_add(wake_sleeper_q, waiter->task);
14343 +       else
14344 +               wake_q_add(wake_q, waiter->task);
14345  }
14346  
14347  /*
14348 @@ -1078,7 +1525,7 @@ static void remove_waiter(struct rt_mutex *lock,
14349  {
14350         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14351         struct task_struct *owner = rt_mutex_owner(lock);
14352 -       struct rt_mutex *next_lock;
14353 +       struct rt_mutex *next_lock = NULL;
14354  
14355         raw_spin_lock(&current->pi_lock);
14356         rt_mutex_dequeue(lock, waiter);
14357 @@ -1102,7 +1549,8 @@ static void remove_waiter(struct rt_mutex *lock,
14358         __rt_mutex_adjust_prio(owner);
14359  
14360         /* Store the lock on which owner is blocked or NULL */
14361 -       next_lock = task_blocked_on_lock(owner);
14362 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
14363 +               next_lock = task_blocked_on_lock(owner);
14364  
14365         raw_spin_unlock(&owner->pi_lock);
14366  
14367 @@ -1138,21 +1586,30 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14368         raw_spin_lock_irqsave(&task->pi_lock, flags);
14369  
14370         waiter = task->pi_blocked_on;
14371 -       if (!waiter || (waiter->prio == task->prio &&
14372 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
14373                         !dl_prio(task->prio))) {
14374                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14375                 return;
14376         }
14377         next_lock = waiter->lock;
14378 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14379  
14380         /* gets dropped in rt_mutex_adjust_prio_chain()! */
14381         get_task_struct(task);
14382  
14383 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14384         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
14385                                    next_lock, NULL, task);
14386  }
14387  
14388 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
14389 +{
14390 +       debug_rt_mutex_init_waiter(waiter);
14391 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
14392 +       RB_CLEAR_NODE(&waiter->tree_entry);
14393 +       waiter->task = NULL;
14394 +       waiter->savestate = savestate;
14395 +}
14396 +
14397  /**
14398   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
14399   * @lock:               the rt_mutex to take
14400 @@ -1166,7 +1623,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14401  static int __sched
14402  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14403                     struct hrtimer_sleeper *timeout,
14404 -                   struct rt_mutex_waiter *waiter)
14405 +                   struct rt_mutex_waiter *waiter,
14406 +                   struct ww_acquire_ctx *ww_ctx)
14407  {
14408         int ret = 0;
14409  
14410 @@ -1175,16 +1633,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14411                 if (try_to_take_rt_mutex(lock, current, waiter))
14412                         break;
14413  
14414 -               /*
14415 -                * TASK_INTERRUPTIBLE checks for signals and
14416 -                * timeout. Ignored otherwise.
14417 -                */
14418 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
14419 -                       /* Signal pending? */
14420 -                       if (signal_pending(current))
14421 -                               ret = -EINTR;
14422 -                       if (timeout && !timeout->task)
14423 -                               ret = -ETIMEDOUT;
14424 +               if (timeout && !timeout->task) {
14425 +                       ret = -ETIMEDOUT;
14426 +                       break;
14427 +               }
14428 +               if (signal_pending_state(state, current)) {
14429 +                       ret = -EINTR;
14430 +                       break;
14431 +               }
14432 +
14433 +               if (ww_ctx && ww_ctx->acquired > 0) {
14434 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
14435                         if (ret)
14436                                 break;
14437                 }
14438 @@ -1223,21 +1682,148 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
14439         }
14440  }
14441  
14442 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
14443 +                                                  struct ww_acquire_ctx *ww_ctx)
14444 +{
14445 +#ifdef CONFIG_DEBUG_MUTEXES
14446 +       /*
14447 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
14448 +        * but released with a normal mutex_unlock in this call.
14449 +        *
14450 +        * This should never happen, always use ww_mutex_unlock.
14451 +        */
14452 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
14453 +
14454 +       /*
14455 +        * Not quite done after calling ww_acquire_done() ?
14456 +        */
14457 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
14458 +
14459 +       if (ww_ctx->contending_lock) {
14460 +               /*
14461 +                * After -EDEADLK you tried to
14462 +                * acquire a different ww_mutex? Bad!
14463 +                */
14464 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
14465 +
14466 +               /*
14467 +                * You called ww_mutex_lock after receiving -EDEADLK,
14468 +                * but 'forgot' to unlock everything else first?
14469 +                */
14470 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
14471 +               ww_ctx->contending_lock = NULL;
14472 +       }
14473 +
14474 +       /*
14475 +        * Naughty, using a different class will lead to undefined behavior!
14476 +        */
14477 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
14478 +#endif
14479 +       ww_ctx->acquired++;
14480 +}
14481 +
14482 +#ifdef CONFIG_PREEMPT_RT_FULL
14483 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14484 +                                 struct ww_acquire_ctx *ww_ctx)
14485 +{
14486 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14487 +       struct rt_mutex_waiter *waiter, *n;
14488 +
14489 +       /*
14490 +        * This branch gets optimized out for the common case,
14491 +        * and is only important for ww_mutex_lock.
14492 +        */
14493 +       ww_mutex_lock_acquired(ww, ww_ctx);
14494 +       ww->ctx = ww_ctx;
14495 +
14496 +       /*
14497 +        * Give any possible sleeping processes the chance to wake up,
14498 +        * so they can recheck if they have to back off.
14499 +        */
14500 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
14501 +                                            tree_entry) {
14502 +               /* XXX debug rt mutex waiter wakeup */
14503 +
14504 +               BUG_ON(waiter->lock != lock);
14505 +               rt_mutex_wake_waiter(waiter);
14506 +       }
14507 +}
14508 +
14509 +#else
14510 +
14511 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14512 +                                 struct ww_acquire_ctx *ww_ctx)
14513 +{
14514 +       BUG();
14515 +}
14516 +#endif
14517 +
14518 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
14519 +                                    struct hrtimer_sleeper *timeout,
14520 +                                    enum rtmutex_chainwalk chwalk,
14521 +                                    struct ww_acquire_ctx *ww_ctx,
14522 +                                    struct rt_mutex_waiter *waiter)
14523 +{
14524 +       int ret;
14525 +
14526 +       /* Try to acquire the lock again: */
14527 +       if (try_to_take_rt_mutex(lock, current, NULL)) {
14528 +               if (ww_ctx)
14529 +                       ww_mutex_account_lock(lock, ww_ctx);
14530 +               return 0;
14531 +       }
14532 +
14533 +       set_current_state(state);
14534 +
14535 +       /* Setup the timer, when timeout != NULL */
14536 +       if (unlikely(timeout))
14537 +               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
14538 +
14539 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
14540 +
14541 +       if (likely(!ret)) {
14542 +               /* sleep on the mutex */
14543 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
14544 +                                         ww_ctx);
14545 +       } else if (ww_ctx) {
14546 +               /* ww_mutex received EDEADLK, let it become EALREADY */
14547 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
14548 +               BUG_ON(!ret);
14549 +       }
14550 +
14551 +       if (unlikely(ret)) {
14552 +               __set_current_state(TASK_RUNNING);
14553 +               if (rt_mutex_has_waiters(lock))
14554 +                       remove_waiter(lock, waiter);
14555 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
14556 +               if (!ww_ctx)
14557 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
14558 +       } else if (ww_ctx) {
14559 +               ww_mutex_account_lock(lock, ww_ctx);
14560 +       }
14561 +
14562 +       /*
14563 +        * try_to_take_rt_mutex() sets the waiter bit
14564 +        * unconditionally. We might have to fix that up.
14565 +        */
14566 +       fixup_rt_mutex_waiters(lock);
14567 +       return ret;
14568 +}
14569 +
14570  /*
14571   * Slow path lock function:
14572   */
14573  static int __sched
14574  rt_mutex_slowlock(struct rt_mutex *lock, int state,
14575                   struct hrtimer_sleeper *timeout,
14576 -                 enum rtmutex_chainwalk chwalk)
14577 +                 enum rtmutex_chainwalk chwalk,
14578 +                 struct ww_acquire_ctx *ww_ctx)
14579  {
14580         struct rt_mutex_waiter waiter;
14581         unsigned long flags;
14582         int ret = 0;
14583  
14584 -       debug_rt_mutex_init_waiter(&waiter);
14585 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
14586 -       RB_CLEAR_NODE(&waiter.tree_entry);
14587 +       rt_mutex_init_waiter(&waiter, false);
14588  
14589         /*
14590          * Technically we could use raw_spin_[un]lock_irq() here, but this can
14591 @@ -1249,36 +1835,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
14592          */
14593         raw_spin_lock_irqsave(&lock->wait_lock, flags);
14594  
14595 -       /* Try to acquire the lock again: */
14596 -       if (try_to_take_rt_mutex(lock, current, NULL)) {
14597 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14598 -               return 0;
14599 -       }
14600 -
14601 -       set_current_state(state);
14602 -
14603 -       /* Setup the timer, when timeout != NULL */
14604 -       if (unlikely(timeout))
14605 -               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
14606 -
14607 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
14608 -
14609 -       if (likely(!ret))
14610 -               /* sleep on the mutex */
14611 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
14612 -
14613 -       if (unlikely(ret)) {
14614 -               __set_current_state(TASK_RUNNING);
14615 -               if (rt_mutex_has_waiters(lock))
14616 -                       remove_waiter(lock, &waiter);
14617 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
14618 -       }
14619 -
14620 -       /*
14621 -        * try_to_take_rt_mutex() sets the waiter bit
14622 -        * unconditionally. We might have to fix that up.
14623 -        */
14624 -       fixup_rt_mutex_waiters(lock);
14625 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
14626 +                                      &waiter);
14627  
14628         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14629  
14630 @@ -1331,7 +1889,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
14631   * Return whether the current task needs to undo a potential priority boosting.
14632   */
14633  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14634 -                                       struct wake_q_head *wake_q)
14635 +                                       struct wake_q_head *wake_q,
14636 +                                       struct wake_q_head *wake_sleeper_q)
14637  {
14638         unsigned long flags;
14639  
14640 @@ -1340,8 +1899,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14641  
14642         debug_rt_mutex_unlock(lock);
14643  
14644 -       rt_mutex_deadlock_account_unlock(current);
14645 -
14646         /*
14647          * We must be careful here if the fast path is enabled. If we
14648          * have no waiters queued we cannot set owner to NULL here
14649 @@ -1387,7 +1944,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14650          *
14651          * Queue the next waiter for wakeup once we release the wait_lock.
14652          */
14653 -       mark_wakeup_next_waiter(wake_q, lock);
14654 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
14655  
14656         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14657  
14658 @@ -1403,63 +1960,79 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14659   */
14660  static inline int
14661  rt_mutex_fastlock(struct rt_mutex *lock, int state,
14662 +                 struct ww_acquire_ctx *ww_ctx,
14663                   int (*slowfn)(struct rt_mutex *lock, int state,
14664                                 struct hrtimer_sleeper *timeout,
14665 -                               enum rtmutex_chainwalk chwalk))
14666 +                               enum rtmutex_chainwalk chwalk,
14667 +                               struct ww_acquire_ctx *ww_ctx))
14668  {
14669 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14670 -               rt_mutex_deadlock_account_lock(lock, current);
14671 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14672                 return 0;
14673 -       } else
14674 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
14675 +
14676 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
14677  }
14678  
14679  static inline int
14680  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
14681                         struct hrtimer_sleeper *timeout,
14682                         enum rtmutex_chainwalk chwalk,
14683 +                       struct ww_acquire_ctx *ww_ctx,
14684                         int (*slowfn)(struct rt_mutex *lock, int state,
14685                                       struct hrtimer_sleeper *timeout,
14686 -                                     enum rtmutex_chainwalk chwalk))
14687 +                                     enum rtmutex_chainwalk chwalk,
14688 +                                     struct ww_acquire_ctx *ww_ctx))
14689  {
14690         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
14691 -           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14692 -               rt_mutex_deadlock_account_lock(lock, current);
14693 +           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14694                 return 0;
14695 -       } else
14696 -               return slowfn(lock, state, timeout, chwalk);
14697 +
14698 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
14699  }
14700  
14701  static inline int
14702  rt_mutex_fasttrylock(struct rt_mutex *lock,
14703                      int (*slowfn)(struct rt_mutex *lock))
14704  {
14705 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14706 -               rt_mutex_deadlock_account_lock(lock, current);
14707 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14708                 return 1;
14709 -       }
14710 +
14711         return slowfn(lock);
14712  }
14713  
14714  static inline void
14715  rt_mutex_fastunlock(struct rt_mutex *lock,
14716                     bool (*slowfn)(struct rt_mutex *lock,
14717 -                                  struct wake_q_head *wqh))
14718 +                                  struct wake_q_head *wqh,
14719 +                                  struct wake_q_head *wq_sleeper))
14720  {
14721         WAKE_Q(wake_q);
14722 +       WAKE_Q(wake_sleeper_q);
14723 +       bool deboost;
14724  
14725 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
14726 -               rt_mutex_deadlock_account_unlock(current);
14727 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
14728 +               return;
14729  
14730 -       } else {
14731 -               bool deboost = slowfn(lock, &wake_q);
14732 +       deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
14733  
14734 -               wake_up_q(&wake_q);
14735 +       wake_up_q(&wake_q);
14736 +       wake_up_q_sleeper(&wake_sleeper_q);
14737  
14738 -               /* Undo pi boosting if necessary: */
14739 -               if (deboost)
14740 -                       rt_mutex_adjust_prio(current);
14741 -       }
14742 +       /* Undo pi boosting if necessary: */
14743 +       if (deboost)
14744 +               rt_mutex_adjust_prio(current);
14745 +}
14746 +
14747 +/**
14748 + * rt_mutex_lock_state - lock a rt_mutex with a given state
14749 + *
14750 + * @lock:      The rt_mutex to be locked
14751 + * @state:     The state to set when blocking on the rt_mutex
14752 + */
14753 +int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
14754 +{
14755 +       might_sleep();
14756 +
14757 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
14758  }
14759  
14760  /**
14761 @@ -1469,15 +2042,13 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
14762   */
14763  void __sched rt_mutex_lock(struct rt_mutex *lock)
14764  {
14765 -       might_sleep();
14766 -
14767 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
14768 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
14769  }
14770  EXPORT_SYMBOL_GPL(rt_mutex_lock);
14771  
14772  /**
14773   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
14774 - *
14775 + *
14776   * @lock:              the rt_mutex to be locked
14777   *
14778   * Returns:
14779 @@ -1486,23 +2057,32 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
14780   */
14781  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
14782  {
14783 -       might_sleep();
14784 -
14785 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
14786 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
14787  }
14788  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
14789  
14790 -/*
14791 - * Futex variant with full deadlock detection.
14792 +/**
14793 + * rt_mutex_lock_killable - lock a rt_mutex killable
14794 + *
14795 + * @lock:              the rt_mutex to be locked
14796 + *
14797 + * Returns:
14798 + *  0          on success
14799 + * -EINTR      when interrupted by a fatal signal
14800 + *
14801 + */
14802 -int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
14803 -                             struct hrtimer_sleeper *timeout)
14804 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
14805  {
14806 -       might_sleep();
14807 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
14808 +}
14809 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
14810  
14811 -       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
14812 -                                      RT_MUTEX_FULL_CHAINWALK,
14813 -                                      rt_mutex_slowlock);
14814 +/*
14815 + * Futex variant, must not use fastpath.
14816 + */
14817 +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
14818 +{
14819 +       return rt_mutex_slowtrylock(lock);
14820  }
14821  
14822  /**
14823 @@ -1525,6 +2105,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
14824  
14825         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
14826                                        RT_MUTEX_MIN_CHAINWALK,
14827 +                                      NULL,
14828                                        rt_mutex_slowlock);
14829  }
14830  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
14831 @@ -1542,7 +2123,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
14832   */
14833  int __sched rt_mutex_trylock(struct rt_mutex *lock)
14834  {
14835 +#ifdef CONFIG_PREEMPT_RT_FULL
14836 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
14837 +#else
14838         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
14839 +#endif
14840                 return 0;
14841  
14842         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
14843 @@ -1561,20 +2146,41 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
14844  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
14845  
14846  /**
14847 - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
14848 - * @lock: the rt_mutex to be unlocked
14849 - *
14850 - * Returns: true/false indicating whether priority adjustment is
14851 - * required or not.
14852 + * Futex variant; since futex variants do not use the fast path, this can
14853 + * be simple and does not need to retry.
14854   */
14855 -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
14856 -                                  struct wake_q_head *wqh)
14857 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
14858 +                                   struct wake_q_head *wake_q,
14859 +                                   struct wake_q_head *wq_sleeper)
14860  {
14861 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
14862 -               rt_mutex_deadlock_account_unlock(current);
14863 -               return false;
14864 +       lockdep_assert_held(&lock->wait_lock);
14865 +
14866 +       debug_rt_mutex_unlock(lock);
14867 +
14868 +       if (!rt_mutex_has_waiters(lock)) {
14869 +               lock->owner = NULL;
14870 +               return false; /* done */
14871 +       }
14872 +
14873 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
14874 +       return true; /* deboost and wakeups */
14875 +}
14876 +
14877 +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
14878 +{
14879 +       WAKE_Q(wake_q);
14880 +       WAKE_Q(wake_sleeper_q);
14881 +       bool deboost;
14882 +
14883 +       raw_spin_lock_irq(&lock->wait_lock);
14884 +       deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
14885 +       raw_spin_unlock_irq(&lock->wait_lock);
14886 +
14887 +       if (deboost) {
14888 +               wake_up_q(&wake_q);
14889 +               wake_up_q_sleeper(&wake_sleeper_q);
14890 +               rt_mutex_adjust_prio(current);
14891         }
14892 -       return rt_mutex_slowunlock(lock, wqh);
14893  }
14894  
14895  /**
14896 @@ -1607,13 +2213,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
14897  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
14898  {
14899         lock->owner = NULL;
14900 -       raw_spin_lock_init(&lock->wait_lock);
14901         lock->waiters = RB_ROOT;
14902         lock->waiters_leftmost = NULL;
14903  
14904         debug_rt_mutex_init(lock, name);
14905  }
14906 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
14907 +EXPORT_SYMBOL(__rt_mutex_init);
14908  
14909  /**
14910   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
14911 @@ -1628,10 +2233,9 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
14912  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
14913                                 struct task_struct *proxy_owner)
14914  {
14915 -       __rt_mutex_init(lock, NULL);
14916 +       rt_mutex_init(lock);
14917         debug_rt_mutex_proxy_lock(lock, proxy_owner);
14918         rt_mutex_set_owner(lock, proxy_owner);
14919 -       rt_mutex_deadlock_account_lock(lock, proxy_owner);
14920  }
14921  
14922  /**
14923 @@ -1647,7 +2251,66 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
14924  {
14925         debug_rt_mutex_proxy_unlock(lock);
14926         rt_mutex_set_owner(lock, NULL);
14927 -       rt_mutex_deadlock_account_unlock(proxy_owner);
14928 +}
14929 +
14930 +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14931 +                             struct rt_mutex_waiter *waiter,
14932 +                             struct task_struct *task)
14933 +{
14934 +       int ret;
14935 +
14936 +       if (try_to_take_rt_mutex(lock, task, NULL))
14937 +               return 1;
14938 +
14939 +#ifdef CONFIG_PREEMPT_RT_FULL
14940 +       /*
14941 +        * In PREEMPT_RT there's an added race.
14942 +        * If the task that we are about to requeue times out,
14943 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
14944 +        * to skip this task. But right after the task sets
14945 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
14946 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
14947 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
14948 +        * lock that it blocks on. We *must not* place this task
14949 +        * on this proxy lock in that case.
14950 +        *
14951 +        * To prevent this race, we first take the task's pi_lock
14952 +        * and check if it has updated its pi_blocked_on. If it has,
14953 +        * we assume that it woke up and we return -EAGAIN.
14954 +        * Otherwise, we set the task's pi_blocked_on to
14955 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
14956 +        * it will know that we are in the process of requeuing it.
14957 +        */
14958 +       raw_spin_lock(&task->pi_lock);
14959 +       if (task->pi_blocked_on) {
14960 +               raw_spin_unlock(&task->pi_lock);
14961 +               raw_spin_unlock_irq(&lock->wait_lock);
14962 +               return -EAGAIN;
14963 +       }
14964 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
14965 +       raw_spin_unlock(&task->pi_lock);
14966 +#endif
14967 +
14968 +       /* We enforce deadlock detection for futexes */
14969 +       ret = task_blocks_on_rt_mutex(lock, waiter, task,
14970 +                                     RT_MUTEX_FULL_CHAINWALK);
14971 +
14972 +       if (ret && !rt_mutex_owner(lock)) {
14973 +               /*
14974 +                * Reset the return value. We might have
14975 +                * returned with -EDEADLK and the owner
14976 +                * released the lock while we were walking the
14977 +                * pi chain.  Let the waiter sort it out.
14978 +                */
14979 +               ret = 0;
14980 +       }
14981 +
14982 +       if (ret && rt_mutex_has_waiters(lock))
14983 +               remove_waiter(lock, waiter);
14984 +
14985 +       debug_rt_mutex_print_deadlock(waiter);
14986 +
14987 +       return ret;
14988  }
14989  
14990  /**
14991 @@ -1670,33 +2333,9 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14992         int ret;
14993  
14994         raw_spin_lock_irq(&lock->wait_lock);
14995 -
14996 -       if (try_to_take_rt_mutex(lock, task, NULL)) {
14997 -               raw_spin_unlock_irq(&lock->wait_lock);
14998 -               return 1;
14999 -       }
15000 -
15001 -       /* We enforce deadlock detection for futexes */
15002 -       ret = task_blocks_on_rt_mutex(lock, waiter, task,
15003 -                                     RT_MUTEX_FULL_CHAINWALK);
15004 -
15005 -       if (ret && !rt_mutex_owner(lock)) {
15006 -               /*
15007 -                * Reset the return value. We might have
15008 -                * returned with -EDEADLK and the owner
15009 -                * released the lock while we were walking the
15010 -                * pi chain.  Let the waiter sort it out.
15011 -                */
15012 -               ret = 0;
15013 -       }
15014 -
15015 -       if (unlikely(ret))
15016 -               remove_waiter(lock, waiter);
15017 -
15018 +       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
15019         raw_spin_unlock_irq(&lock->wait_lock);
15020  
15021 -       debug_rt_mutex_print_deadlock(waiter);
15022 -
15023         return ret;
15024  }
15025  
15026 @@ -1721,21 +2360,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
15027  }
15028  
15029  /**
15030 - * rt_mutex_finish_proxy_lock() - Complete lock acquisition
15031 + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
15032   * @lock:              the rt_mutex we were woken on
15033   * @to:                        the timeout, null if none. hrtimer should already have
15034   *                     been started.
15035   * @waiter:            the pre-initialized rt_mutex_waiter
15036   *
15037 - * Complete the lock acquisition started our behalf by another thread.
15038 + * Wait for the lock acquisition started on our behalf by
15039 + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
15040 + * rt_mutex_cleanup_proxy_lock().
15041   *
15042   * Returns:
15043   *  0 - success
15044   * <0 - error, one of -EINTR, -ETIMEDOUT
15045   *
15046 - * Special API call for PI-futex requeue support
15047 + * Special API call for PI-futex support
15048   */
15049 -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15050 +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
15051                                struct hrtimer_sleeper *to,
15052                                struct rt_mutex_waiter *waiter)
15053  {
15054 @@ -1746,10 +2387,47 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15055         set_current_state(TASK_INTERRUPTIBLE);
15056  
15057         /* sleep on the mutex */
15058 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
15059 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
15060  
15061 -       if (unlikely(ret))
15062 +       raw_spin_unlock_irq(&lock->wait_lock);
15063 +
15064 +       return ret;
15065 +}
15066 +
15067 +/**
15068 + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
15069 + * @lock:              the rt_mutex we were woken on
15070 + * @waiter:            the pre-initialized rt_mutex_waiter
15071 + *
15072 + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
15073 + *
15074 + * Unless we acquired the lock, we're still enqueued on the wait-list and can
15075 + * in fact still be granted ownership until we're removed. Therefore we can
15076 + * find we are in fact the owner and must disregard the
15077 + * rt_mutex_wait_proxy_lock() failure.
15078 + *
15079 + * Returns:
15080 + *  true  - did the cleanup, we are done.
15081 + *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
15082 + *          the caller should disregard its return value.
15083 + *
15084 + * Special API call for PI-futex support
15085 + */
15086 +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
15087 +                                struct rt_mutex_waiter *waiter)
15088 +{
15089 +       bool cleanup = false;
15090 +
15091 +       raw_spin_lock_irq(&lock->wait_lock);
15092 +       /*
15093 +        * Unless we're the owner, we're still enqueued on the wait_list.
15094 +        * So check if we became owner, if not, take us off the wait_list.
15095 +        */
15096 +       if (rt_mutex_owner(lock) != current) {
15097                 remove_waiter(lock, waiter);
15098 +               fixup_rt_mutex_waiters(lock);
15099 +               cleanup = true;
15100 +       }
15101  
15102         /*
15103          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
15104 @@ -1759,5 +2437,91 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15105  
15106         raw_spin_unlock_irq(&lock->wait_lock);
15107  
15108 +       return cleanup;
15109 +}
15110 +
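A condensed, hypothetical sketch of the proxy-lock lifecycle built only from the helpers in this patch. futex.c is the real and only caller; proxy_lock_example() and its arguments are invented, and the two halves actually run in different tasks and contexts.

static int proxy_lock_example(struct rt_mutex *lock, struct task_struct *task,
			      struct hrtimer_sleeper *to)
{
	struct rt_mutex_waiter waiter;
	int ret;

	rt_mutex_init_waiter(&waiter, false);

	/* Requeue side: enqueue @task as a waiter on @lock. */
	ret = rt_mutex_start_proxy_lock(lock, &waiter, task);
	if (ret)
		return ret;	/* 1: @task already owns the lock, <0: error */

	/* Later, in @task's context: wait for the hand-over. */
	ret = rt_mutex_wait_proxy_lock(lock, to, &waiter);
	if (ret && !rt_mutex_cleanup_proxy_lock(lock, &waiter))
		ret = 0;	/* raced: we own the lock after all */

	return ret;
}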
15111 +static inline int
15112 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
15113 +{
15114 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
15115 +       unsigned tmp;
15116 +
15117 +       if (ctx->deadlock_inject_countdown-- == 0) {
15118 +               tmp = ctx->deadlock_inject_interval;
15119 +               if (tmp > UINT_MAX/4)
15120 +                       tmp = UINT_MAX;
15121 +               else
15122 +                       tmp = tmp*2 + tmp + tmp/2;
15123 +
15124 +               ctx->deadlock_inject_interval = tmp;
15125 +               ctx->deadlock_inject_countdown = tmp;
15126 +               ctx->contending_lock = lock;
15127 +
15128 +               ww_mutex_unlock(lock);
15129 +
15130 +               return -EDEADLK;
15131 +       }
15132 +#endif
15133 +
15134 +       return 0;
15135 +}
15136 +
15137 +#ifdef CONFIG_PREEMPT_RT_FULL
15138 +int __sched
15139 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15140 +{
15141 +       int ret;
15142 +
15143 +       might_sleep();
15144 +
15145 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15146 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
15147 +       if (ret)
15148 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15149 +       else if (!ret && ww_ctx->acquired > 1)
15150 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15151 +
15152         return ret;
15153  }
15154 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
15155 +
15156 +int __sched
15157 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15158 +{
15159 +       int ret;
15160 +
15161 +       might_sleep();
15162 +
15163 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15164 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
15165 +       if (ret)
15166 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15167 +       else if (!ret && ww_ctx->acquired > 1)
15168 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15169 +
15170 +       return ret;
15171 +}
15172 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
15173 +
15174 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
15175 +{
15176 +       int nest = !!lock->ctx;
15177 +
15178 +       /*
15179 +        * The unlocking fastpath is the 0->1 transition from 'locked'
15180 +        * into 'unlocked' state:
15181 +        */
15182 +       if (nest) {
15183 +#ifdef CONFIG_DEBUG_MUTEXES
15184 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
15185 +#endif
15186 +               if (lock->ctx->acquired > 0)
15187 +                       lock->ctx->acquired--;
15188 +               lock->ctx = NULL;
15189 +       }
15190 +
15191 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
15192 +       rt_mutex_unlock(&lock->base.lock);
15193 +}
15194 +EXPORT_SYMBOL(ww_mutex_unlock);
15195 +#endif
15196 diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
15197 index c4060584c407..6607802efa8b 100644
15198 --- a/kernel/locking/rtmutex.h
15199 +++ b/kernel/locking/rtmutex.h
15200 @@ -11,8 +11,6 @@
15201   */
15202  
15203  #define rt_mutex_deadlock_check(l)                     (0)
15204 -#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
15205 -#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
15206  #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
15207  #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
15208  #define debug_rt_mutex_lock(l)                         do { } while (0)
15209 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
15210 index e317e1cbb3eb..819826407462 100644
15211 --- a/kernel/locking/rtmutex_common.h
15212 +++ b/kernel/locking/rtmutex_common.h
15213 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
15214         struct rb_node          pi_tree_entry;
15215         struct task_struct      *task;
15216         struct rt_mutex         *lock;
15217 +       bool                    savestate;
15218  #ifdef CONFIG_DEBUG_RT_MUTEXES
15219         unsigned long           ip;
15220         struct pid              *deadlock_task_pid;
15221 @@ -98,22 +99,45 @@ enum rtmutex_chainwalk {
15222  /*
15223   * PI-futex support (proxy locking functions, etc.):
15224   */
15225 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
15226 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
15227 +
15228  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
15229  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15230                                        struct task_struct *proxy_owner);
15231  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
15232                                   struct task_struct *proxy_owner);
15233 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
15234 +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15235 +                                    struct rt_mutex_waiter *waiter,
15236 +                                    struct task_struct *task);
15237  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15238                                      struct rt_mutex_waiter *waiter,
15239                                      struct task_struct *task);
15240 -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15241 -                                     struct hrtimer_sleeper *to,
15242 -                                     struct rt_mutex_waiter *waiter);
15243 -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
15244 -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
15245 -                                 struct wake_q_head *wqh);
15246 +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
15247 +                              struct hrtimer_sleeper *to,
15248 +                              struct rt_mutex_waiter *waiter);
15249 +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
15250 +                                struct rt_mutex_waiter *waiter);
15251 +
15252 +extern int rt_mutex_futex_trylock(struct rt_mutex *l);
15253 +
15254 +extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
15255 +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
15256 +                                struct wake_q_head *wqh,
15257 +                                struct wake_q_head *wq_sleeper);
15258 +
15259  extern void rt_mutex_adjust_prio(struct task_struct *task);
15260  
15261 +/* RW semaphore special interface */
15262 +struct ww_acquire_ctx;
15263 +
15264 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
15265 +                                    struct hrtimer_sleeper *timeout,
15266 +                                    enum rtmutex_chainwalk chwalk,
15267 +                                    struct ww_acquire_ctx *ww_ctx,
15268 +                                    struct rt_mutex_waiter *waiter);
15269 +
15270  #ifdef CONFIG_DEBUG_RT_MUTEXES
15271  # include "rtmutex-debug.h"
15272  #else
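The split of the proxy-lock interface above (a __rt_mutex_start_proxy_lock() step plus separate wait and cleanup helpers) lets a caller such as the futex code start, wait for, and tear down a proxy acquisition in distinct phases. The sketch below only illustrates the call order implied by these declarations; the wrapper name and locking details are assumptions, not part of the patch:

/* Illustrative only: phased proxy acquisition as suggested by the API above. */
static int example_proxy_acquire(struct rt_mutex *lock,
                                 struct rt_mutex_waiter *waiter,
                                 struct task_struct *task,
                                 struct hrtimer_sleeper *timeout)
{
        int ret;

        rt_mutex_init_waiter(waiter, false);

        raw_spin_lock_irq(&lock->wait_lock);
        ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
        raw_spin_unlock_irq(&lock->wait_lock);
        if (ret)
                return ret;

        ret = rt_mutex_wait_proxy_lock(lock, timeout, waiter);
        if (ret && !rt_mutex_cleanup_proxy_lock(lock, waiter))
                ret = 0;        /* the lock was acquired after all */

        return ret;
}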
15273 diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
15274 new file mode 100644
15275 index 000000000000..4a708ffcded6
15276 --- /dev/null
15277 +++ b/kernel/locking/rwsem-rt.c
15278 @@ -0,0 +1,268 @@
15279 +/*
15280 + */
15281 +#include <linux/rwsem.h>
15282 +#include <linux/sched.h>
15283 +#include <linux/export.h>
15284 +
15285 +#include "rtmutex_common.h"
15286 +
15287 +/*
15288 + * RT-specific reader/writer semaphores
15289 + *
15290 + * down_write()
15291 + *  1) Lock sem->rtmutex
15292 + *  2) Remove the reader BIAS to force readers into the slow path
15293 + *  3) Wait until all readers have left the critical region
15294 + *  4) Mark it write locked
15295 + *
15296 + * up_write()
15297 + *  1) Remove the write locked marker
15298 + *  2) Set the reader BIAS so readers can use the fast path again
15299 + *  3) Unlock sem->rtmutex to release blocked readers
15300 + *
15301 + * down_read()
15302 + *  1) Try fast path acquisition (reader BIAS is set)
15303 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
15304 + *  3) If !writelocked, acquire it for read
15305 + *  4) If writelocked, block on sem->rtmutex
15306 + *  5) unlock sem->rtmutex, goto 1)
15307 + *
15308 + * up_read()
15309 + *  1) Try fast path release (reader count != 1)
15310 + *  2) Wake the writer waiting in down_write()#3
15311 + *
15312 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
15313 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
15314 + * are subject to the rtmutex priority/DL inheritance mechanism.
15315 + *
15316 + * It's possible to make the rw semaphores writer fair by keeping a list of
15317 + * active readers. A blocked writer would force all newly incoming readers to
15318 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
15319 + * reader after the other. We can't use multi-reader inheritance because there
15320 + * is no way to support that with SCHED_DEADLINE. Implementing the one-by-one
15321 + * reader boosting/handover mechanism is major surgery for very dubious
15322 + * value.
15323 + *
15324 + * The risk of writer starvation is there, but the pathological use cases
15325 + * which trigger it are not necessarily the typical RT workloads.
15326 + */
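From a user's point of view the semantics sketched above are reached through the unchanged rwsem API. A minimal, hedged usage sketch (standard kernel interface, illustrative names) tying the steps in the comment to the caller side:

static DECLARE_RWSEM(example_sem);

static void example_reader(void)
{
        down_read(&example_sem);        /* fast path while READER_BIAS is set */
        /* ... read-side critical section ... */
        up_read(&example_sem);          /* last reader wakes a waiting writer */
}

static void example_writer(void)
{
        down_write(&example_sem);       /* lock sem->rtmutex, drain readers */
        /* ... write-side critical section ... */
        up_write(&example_sem);         /* restore READER_BIAS, unlock rtmutex */
}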
15327 +
15328 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
15329 +                 struct lock_class_key *key)
15330 +{
15331 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15332 +       /*
15333 +        * Make sure we are not reinitializing a held semaphore:
15334 +        */
15335 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
15336 +       lockdep_init_map(&sem->dep_map, name, key, 0);
15337 +#endif
15338 +       atomic_set(&sem->readers, READER_BIAS);
15339 +}
15340 +EXPORT_SYMBOL(__rwsem_init);
15341 +
15342 +int __down_read_trylock(struct rw_semaphore *sem)
15343 +{
15344 +       int r, old;
15345 +
15346 +       /*
15347 +        * Increment reader count if sem->readers < 0, i.e. READER_BIAS is
15348 +        * set.
15349 +        */
15350 +       for (r = atomic_read(&sem->readers); r < 0;) {
15351 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
15352 +               if (likely(old == r))
15353 +                       return 1;
15354 +               r = old;
15355 +       }
15356 +       return 0;
15357 +}
15358 +
15359 +void __sched __down_read(struct rw_semaphore *sem)
15360 +{
15361 +       struct rt_mutex *m = &sem->rtmutex;
15362 +       struct rt_mutex_waiter waiter;
15363 +
15364 +       if (__down_read_trylock(sem))
15365 +               return;
15366 +
15367 +       might_sleep();
15368 +       raw_spin_lock_irq(&m->wait_lock);
15369 +       /*
15370 +        * Allow readers as long as the writer has not completely
15371 +        * acquired the semaphore for write.
15372 +        */
15373 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
15374 +               atomic_inc(&sem->readers);
15375 +               raw_spin_unlock_irq(&m->wait_lock);
15376 +               return;
15377 +       }
15378 +
15379 +       /*
15380 +        * Call into the slow lock path with the rtmutex->wait_lock
15381 +        * held, so this can't result in the following race:
15382 +        *
15383 +        * Reader1              Reader2         Writer
15384 +        *                      down_read()
15385 +        *                                      down_write()
15386 +        *                                      rtmutex_lock(m)
15387 +        *                                      swait()
15388 +        * down_read()
15389 +        * unlock(m->wait_lock)
15390 +        *                      up_read()
15391 +        *                      swake()
15392 +        *                                      lock(m->wait_lock)
15393 +        *                                      sem->writelocked=true
15394 +        *                                      unlock(m->wait_lock)
15395 +        *
15396 +        *                                      up_write()
15397 +        *                                      sem->writelocked=false
15398 +        *                                      rtmutex_unlock(m)
15399 +        *                      down_read()
15400 +        *                                      down_write()
15401 +        *                                      rtmutex_lock(m)
15402 +        *                                      swait()
15403 +        * rtmutex_lock(m)
15404 +        *
15405 +        * That would put Reader1 behind the writer waiting on
15406 +        * Reader2 to call up_read() which might be unbound.
15407 +        */
15408 +       rt_mutex_init_waiter(&waiter, false);
15409 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
15410 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
15411 +                                &waiter);
15412 +       /*
15413 +        * The slowlock() above is guaranteed to return with the rtmutex
15414 +        * now held, so there can't be a writer active. Increment the reader
15415 +        * count and immediately drop the rtmutex again.
15416 +        */
15417 +       atomic_inc(&sem->readers);
15418 +       raw_spin_unlock_irq(&m->wait_lock);
15419 +       rt_mutex_unlock(m);
15420 +
15421 +       debug_rt_mutex_free_waiter(&waiter);
15422 +}
15423 +
15424 +void __up_read(struct rw_semaphore *sem)
15425 +{
15426 +       struct rt_mutex *m = &sem->rtmutex;
15427 +       struct task_struct *tsk;
15428 +
15429 +       /*
15430 +        * sem->readers can only hit 0 when a writer is waiting for the
15431 +        * active readers to leave the critical region.
15432 +        */
15433 +       if (!atomic_dec_and_test(&sem->readers))
15434 +               return;
15435 +
15436 +       might_sleep();
15437 +       raw_spin_lock_irq(&m->wait_lock);
15438 +       /*
15439 +        * Wake the writer, i.e. the rtmutex owner. It might release the
15440 +        * rtmutex concurrently in the fast path (due to a signal), but to
15441 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
15442 +        * case which can happen is a spurious wakeup.
15443 +        */
15444 +       tsk = rt_mutex_owner(m);
15445 +       if (tsk)
15446 +               wake_up_process(tsk);
15447 +
15448 +       raw_spin_unlock_irq(&m->wait_lock);
15449 +}
15450 +
15451 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
15452 +                             unsigned long flags)
15453 +{
15454 +       struct rt_mutex *m = &sem->rtmutex;
15455 +
15456 +       atomic_add(READER_BIAS - bias, &sem->readers);
15457 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15458 +       rt_mutex_unlock(m);
15459 +}
15460 +
15461 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
15462 +{
15463 +       struct rt_mutex *m = &sem->rtmutex;
15464 +       unsigned long flags;
15465 +
15466 +       /* Take the rtmutex as a first step */
15467 +       if (rt_mutex_lock_state(m, state))
15468 +               return -EINTR;
15469 +
15470 +       /* Force readers into slow path */
15471 +       atomic_sub(READER_BIAS, &sem->readers);
15472 +       might_sleep();
15473 +
15474 +       set_current_state(state);
15475 +       for (;;) {
15476 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
15477 +               /* Have all readers left the critical region? */
15478 +               if (!atomic_read(&sem->readers)) {
15479 +                       atomic_set(&sem->readers, WRITER_BIAS);
15480 +                       __set_current_state(TASK_RUNNING);
15481 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15482 +                       return 0;
15483 +               }
15484 +
15485 +               if (signal_pending_state(state, current)) {
15486 +                       __set_current_state(TASK_RUNNING);
15487 +                       __up_write_unlock(sem, 0, flags);
15488 +                       return -EINTR;
15489 +               }
15490 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15491 +
15492 +               if (atomic_read(&sem->readers) != 0) {
15493 +                       schedule();
15494 +                       set_current_state(state);
15495 +               }
15496 +       }
15497 +}
15498 +
15499 +void __sched __down_write(struct rw_semaphore *sem)
15500 +{
15501 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
15502 +}
15503 +
15504 +int __sched __down_write_killable(struct rw_semaphore *sem)
15505 +{
15506 +       return __down_write_common(sem, TASK_KILLABLE);
15507 +}
15508 +
15509 +int __down_write_trylock(struct rw_semaphore *sem)
15510 +{
15511 +       struct rt_mutex *m = &sem->rtmutex;
15512 +       unsigned long flags;
15513 +
15514 +       if (!rt_mutex_trylock(m))
15515 +               return 0;
15516 +
15517 +       atomic_sub(READER_BIAS, &sem->readers);
15518 +
15519 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15520 +       if (!atomic_read(&sem->readers)) {
15521 +               atomic_set(&sem->readers, WRITER_BIAS);
15522 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15523 +               return 1;
15524 +       }
15525 +       __up_write_unlock(sem, 0, flags);
15526 +       return 0;
15527 +}
15528 +
15529 +void __up_write(struct rw_semaphore *sem)
15530 +{
15531 +       struct rt_mutex *m = &sem->rtmutex;
15532 +       unsigned long flags;
15533 +
15534 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15535 +       __up_write_unlock(sem, WRITER_BIAS, flags);
15536 +}
15537 +
15538 +void __downgrade_write(struct rw_semaphore *sem)
15539 +{
15540 +       struct rt_mutex *m = &sem->rtmutex;
15541 +       unsigned long flags;
15542 +
15543 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15544 +       /* Release it and account current as reader */
15545 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
15546 +}
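Note that __downgrade_write() above releases the writer while accounting the caller as one remaining reader (WRITER_BIAS - 1), so the usual publish-then-keep-reading pattern keeps working. A hedged sketch of that caller-side pattern, using the standard API with an illustrative name:

static void example_publish(struct rw_semaphore *sem)
{
        down_write(sem);
        /* ... install the update exclusively ... */
        downgrade_write(sem);   /* stays accounted as a reader, see above */
        /* ... keep reading alongside other readers ... */
        up_read(sem);
}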
15547 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
15548 index db3ccb1dd614..909779647bd1 100644
15549 --- a/kernel/locking/spinlock.c
15550 +++ b/kernel/locking/spinlock.c
15551 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
15552   *         __[spin|read|write]_lock_bh()
15553   */
15554  BUILD_LOCK_OPS(spin, raw_spinlock);
15555 +
15556 +#ifndef CONFIG_PREEMPT_RT_FULL
15557  BUILD_LOCK_OPS(read, rwlock);
15558  BUILD_LOCK_OPS(write, rwlock);
15559 +#endif
15560  
15561  #endif
15562  
15563 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
15564  EXPORT_SYMBOL(_raw_spin_unlock_bh);
15565  #endif
15566  
15567 +#ifndef CONFIG_PREEMPT_RT_FULL
15568 +
15569  #ifndef CONFIG_INLINE_READ_TRYLOCK
15570  int __lockfunc _raw_read_trylock(rwlock_t *lock)
15571  {
15572 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
15573  EXPORT_SYMBOL(_raw_write_unlock_bh);
15574  #endif
15575  
15576 +#endif /* !PREEMPT_RT_FULL */
15577 +
15578  #ifdef CONFIG_DEBUG_LOCK_ALLOC
15579  
15580  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
15581 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
15582 index 0374a596cffa..94970338d518 100644
15583 --- a/kernel/locking/spinlock_debug.c
15584 +++ b/kernel/locking/spinlock_debug.c
15585 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
15586  
15587  EXPORT_SYMBOL(__raw_spin_lock_init);
15588  
15589 +#ifndef CONFIG_PREEMPT_RT_FULL
15590  void __rwlock_init(rwlock_t *lock, const char *name,
15591                    struct lock_class_key *key)
15592  {
15593 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
15594  }
15595  
15596  EXPORT_SYMBOL(__rwlock_init);
15597 +#endif
15598  
15599  static void spin_dump(raw_spinlock_t *lock, const char *msg)
15600  {
15601 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
15602         arch_spin_unlock(&lock->raw_lock);
15603  }
15604  
15605 +#ifndef CONFIG_PREEMPT_RT_FULL
15606  static void rwlock_bug(rwlock_t *lock, const char *msg)
15607  {
15608         if (!debug_locks_off())
15609 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
15610         debug_write_unlock(lock);
15611         arch_write_unlock(&lock->raw_lock);
15612  }
15613 +
15614 +#endif
15615 diff --git a/kernel/module.c b/kernel/module.c
15616 index 0e54d5bf0097..f27764fbfa24 100644
15617 --- a/kernel/module.c
15618 +++ b/kernel/module.c
15619 @@ -660,16 +660,7 @@ static void percpu_modcopy(struct module *mod,
15620                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
15621  }
15622  
15623 -/**
15624 - * is_module_percpu_address - test whether address is from module static percpu
15625 - * @addr: address to test
15626 - *
15627 - * Test whether @addr belongs to module static percpu area.
15628 - *
15629 - * RETURNS:
15630 - * %true if @addr is from module static percpu area
15631 - */
15632 -bool is_module_percpu_address(unsigned long addr)
15633 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
15634  {
15635         struct module *mod;
15636         unsigned int cpu;
15637 @@ -683,9 +674,15 @@ bool is_module_percpu_address(unsigned long addr)
15638                         continue;
15639                 for_each_possible_cpu(cpu) {
15640                         void *start = per_cpu_ptr(mod->percpu, cpu);
15641 +                       void *va = (void *)addr;
15642  
15643 -                       if ((void *)addr >= start &&
15644 -                           (void *)addr < start + mod->percpu_size) {
15645 +                       if (va >= start && va < start + mod->percpu_size) {
15646 +                               if (can_addr) {
15647 +                                       *can_addr = (unsigned long) (va - start);
15648 +                                       *can_addr += (unsigned long)
15649 +                                               per_cpu_ptr(mod->percpu,
15650 +                                                           get_boot_cpu_id());
15651 +                               }
15652                                 preempt_enable();
15653                                 return true;
15654                         }
15655 @@ -696,6 +693,20 @@ bool is_module_percpu_address(unsigned long addr)
15656         return false;
15657  }
15658  
15659 +/**
15660 + * is_module_percpu_address - test whether address is from module static percpu
15661 + * @addr: address to test
15662 + *
15663 + * Test whether @addr belongs to module static percpu area.
15664 + *
15665 + * RETURNS:
15666 + * %true if @addr is from module static percpu area
15667 + */
15668 +bool is_module_percpu_address(unsigned long addr)
15669 +{
15670 +       return __is_module_percpu_address(addr, NULL);
15671 +}
15672 +
15673  #else /* ... !CONFIG_SMP */
15674  
15675  static inline void __percpu *mod_percpu(struct module *mod)
15676 @@ -727,6 +738,11 @@ bool is_module_percpu_address(unsigned long addr)
15677         return false;
15678  }
15679  
15680 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
15681 +{
15682 +       return false;
15683 +}
15684 +
15685  #endif /* CONFIG_SMP */
15686  
15687  #define MODINFO_ATTR(field)    \
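The new can_addr out-parameter of __is_module_percpu_address() reports the address translated to the boot CPU's copy of the module's per-CPU area, giving callers one canonical address to compare against. A hedged sketch of a possible caller (the helper name is illustrative):

/* Illustrative: map a possibly per-CPU address to a canonical one. */
static unsigned long example_canonical_addr(unsigned long addr)
{
        unsigned long can_addr;

        if (__is_module_percpu_address(addr, &can_addr))
                return can_addr;        /* boot-CPU copy of the percpu variable */
        return addr;                    /* not module percpu, use as-is */
}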
15688 diff --git a/kernel/panic.c b/kernel/panic.c
15689 index e6480e20379e..7e9c1918a94e 100644
15690 --- a/kernel/panic.c
15691 +++ b/kernel/panic.c
15692 @@ -482,9 +482,11 @@ static u64 oops_id;
15693  
15694  static int init_oops_id(void)
15695  {
15696 +#ifndef CONFIG_PREEMPT_RT_FULL
15697         if (!oops_id)
15698                 get_random_bytes(&oops_id, sizeof(oops_id));
15699         else
15700 +#endif
15701                 oops_id++;
15702  
15703         return 0;
15704 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
15705 index b26dbc48c75b..968255f27a33 100644
15706 --- a/kernel/power/hibernate.c
15707 +++ b/kernel/power/hibernate.c
15708 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
15709  
15710         local_irq_disable();
15711  
15712 +       system_state = SYSTEM_SUSPEND;
15713 +
15714         error = syscore_suspend();
15715         if (error) {
15716                 printk(KERN_ERR "PM: Some system devices failed to power down, "
15717 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
15718         syscore_resume();
15719  
15720   Enable_irqs:
15721 +       system_state = SYSTEM_RUNNING;
15722         local_irq_enable();
15723  
15724   Enable_cpus:
15725 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
15726                 goto Enable_cpus;
15727  
15728         local_irq_disable();
15729 +       system_state = SYSTEM_SUSPEND;
15730  
15731         error = syscore_suspend();
15732         if (error)
15733 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
15734         syscore_resume();
15735  
15736   Enable_irqs:
15737 +       system_state = SYSTEM_RUNNING;
15738         local_irq_enable();
15739  
15740   Enable_cpus:
15741 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
15742                 goto Enable_cpus;
15743  
15744         local_irq_disable();
15745 +       system_state = SYSTEM_SUSPEND;
15746         syscore_suspend();
15747         if (pm_wakeup_pending()) {
15748                 error = -EAGAIN;
15749 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
15750  
15751   Power_up:
15752         syscore_resume();
15753 +       system_state = SYSTEM_RUNNING;
15754         local_irq_enable();
15755  
15756   Enable_cpus:
15757 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
15758         return error;
15759  }
15760  
15761 +#ifndef CONFIG_SUSPEND
15762 +bool pm_in_action;
15763 +#endif
15764 +
15765  /**
15766   * hibernate - Carry out system hibernation, including saving the image.
15767   */
15768 @@ -689,6 +700,8 @@ int hibernate(void)
15769                 return -EPERM;
15770         }
15771  
15772 +       pm_in_action = true;
15773 +
15774         lock_system_sleep();
15775         /* The snapshot device should not be opened while we're running */
15776         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
15777 @@ -766,6 +779,7 @@ int hibernate(void)
15778         atomic_inc(&snapshot_device_available);
15779   Unlock:
15780         unlock_system_sleep();
15781 +       pm_in_action = false;
15782         return error;
15783  }
15784  
15785 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
15786 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
15787 --- a/kernel/power/suspend.c
15788 +++ b/kernel/power/suspend.c
15789 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
15790         arch_suspend_disable_irqs();
15791         BUG_ON(!irqs_disabled());
15792  
15793 +       system_state = SYSTEM_SUSPEND;
15794 +
15795         error = syscore_suspend();
15796         if (!error) {
15797                 *wakeup = pm_wakeup_pending();
15798 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
15799                 syscore_resume();
15800         }
15801  
15802 +       system_state = SYSTEM_RUNNING;
15803 +
15804         arch_suspend_enable_irqs();
15805         BUG_ON(irqs_disabled());
15806  
15807 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
15808         return error;
15809  }
15810  
15811 +bool pm_in_action;
15812 +
15813  /**
15814   * pm_suspend - Externally visible function for suspending the system.
15815   * @state: System sleep state to enter.
15816 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
15817         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
15818                 return -EINVAL;
15819  
15820 +       pm_in_action = true;
15821 +
15822         error = enter_state(state);
15823         if (error) {
15824                 suspend_stats.fail++;
15825 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
15826         } else {
15827                 suspend_stats.success++;
15828         }
15829 +       pm_in_action = false;
15830         return error;
15831  }
15832  EXPORT_SYMBOL(pm_suspend);
15833 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
15834 index 9c5b231684d0..cf15bdb6855b 100644
15835 --- a/kernel/printk/printk.c
15836 +++ b/kernel/printk/printk.c
15837 @@ -351,6 +351,65 @@ __packed __aligned(4)
15838   */
15839  DEFINE_RAW_SPINLOCK(logbuf_lock);
15840  
15841 +#ifdef CONFIG_EARLY_PRINTK
15842 +struct console *early_console;
15843 +
15844 +static void early_vprintk(const char *fmt, va_list ap)
15845 +{
15846 +       if (early_console) {
15847 +               char buf[512];
15848 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
15849 +
15850 +               early_console->write(early_console, buf, n);
15851 +       }
15852 +}
15853 +
15854 +asmlinkage void early_printk(const char *fmt, ...)
15855 +{
15856 +       va_list ap;
15857 +
15858 +       va_start(ap, fmt);
15859 +       early_vprintk(fmt, ap);
15860 +       va_end(ap);
15861 +}
15862 +
15863 +/*
15864 + * This is independent of any log levels - a global
15865 + * kill switch that turns off all of printk.
15866 + *
15867 + * Used by the NMI watchdog if early-printk is enabled.
15868 + */
15869 +static bool __read_mostly printk_killswitch;
15870 +
15871 +static int __init force_early_printk_setup(char *str)
15872 +{
15873 +       printk_killswitch = true;
15874 +       return 0;
15875 +}
15876 +early_param("force_early_printk", force_early_printk_setup);
15877 +
15878 +void printk_kill(void)
15879 +{
15880 +       printk_killswitch = true;
15881 +}
15882 +
15883 +#ifdef CONFIG_PRINTK
15884 +static int forced_early_printk(const char *fmt, va_list ap)
15885 +{
15886 +       if (!printk_killswitch)
15887 +               return 0;
15888 +       early_vprintk(fmt, ap);
15889 +       return 1;
15890 +}
15891 +#endif
15892 +
15893 +#else
15894 +static inline int forced_early_printk(const char *fmt, va_list ap)
15895 +{
15896 +       return 0;
15897 +}
15898 +#endif
15899 +
15900  #ifdef CONFIG_PRINTK
15901  DECLARE_WAIT_QUEUE_HEAD(log_wait);
15902  /* the next printk record to read by syslog(READ) or /proc/kmsg */
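The killswitch added above is meant for debugging facilities that cannot risk the regular printk path (for example an NMI watchdog): they call printk_kill() once and rely on the early console fallback wired into vprintk_emit() further down. A hedged sketch of such a caller; the function name is illustrative:

/* Illustrative: report a lockup without touching the normal printk path. */
static void example_watchdog_report(int cpu)
{
        printk_kill();          /* all further printk() goes via early_printk */
        early_printk("watchdog: hard lockup detected on CPU %d\n", cpu);
}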
15903 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15904  {
15905         char *text;
15906         int len = 0;
15907 +       int attempts = 0;
15908  
15909         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
15910         if (!text)
15911 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15912                 u64 seq;
15913                 u32 idx;
15914                 enum log_flags prev;
15915 +               int num_msg;
15916 +try_again:
15917 +               attempts++;
15918 +               if (attempts > 10) {
15919 +                       len = -EBUSY;
15920 +                       goto out;
15921 +               }
15922 +               num_msg = 0;
15923  
15924                 /*
15925                  * Find first record that fits, including all following records,
15926 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15927                         prev = msg->flags;
15928                         idx = log_next(idx);
15929                         seq++;
15930 +                       num_msg++;
15931 +                       if (num_msg > 5) {
15932 +                               num_msg = 0;
15933 +                               raw_spin_unlock_irq(&logbuf_lock);
15934 +                               raw_spin_lock_irq(&logbuf_lock);
15935 +                               if (clear_seq < log_first_seq)
15936 +                                       goto try_again;
15937 +                       }
15938                 }
15939  
15940                 /* move first record forward until length fits into the buffer */
15941 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15942                         prev = msg->flags;
15943                         idx = log_next(idx);
15944                         seq++;
15945 +                       num_msg++;
15946 +                       if (num_msg > 5) {
15947 +                               num_msg = 0;
15948 +                               raw_spin_unlock_irq(&logbuf_lock);
15949 +                               raw_spin_lock_irq(&logbuf_lock);
15950 +                               if (clear_seq < log_first_seq)
15951 +                                       goto try_again;
15952 +                       }
15953                 }
15954  
15955                 /* last message fitting into this dump */
15956 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15957                 clear_seq = log_next_seq;
15958                 clear_idx = log_next_idx;
15959         }
15960 +out:
15961         raw_spin_unlock_irq(&logbuf_lock);
15962  
15963         kfree(text);
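The retry logic added to syslog_print_all() bounds how long logbuf_lock is held with interrupts off: the lock is dropped every few records and the scan restarts (up to a limit) if the ring buffer moved on in the meantime. A generic, hedged sketch of that pattern; all names are illustrative and the caller is assumed to hold the raw lock on entry and exit:

static int example_scan(raw_spinlock_t *lock, u64 *seq, u64 *first, u64 *next)
{
        int attempts = 0, num;

try_again:
        if (++attempts > 10)
                return -EBUSY;          /* give up rather than livelock */
        num = 0;
        while (*seq < *next) {
                /* ... account or copy one record ... */
                (*seq)++;
                if (++num > 5) {
                        num = 0;
                        raw_spin_unlock_irq(lock);      /* latency window */
                        raw_spin_lock_irq(lock);
                        if (*seq < *first)              /* buffer wrapped past us */
                                goto try_again;
                }
        }
        return 0;
}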
15964 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
15965         if (!console_drivers)
15966                 return;
15967  
15968 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
15969 +               if (in_irq() || in_nmi())
15970 +                       return;
15971 +       }
15972 +
15973 +       migrate_disable();
15974         for_each_console(con) {
15975                 if (exclusive_console && con != exclusive_console)
15976                         continue;
15977 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
15978                 else
15979                         con->write(con, text, len);
15980         }
15981 +       migrate_enable();
15982  }
15983  
15984  /*
15985 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
15986         /* cpu currently holding logbuf_lock in this function */
15987         static unsigned int logbuf_cpu = UINT_MAX;
15988  
15989 +       /*
15990 +        * Fall back to early_printk if a debugging subsystem has
15991 +        * killed printk output
15992 +        */
15993 +       if (unlikely(forced_early_printk(fmt, args)))
15994 +               return 1;
15995 +
15996         if (level == LOGLEVEL_SCHED) {
15997                 level = LOGLEVEL_DEFAULT;
15998                 in_sched = true;
15999 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
16000  
16001         /* If called from the scheduler, we can not call up(). */
16002         if (!in_sched) {
16003 +               int may_trylock = 1;
16004 +
16005                 lockdep_off();
16006 +#ifdef CONFIG_PREEMPT_RT_FULL
16007 +               /*
16008 +                * we can't take a sleeping lock with IRQs or preemption disabled
16009 +                * so we can't print in these contexts
16010 +                */
16011 +               if (!(preempt_count() == 0 && !irqs_disabled()))
16012 +                       may_trylock = 0;
16013 +#endif
16014                 /*
16015                  * Try to acquire and then immediately release the console
16016                  * semaphore.  The release will print out buffers and wake up
16017                  * /dev/kmsg and syslog() users.
16018                  */
16019 -               if (console_trylock())
16020 +               if (may_trylock && console_trylock())
16021                         console_unlock();
16022                 lockdep_on();
16023         }
16024 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
16025  
16026  #endif /* CONFIG_PRINTK */
16027  
16028 -#ifdef CONFIG_EARLY_PRINTK
16029 -struct console *early_console;
16030 -
16031 -asmlinkage __visible void early_printk(const char *fmt, ...)
16032 -{
16033 -       va_list ap;
16034 -       char buf[512];
16035 -       int n;
16036 -
16037 -       if (!early_console)
16038 -               return;
16039 -
16040 -       va_start(ap, fmt);
16041 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
16042 -       va_end(ap);
16043 -
16044 -       early_console->write(early_console, buf, n);
16045 -}
16046 -#endif
16047 -
16048  static int __add_preferred_console(char *name, int idx, char *options,
16049                                    char *brl_options)
16050  {
16051 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
16052                 goto out;
16053  
16054         len = cont_print_text(text, size);
16055 +#ifdef CONFIG_PREEMPT_RT_FULL
16056 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16057 +       call_console_drivers(cont.level, NULL, 0, text, len);
16058 +#else
16059         raw_spin_unlock(&logbuf_lock);
16060         stop_critical_timings();
16061         call_console_drivers(cont.level, NULL, 0, text, len);
16062         start_critical_timings();
16063         local_irq_restore(flags);
16064 +#endif
16065         return;
16066  out:
16067         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16068 @@ -2431,13 +2525,17 @@ void console_unlock(void)
16069                 console_idx = log_next(console_idx);
16070                 console_seq++;
16071                 console_prev = msg->flags;
16072 +#ifdef CONFIG_PREEMPT_RT_FULL
16073 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16074 +               call_console_drivers(level, ext_text, ext_len, text, len);
16075 +#else
16076                 raw_spin_unlock(&logbuf_lock);
16077  
16078                 stop_critical_timings();        /* don't trace print latency */
16079                 call_console_drivers(level, ext_text, ext_len, text, len);
16080                 start_critical_timings();
16081                 local_irq_restore(flags);
16082 -
16083 +#endif
16084                 if (do_cond_resched)
16085                         cond_resched();
16086         }
16087 @@ -2489,6 +2587,11 @@ void console_unblank(void)
16088  {
16089         struct console *c;
16090  
16091 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
16092 +               if (in_irq() || in_nmi())
16093 +                       return;
16094 +       }
16095 +
16096         /*
16097          * console_unblank can no longer be called in interrupt context unless
16098          * oops_in_progress is set to 1..
16099 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
16100 index 49ba7c1ade9d..44f44b47ec07 100644
16101 --- a/kernel/ptrace.c
16102 +++ b/kernel/ptrace.c
16103 @@ -166,7 +166,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
16104  
16105         spin_lock_irq(&task->sighand->siglock);
16106         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
16107 -               task->state = __TASK_TRACED;
16108 +               unsigned long flags;
16109 +
16110 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
16111 +               if (task->state & __TASK_TRACED)
16112 +                       task->state = __TASK_TRACED;
16113 +               else
16114 +                       task->saved_state = __TASK_TRACED;
16115 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
16116                 ret = true;
16117         }
16118         spin_unlock_irq(&task->sighand->siglock);
16119 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
16120 index bf08fee53dc7..eeb8ce4ad7b6 100644
16121 --- a/kernel/rcu/rcutorture.c
16122 +++ b/kernel/rcu/rcutorture.c
16123 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
16124         .name           = "rcu"
16125  };
16126  
16127 +#ifndef CONFIG_PREEMPT_RT_FULL
16128  /*
16129   * Definitions for rcu_bh torture testing.
16130   */
16131 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
16132         .name           = "rcu_bh"
16133  };
16134  
16135 +#else
16136 +static struct rcu_torture_ops rcu_bh_ops = {
16137 +       .ttype          = INVALID_RCU_FLAVOR,
16138 +};
16139 +#endif
16140 +
16141  /*
16142   * Don't even think about trying any of these in real life!!!
16143  * The names include "busted", and they really mean it!
16144 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
16145 index 10f62c6f48e7..dbee19478f09 100644
16146 --- a/kernel/rcu/tree.c
16147 +++ b/kernel/rcu/tree.c
16148 @@ -55,6 +55,11 @@
16149  #include <linux/random.h>
16150  #include <linux/trace_events.h>
16151  #include <linux/suspend.h>
16152 +#include <linux/delay.h>
16153 +#include <linux/gfp.h>
16154 +#include <linux/oom.h>
16155 +#include <linux/smpboot.h>
16156 +#include "../time/tick-internal.h"
16157  
16158  #include "tree.h"
16159  #include "rcu.h"
16160 @@ -260,6 +265,19 @@ void rcu_sched_qs(void)
16161                            this_cpu_ptr(&rcu_sched_data), true);
16162  }
16163  
16164 +#ifdef CONFIG_PREEMPT_RT_FULL
16165 +static void rcu_preempt_qs(void);
16166 +
16167 +void rcu_bh_qs(void)
16168 +{
16169 +       unsigned long flags;
16170 +
16171 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
16172 +       local_irq_save(flags);
16173 +       rcu_preempt_qs();
16174 +       local_irq_restore(flags);
16175 +}
16176 +#else
16177  void rcu_bh_qs(void)
16178  {
16179         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
16180 @@ -269,6 +287,7 @@ void rcu_bh_qs(void)
16181                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
16182         }
16183  }
16184 +#endif
16185  
16186  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
16187  
16188 @@ -449,11 +468,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
16189  /*
16190   * Return the number of RCU BH batches started thus far for debug & stats.
16191   */
16192 +#ifndef CONFIG_PREEMPT_RT_FULL
16193  unsigned long rcu_batches_started_bh(void)
16194  {
16195         return rcu_bh_state.gpnum;
16196  }
16197  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
16198 +#endif
16199  
16200  /*
16201   * Return the number of RCU batches completed thus far for debug & stats.
16202 @@ -473,6 +494,7 @@ unsigned long rcu_batches_completed_sched(void)
16203  }
16204  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
16205  
16206 +#ifndef CONFIG_PREEMPT_RT_FULL
16207  /*
16208   * Return the number of RCU BH batches completed thus far for debug & stats.
16209   */
16210 @@ -481,6 +503,7 @@ unsigned long rcu_batches_completed_bh(void)
16211         return rcu_bh_state.completed;
16212  }
16213  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
16214 +#endif
16215  
16216  /*
16217   * Return the number of RCU expedited batches completed thus far for
16218 @@ -504,6 +527,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
16219  }
16220  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
16221  
16222 +#ifndef CONFIG_PREEMPT_RT_FULL
16223  /*
16224   * Force a quiescent state.
16225   */
16226 @@ -522,6 +546,13 @@ void rcu_bh_force_quiescent_state(void)
16227  }
16228  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
16229  
16230 +#else
16231 +void rcu_force_quiescent_state(void)
16232 +{
16233 +}
16234 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
16235 +#endif
16236 +
16237  /*
16238   * Force a quiescent state for RCU-sched.
16239   */
16240 @@ -572,9 +603,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
16241         case RCU_FLAVOR:
16242                 rsp = rcu_state_p;
16243                 break;
16244 +#ifndef CONFIG_PREEMPT_RT_FULL
16245         case RCU_BH_FLAVOR:
16246                 rsp = &rcu_bh_state;
16247                 break;
16248 +#endif
16249         case RCU_SCHED_FLAVOR:
16250                 rsp = &rcu_sched_state;
16251                 break;
16252 @@ -3016,18 +3049,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
16253  /*
16254   * Do RCU core processing for the current CPU.
16255   */
16256 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
16257 +static __latent_entropy void rcu_process_callbacks(void)
16258  {
16259         struct rcu_state *rsp;
16260  
16261         if (cpu_is_offline(smp_processor_id()))
16262                 return;
16263 -       trace_rcu_utilization(TPS("Start RCU core"));
16264         for_each_rcu_flavor(rsp)
16265                 __rcu_process_callbacks(rsp);
16266 -       trace_rcu_utilization(TPS("End RCU core"));
16267  }
16268  
16269 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
16270  /*
16271   * Schedule RCU callback invocation.  If the specified type of RCU
16272   * does not support RCU priority boosting, just do a direct call,
16273 @@ -3039,19 +3071,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
16274  {
16275         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
16276                 return;
16277 -       if (likely(!rsp->boost)) {
16278 -               rcu_do_batch(rsp, rdp);
16279 -               return;
16280 -       }
16281 -       invoke_rcu_callbacks_kthread();
16282 +       rcu_do_batch(rsp, rdp);
16283  }
16284  
16285 +static void rcu_wake_cond(struct task_struct *t, int status)
16286 +{
16287 +       /*
16288 +        * If the thread is yielding, only wake it when this
16289 +        * is invoked from idle
16290 +        */
16291 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
16292 +               wake_up_process(t);
16293 +}
16294 +
16295 +/*
16296 + * Wake up this CPU's rcuc kthread to do RCU core processing.
16297 + */
16298  static void invoke_rcu_core(void)
16299  {
16300 -       if (cpu_online(smp_processor_id()))
16301 -               raise_softirq(RCU_SOFTIRQ);
16302 +       unsigned long flags;
16303 +       struct task_struct *t;
16304 +
16305 +       if (!cpu_online(smp_processor_id()))
16306 +               return;
16307 +       local_irq_save(flags);
16308 +       __this_cpu_write(rcu_cpu_has_work, 1);
16309 +       t = __this_cpu_read(rcu_cpu_kthread_task);
16310 +       if (t != NULL && current != t)
16311 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
16312 +       local_irq_restore(flags);
16313  }
16314  
16315 +static void rcu_cpu_kthread_park(unsigned int cpu)
16316 +{
16317 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16318 +}
16319 +
16320 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
16321 +{
16322 +       return __this_cpu_read(rcu_cpu_has_work);
16323 +}
16324 +
16325 +/*
16326 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16327 + * RCU softirq used in flavors and configurations of RCU that do not
16328 + * support RCU priority boosting.
16329 + */
16330 +static void rcu_cpu_kthread(unsigned int cpu)
16331 +{
16332 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16333 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16334 +       int spincnt;
16335 +
16336 +       for (spincnt = 0; spincnt < 10; spincnt++) {
16337 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16338 +               local_bh_disable();
16339 +               *statusp = RCU_KTHREAD_RUNNING;
16340 +               this_cpu_inc(rcu_cpu_kthread_loops);
16341 +               local_irq_disable();
16342 +               work = *workp;
16343 +               *workp = 0;
16344 +               local_irq_enable();
16345 +               if (work)
16346 +                       rcu_process_callbacks();
16347 +               local_bh_enable();
16348 +               if (*workp == 0) {
16349 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16350 +                       *statusp = RCU_KTHREAD_WAITING;
16351 +                       return;
16352 +               }
16353 +       }
16354 +       *statusp = RCU_KTHREAD_YIELDING;
16355 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16356 +       schedule_timeout_interruptible(2);
16357 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16358 +       *statusp = RCU_KTHREAD_WAITING;
16359 +}
16360 +
16361 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16362 +       .store                  = &rcu_cpu_kthread_task,
16363 +       .thread_should_run      = rcu_cpu_kthread_should_run,
16364 +       .thread_fn              = rcu_cpu_kthread,
16365 +       .thread_comm            = "rcuc/%u",
16366 +       .setup                  = rcu_cpu_kthread_setup,
16367 +       .park                   = rcu_cpu_kthread_park,
16368 +};
16369 +
16370 +/*
16371 + * Spawn per-CPU RCU core processing kthreads.
16372 + */
16373 +static int __init rcu_spawn_core_kthreads(void)
16374 +{
16375 +       int cpu;
16376 +
16377 +       for_each_possible_cpu(cpu)
16378 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
16379 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16380 +       return 0;
16381 +}
16382 +early_initcall(rcu_spawn_core_kthreads);
16383 +
16384  /*
16385   * Handle any core-RCU processing required by a call_rcu() invocation.
16386   */
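The rcuc/%u threads introduced above are created through the generic smpboot per-CPU kthread facility rather than a softirq. For reference, a minimal, self-contained sketch of that facility for a hypothetical per-CPU worker (all example_* names are illustrative, not part of this patch):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);
static DEFINE_PER_CPU(int, example_has_work);

static int example_should_run(unsigned int cpu)
{
        return __this_cpu_read(example_has_work);
}

static void example_fn(unsigned int cpu)
{
        __this_cpu_write(example_has_work, 0);
        /* ... process this CPU's pending work ... */
}

static struct smp_hotplug_thread example_threads = {
        .store             = &example_task,
        .thread_should_run = example_should_run,
        .thread_fn         = example_fn,
        .thread_comm       = "example/%u",
};

static int __init example_spawn(void)
{
        return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_spawn);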
16387 @@ -3195,6 +3314,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
16388  }
16389  EXPORT_SYMBOL_GPL(call_rcu_sched);
16390  
16391 +#ifndef CONFIG_PREEMPT_RT_FULL
16392  /*
16393   * Queue an RCU callback for invocation after a quicker grace period.
16394   */
16395 @@ -3203,6 +3323,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
16396         __call_rcu(head, func, &rcu_bh_state, -1, 0);
16397  }
16398  EXPORT_SYMBOL_GPL(call_rcu_bh);
16399 +#endif
16400  
16401  /*
16402   * Queue an RCU callback for lazy invocation after a grace period.
16403 @@ -3294,6 +3415,7 @@ void synchronize_sched(void)
16404  }
16405  EXPORT_SYMBOL_GPL(synchronize_sched);
16406  
16407 +#ifndef CONFIG_PREEMPT_RT_FULL
16408  /**
16409   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
16410   *
16411 @@ -3320,6 +3442,7 @@ void synchronize_rcu_bh(void)
16412                 wait_rcu_gp(call_rcu_bh);
16413  }
16414  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
16415 +#endif
16416  
16417  /**
16418   * get_state_synchronize_rcu - Snapshot current RCU state
16419 @@ -3698,6 +3821,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
16420         mutex_unlock(&rsp->barrier_mutex);
16421  }
16422  
16423 +#ifndef CONFIG_PREEMPT_RT_FULL
16424  /**
16425   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
16426   */
16427 @@ -3706,6 +3830,7 @@ void rcu_barrier_bh(void)
16428         _rcu_barrier(&rcu_bh_state);
16429  }
16430  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
16431 +#endif
16432  
16433  /**
16434   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
16435 @@ -4227,12 +4352,13 @@ void __init rcu_init(void)
16436  
16437         rcu_bootup_announce();
16438         rcu_init_geometry();
16439 +#ifndef CONFIG_PREEMPT_RT_FULL
16440         rcu_init_one(&rcu_bh_state);
16441 +#endif
16442         rcu_init_one(&rcu_sched_state);
16443         if (dump_tree)
16444                 rcu_dump_rcu_node_tree(&rcu_sched_state);
16445         __rcu_init_preempt();
16446 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
16447  
16448         /*
16449          * We don't need protection against CPU-hotplug here because
16450 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
16451 index e99a5234d9ed..958ac107062c 100644
16452 --- a/kernel/rcu/tree.h
16453 +++ b/kernel/rcu/tree.h
16454 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
16455   */
16456  extern struct rcu_state rcu_sched_state;
16457  
16458 +#ifndef CONFIG_PREEMPT_RT_FULL
16459  extern struct rcu_state rcu_bh_state;
16460 +#endif
16461  
16462  #ifdef CONFIG_PREEMPT_RCU
16463  extern struct rcu_state rcu_preempt_state;
16464  #endif /* #ifdef CONFIG_PREEMPT_RCU */
16465  
16466 -#ifdef CONFIG_RCU_BOOST
16467  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16468  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
16469  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16470  DECLARE_PER_CPU(char, rcu_cpu_has_work);
16471 -#endif /* #ifdef CONFIG_RCU_BOOST */
16472  
16473  #ifndef RCU_TREE_NONCORE
16474  
16475 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
16476  static void __init __rcu_init_preempt(void);
16477  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
16478  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
16479 -static void invoke_rcu_callbacks_kthread(void);
16480  static bool rcu_is_callbacks_kthread(void);
16481 +static void rcu_cpu_kthread_setup(unsigned int cpu);
16482  #ifdef CONFIG_RCU_BOOST
16483 -static void rcu_preempt_do_callbacks(void);
16484  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
16485                                                  struct rcu_node *rnp);
16486  #endif /* #ifdef CONFIG_RCU_BOOST */
16487 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
16488 index 56583e764ebf..7c656f8e192f 100644
16489 --- a/kernel/rcu/tree_plugin.h
16490 +++ b/kernel/rcu/tree_plugin.h
16491 @@ -24,25 +24,10 @@
16492   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
16493   */
16494  
16495 -#include <linux/delay.h>
16496 -#include <linux/gfp.h>
16497 -#include <linux/oom.h>
16498 -#include <linux/smpboot.h>
16499 -#include "../time/tick-internal.h"
16500 -
16501  #ifdef CONFIG_RCU_BOOST
16502  
16503  #include "../locking/rtmutex_common.h"
16504  
16505 -/*
16506 - * Control variables for per-CPU and per-rcu_node kthreads.  These
16507 - * handle all flavors of RCU.
16508 - */
16509 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
16510 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16511 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16512 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
16513 -
16514  #else /* #ifdef CONFIG_RCU_BOOST */
16515  
16516  /*
16517 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
16518  
16519  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16520  
16521 +/*
16522 + * Control variables for per-CPU and per-rcu_node kthreads.  These
16523 + * handle all flavors of RCU.
16524 + */
16525 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16526 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16527 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
16528 +
16529  #ifdef CONFIG_RCU_NOCB_CPU
16530  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
16531  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
16532 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
16533         }
16534  
16535         /* Hardware IRQ handlers cannot block, complain if they get here. */
16536 -       if (in_irq() || in_serving_softirq()) {
16537 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
16538                 lockdep_rcu_suspicious(__FILE__, __LINE__,
16539                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
16540                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
16541 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
16542                 t->rcu_read_unlock_special.b.need_qs = true;
16543  }
16544  
16545 -#ifdef CONFIG_RCU_BOOST
16546 -
16547 -static void rcu_preempt_do_callbacks(void)
16548 -{
16549 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
16550 -}
16551 -
16552 -#endif /* #ifdef CONFIG_RCU_BOOST */
16553 -
16554  /*
16555   * Queue a preemptible-RCU callback for invocation after a grace period.
16556   */
16557 @@ -829,6 +813,19 @@ void exit_rcu(void)
16558  
16559  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
16560  
16561 +/*
16562 + * If boosting, set rcuc kthreads to realtime priority.
16563 + */
16564 +static void rcu_cpu_kthread_setup(unsigned int cpu)
16565 +{
16566 +#ifdef CONFIG_RCU_BOOST
16567 +       struct sched_param sp;
16568 +
16569 +       sp.sched_priority = kthread_prio;
16570 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16571 +#endif /* #ifdef CONFIG_RCU_BOOST */
16572 +}
16573 +
16574  #ifdef CONFIG_RCU_BOOST
16575  
16576  #include "../locking/rtmutex_common.h"
16577 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
16578  
16579  #endif /* #else #ifdef CONFIG_RCU_TRACE */
16580  
16581 -static void rcu_wake_cond(struct task_struct *t, int status)
16582 -{
16583 -       /*
16584 -        * If the thread is yielding, only wake it when this
16585 -        * is invoked from idle
16586 -        */
16587 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
16588 -               wake_up_process(t);
16589 -}
16590 -
16591  /*
16592   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
16593   * or ->boost_tasks, advancing the pointer to the next task in the
16594 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
16595  }
16596  
16597  /*
16598 - * Wake up the per-CPU kthread to invoke RCU callbacks.
16599 - */
16600 -static void invoke_rcu_callbacks_kthread(void)
16601 -{
16602 -       unsigned long flags;
16603 -
16604 -       local_irq_save(flags);
16605 -       __this_cpu_write(rcu_cpu_has_work, 1);
16606 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
16607 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
16608 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
16609 -                             __this_cpu_read(rcu_cpu_kthread_status));
16610 -       }
16611 -       local_irq_restore(flags);
16612 -}
16613 -
16614 -/*
16615   * Is the current CPU running the RCU-callbacks kthread?
16616   * Caller must have preemption disabled.
16617   */
16618 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
16619         return 0;
16620  }
16621  
16622 -static void rcu_kthread_do_work(void)
16623 -{
16624 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
16625 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
16626 -       rcu_preempt_do_callbacks();
16627 -}
16628 -
16629 -static void rcu_cpu_kthread_setup(unsigned int cpu)
16630 -{
16631 -       struct sched_param sp;
16632 -
16633 -       sp.sched_priority = kthread_prio;
16634 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16635 -}
16636 -
16637 -static void rcu_cpu_kthread_park(unsigned int cpu)
16638 -{
16639 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16640 -}
16641 -
16642 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
16643 -{
16644 -       return __this_cpu_read(rcu_cpu_has_work);
16645 -}
16646 -
16647 -/*
16648 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16649 - * RCU softirq used in flavors and configurations of RCU that do not
16650 - * support RCU priority boosting.
16651 - */
16652 -static void rcu_cpu_kthread(unsigned int cpu)
16653 -{
16654 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16655 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16656 -       int spincnt;
16657 -
16658 -       for (spincnt = 0; spincnt < 10; spincnt++) {
16659 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16660 -               local_bh_disable();
16661 -               *statusp = RCU_KTHREAD_RUNNING;
16662 -               this_cpu_inc(rcu_cpu_kthread_loops);
16663 -               local_irq_disable();
16664 -               work = *workp;
16665 -               *workp = 0;
16666 -               local_irq_enable();
16667 -               if (work)
16668 -                       rcu_kthread_do_work();
16669 -               local_bh_enable();
16670 -               if (*workp == 0) {
16671 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16672 -                       *statusp = RCU_KTHREAD_WAITING;
16673 -                       return;
16674 -               }
16675 -       }
16676 -       *statusp = RCU_KTHREAD_YIELDING;
16677 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16678 -       schedule_timeout_interruptible(2);
16679 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16680 -       *statusp = RCU_KTHREAD_WAITING;
16681 -}
16682 -
16683  /*
16684   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
16685   * served by the rcu_node in question.  The CPU hotplug lock is still
16686 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
16687         free_cpumask_var(cm);
16688  }
16689  
16690 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16691 -       .store                  = &rcu_cpu_kthread_task,
16692 -       .thread_should_run      = rcu_cpu_kthread_should_run,
16693 -       .thread_fn              = rcu_cpu_kthread,
16694 -       .thread_comm            = "rcuc/%u",
16695 -       .setup                  = rcu_cpu_kthread_setup,
16696 -       .park                   = rcu_cpu_kthread_park,
16697 -};
16698 -
16699  /*
16700   * Spawn boost kthreads -- called as soon as the scheduler is running.
16701   */
16702  static void __init rcu_spawn_boost_kthreads(void)
16703  {
16704         struct rcu_node *rnp;
16705 -       int cpu;
16706 -
16707 -       for_each_possible_cpu(cpu)
16708 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
16709 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16710         rcu_for_each_leaf_node(rcu_state_p, rnp)
16711                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
16712  }
16713 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
16714         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
16715  }
16716  
16717 -static void invoke_rcu_callbacks_kthread(void)
16718 -{
16719 -       WARN_ON_ONCE(1);
16720 -}
16721 -
16722  static bool rcu_is_callbacks_kthread(void)
16723  {
16724         return false;
16725 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
16726  
16727  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16728  
16729 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
16730 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
16731  
16732  /*
16733   * Check to see if any future RCU-related work will need to be done
16734 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
16735         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
16736                ? 0 : rcu_cpu_has_callbacks(NULL);
16737  }
16738 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
16739  
16740 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
16741  /*
16742   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
16743   * after it.
16744 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
16745         return cbs_ready;
16746  }
16747  
16748 +#ifndef CONFIG_PREEMPT_RT_FULL
16749 +
16750  /*
16751   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
16752   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
16753 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
16754         *nextevt = basemono + dj * TICK_NSEC;
16755         return 0;
16756  }
16757 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
16758  
16759  /*
16760   * Prepare a CPU for idle from an RCU perspective.  The first major task
16761 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
16762 index 4f6db7e6a117..ee02e1e1b3e5 100644
16763 --- a/kernel/rcu/update.c
16764 +++ b/kernel/rcu/update.c
16765 @@ -62,7 +62,7 @@
16766  #ifndef CONFIG_TINY_RCU
16767  module_param(rcu_expedited, int, 0);
16768  module_param(rcu_normal, int, 0);
16769 -static int rcu_normal_after_boot;
16770 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
16771  module_param(rcu_normal_after_boot, int, 0);
16772  #endif /* #ifndef CONFIG_TINY_RCU */
16773  
16774 @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
16775  }
16776  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
16777  
16778 -static atomic_t rcu_expedited_nesting =
16779 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
16780 +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
16781  
16782  /*
16783   * Should normal grace-period primitives be expedited?  Intended for
16784 @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
16785   */
16786  void rcu_end_inkernel_boot(void)
16787  {
16788 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
16789 -               rcu_unexpedite_gp();
16790 +       rcu_unexpedite_gp();
16791         if (rcu_normal_after_boot)
16792                 WRITE_ONCE(rcu_normal, 1);
16793  }
16794 @@ -298,6 +296,7 @@ int rcu_read_lock_held(void)
16795  }
16796  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
16797  
16798 +#ifndef CONFIG_PREEMPT_RT_FULL
16799  /**
16800   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
16801   *
16802 @@ -324,6 +323,7 @@ int rcu_read_lock_bh_held(void)
16803         return in_softirq() || irqs_disabled();
16804  }
16805  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
16806 +#endif
16807  
16808  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
16809  
16810 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
16811 index 5e59b832ae2b..7337a7f60e3f 100644
16812 --- a/kernel/sched/Makefile
16813 +++ b/kernel/sched/Makefile
16814 @@ -17,7 +17,7 @@ endif
16815  
16816  obj-y += core.o loadavg.o clock.o cputime.o
16817  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16818 -obj-y += wait.o swait.o completion.o idle.o
16819 +obj-y += wait.o swait.o swork.o completion.o idle.o
16820  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
16821  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
16822  obj-$(CONFIG_SCHEDSTATS) += stats.o
16823 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
16824 index 8d0f35debf35..b62cf6400fe0 100644
16825 --- a/kernel/sched/completion.c
16826 +++ b/kernel/sched/completion.c
16827 @@ -30,10 +30,10 @@ void complete(struct completion *x)
16828  {
16829         unsigned long flags;
16830  
16831 -       spin_lock_irqsave(&x->wait.lock, flags);
16832 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16833         x->done++;
16834 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
16835 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16836 +       swake_up_locked(&x->wait);
16837 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16838  }
16839  EXPORT_SYMBOL(complete);
16840  
16841 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
16842  {
16843         unsigned long flags;
16844  
16845 -       spin_lock_irqsave(&x->wait.lock, flags);
16846 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16847         x->done += UINT_MAX/2;
16848 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
16849 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16850 +       swake_up_all_locked(&x->wait);
16851 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16852  }
16853  EXPORT_SYMBOL(complete_all);
16854  
16855 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
16856                    long (*action)(long), long timeout, int state)
16857  {
16858         if (!x->done) {
16859 -               DECLARE_WAITQUEUE(wait, current);
16860 +               DECLARE_SWAITQUEUE(wait);
16861  
16862 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
16863 +               __prepare_to_swait(&x->wait, &wait);
16864                 do {
16865                         if (signal_pending_state(state, current)) {
16866                                 timeout = -ERESTARTSYS;
16867                                 break;
16868                         }
16869                         __set_current_state(state);
16870 -                       spin_unlock_irq(&x->wait.lock);
16871 +                       raw_spin_unlock_irq(&x->wait.lock);
16872                         timeout = action(timeout);
16873 -                       spin_lock_irq(&x->wait.lock);
16874 +                       raw_spin_lock_irq(&x->wait.lock);
16875                 } while (!x->done && timeout);
16876 -               __remove_wait_queue(&x->wait, &wait);
16877 +               __finish_swait(&x->wait, &wait);
16878                 if (!x->done)
16879                         return timeout;
16880         }
16881 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
16882  {
16883         might_sleep();
16884  
16885 -       spin_lock_irq(&x->wait.lock);
16886 +       raw_spin_lock_irq(&x->wait.lock);
16887         timeout = do_wait_for_common(x, action, timeout, state);
16888 -       spin_unlock_irq(&x->wait.lock);
16889 +       raw_spin_unlock_irq(&x->wait.lock);
16890         return timeout;
16891  }
16892  
16893 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
16894         if (!READ_ONCE(x->done))
16895                 return 0;
16896  
16897 -       spin_lock_irqsave(&x->wait.lock, flags);
16898 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16899         if (!x->done)
16900                 ret = 0;
16901         else
16902                 x->done--;
16903 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16904 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16905         return ret;
16906  }
16907  EXPORT_SYMBOL(try_wait_for_completion);
16908 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
16909          * after it's acquired the lock.
16910          */
16911         smp_rmb();
16912 -       spin_unlock_wait(&x->wait.lock);
16913 +       raw_spin_unlock_wait(&x->wait.lock);
16914         return true;
16915  }
16916  EXPORT_SYMBOL(completion_done);
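The conversion above leaves the completion API itself untouched: only the internal waitqueue becomes a simple waitqueue protected by a raw spinlock, so complete() remains safe to call from hard interrupt context on PREEMPT_RT_FULL. A minimal usage sketch, with made-up demo_* names and not part of this patch:

#include <linux/completion.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

/* Hypothetical device state, for illustration only. */
struct demo_dev {
        struct completion done;
};

static int demo_probe(struct demo_dev *dev)
{
        init_completion(&dev->done);
        return 0;
}

static irqreturn_t demo_irq(int irq, void *data)
{
        struct demo_dev *dev = data;

        /* Takes only the raw wait.lock, fine from hard IRQ context. */
        complete(&dev->done);
        return IRQ_HANDLED;
}

static int demo_wait_for_hw(struct demo_dev *dev)
{
        reinit_completion(&dev->done);
        /* ... kick off the hardware operation here ... */
        if (!wait_for_completion_timeout(&dev->done, msecs_to_jiffies(100)))
                return -ETIMEDOUT;
        return 0;
}

Note that complete_all() now wakes every waiter while holding the raw lock, which is why swake_up_all_locked() later in this patch warns when it finds more than two waiters, unless a suspend/resume transition (pm_in_action) is in progress.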
16917 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
16918 index 154fd689fe02..a6aa5801b21e 100644
16919 --- a/kernel/sched/core.c
16920 +++ b/kernel/sched/core.c
16921 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
16922   * Number of tasks to iterate in a single balance run.
16923   * Limited because this is done with IRQs disabled.
16924   */
16925 +#ifndef CONFIG_PREEMPT_RT_FULL
16926  const_debug unsigned int sysctl_sched_nr_migrate = 32;
16927 +#else
16928 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
16929 +#endif
16930  
16931  /*
16932   * period over which we average the RT time consumption, measured
16933 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
16934  
16935         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16936         rq->hrtick_timer.function = hrtick;
16937 +       rq->hrtick_timer.irqsafe = 1;
16938  }
16939  #else  /* CONFIG_SCHED_HRTICK */
16940  static inline void hrtick_clear(struct rq *rq)
16941 @@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
16942         head->lastp = &node->next;
16943  }
16944  
16945 -void wake_up_q(struct wake_q_head *head)
16946 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
16947  {
16948         struct wake_q_node *node = head->first;
16949  
16950 @@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
16951                  * wake_up_process() implies a wmb() to pair with the queueing
16952                  * in wake_q_add() so as not to miss wakeups.
16953                  */
16954 -               wake_up_process(task);
16955 +               if (sleeper)
16956 +                       wake_up_lock_sleeper(task);
16957 +               else
16958 +                       wake_up_process(task);
16959                 put_task_struct(task);
16960         }
16961  }
16962 @@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
16963                 trace_sched_wake_idle_without_ipi(cpu);
16964  }
16965  
16966 +#ifdef CONFIG_PREEMPT_LAZY
16967 +void resched_curr_lazy(struct rq *rq)
16968 +{
16969 +       struct task_struct *curr = rq->curr;
16970 +       int cpu;
16971 +
16972 +       if (!sched_feat(PREEMPT_LAZY)) {
16973 +               resched_curr(rq);
16974 +               return;
16975 +       }
16976 +
16977 +       lockdep_assert_held(&rq->lock);
16978 +
16979 +       if (test_tsk_need_resched(curr))
16980 +               return;
16981 +
16982 +       if (test_tsk_need_resched_lazy(curr))
16983 +               return;
16984 +
16985 +       set_tsk_need_resched_lazy(curr);
16986 +
16987 +       cpu = cpu_of(rq);
16988 +       if (cpu == smp_processor_id())
16989 +               return;
16990 +
16991 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
16992 +       smp_mb();
16993 +       if (!tsk_is_polling(curr))
16994 +               smp_send_reschedule(cpu);
16995 +}
16996 +#endif
16997 +
16998  void resched_cpu(int cpu)
16999  {
17000         struct rq *rq = cpu_rq(cpu);
17001 @@ -525,11 +565,14 @@ void resched_cpu(int cpu)
17002   */
17003  int get_nohz_timer_target(void)
17004  {
17005 -       int i, cpu = smp_processor_id();
17006 +       int i, cpu;
17007         struct sched_domain *sd;
17008  
17009 +       preempt_disable_rt();
17010 +       cpu = smp_processor_id();
17011 +
17012         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
17013 -               return cpu;
17014 +               goto preempt_en_rt;
17015  
17016         rcu_read_lock();
17017         for_each_domain(cpu, sd) {
17018 @@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
17019                 cpu = housekeeping_any_cpu();
17020  unlock:
17021         rcu_read_unlock();
17022 +preempt_en_rt:
17023 +       preempt_enable_rt();
17024         return cpu;
17025  }
17026  /*
17027 @@ -1100,6 +1145,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
17028  
17029         lockdep_assert_held(&p->pi_lock);
17030  
17031 +       if (__migrate_disabled(p)) {
17032 +               cpumask_copy(&p->cpus_allowed, new_mask);
17033 +               return;
17034 +       }
17035 +
17036         queued = task_on_rq_queued(p);
17037         running = task_current(rq, p);
17038  
17039 @@ -1122,6 +1172,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
17040                 set_curr_task(rq, p);
17041  }
17042  
17043 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
17044 +static DEFINE_MUTEX(sched_down_mutex);
17045 +static cpumask_t sched_down_cpumask;
17046 +
17047 +void tell_sched_cpu_down_begin(int cpu)
17048 +{
17049 +       mutex_lock(&sched_down_mutex);
17050 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
17051 +       mutex_unlock(&sched_down_mutex);
17052 +}
17053 +
17054 +void tell_sched_cpu_down_done(int cpu)
17055 +{
17056 +       mutex_lock(&sched_down_mutex);
17057 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
17058 +       mutex_unlock(&sched_down_mutex);
17059 +}
17060 +
17061 +/**
17062 + * migrate_me - try to move the current task off this cpu
17063 + *
17064 + * Used by the pin_current_cpu() code to try to get tasks
17065 + * to move off the current CPU as it is going down.
17066 + * It will only move the task if the task isn't pinned to
17067 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
17068 + * and the task is in the RUNNING state. Otherwise moving the
17069 + * task would wake it up (change its state to running) when the
17070 + * task did not expect it.
17071 + *
17072 + * Returns 1 if it succeeded in moving the current task
17073 + *         0 otherwise.
17074 + */
17075 +int migrate_me(void)
17076 +{
17077 +       struct task_struct *p = current;
17078 +       struct migration_arg arg;
17079 +       struct cpumask *cpumask;
17080 +       struct cpumask *mask;
17081 +       unsigned int dest_cpu;
17082 +       struct rq_flags rf;
17083 +       struct rq *rq;
17084 +
17085 +       /*
17086 +        * We cannot migrate tasks bound to a CPU or tasks that are
17087 +        * not running. Moving such a task would wake it up.
17088 +        */
17089 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
17090 +               return 0;
17091 +
17092 +       mutex_lock(&sched_down_mutex);
17093 +       rq = task_rq_lock(p, &rf);
17094 +
17095 +       cpumask = this_cpu_ptr(&sched_cpumasks);
17096 +       mask = &p->cpus_allowed;
17097 +
17098 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
17099 +
17100 +       if (!cpumask_weight(cpumask)) {
17101 +               /* It's only on this CPU? */
17102 +               task_rq_unlock(rq, p, &rf);
17103 +               mutex_unlock(&sched_down_mutex);
17104 +               return 0;
17105 +       }
17106 +
17107 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
17108 +
17109 +       arg.task = p;
17110 +       arg.dest_cpu = dest_cpu;
17111 +
17112 +       task_rq_unlock(rq, p, &rf);
17113 +
17114 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
17115 +       tlb_migrate_finish(p->mm);
17116 +       mutex_unlock(&sched_down_mutex);
17117 +
17118 +       return 1;
17119 +}
17120 +
17121  /*
17122   * Change a given task's CPU affinity. Migrate the thread to a
17123   * proper CPU and schedule it away if the CPU it's executing on
17124 @@ -1179,7 +1307,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
17125         }
17126  
17127         /* Can the task run on the task's current CPU? If so, we're done */
17128 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
17129 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
17130                 goto out;
17131  
17132         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
17133 @@ -1366,6 +1494,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
17134         return ret;
17135  }
17136  
17137 +static bool check_task_state(struct task_struct *p, long match_state)
17138 +{
17139 +       bool match = false;
17140 +
17141 +       raw_spin_lock_irq(&p->pi_lock);
17142 +       if (p->state == match_state || p->saved_state == match_state)
17143 +               match = true;
17144 +       raw_spin_unlock_irq(&p->pi_lock);
17145 +
17146 +       return match;
17147 +}
17148 +
17149  /*
17150   * wait_task_inactive - wait for a thread to unschedule.
17151   *
17152 @@ -1410,7 +1550,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
17153                  * is actually now running somewhere else!
17154                  */
17155                 while (task_running(rq, p)) {
17156 -                       if (match_state && unlikely(p->state != match_state))
17157 +                       if (match_state && !check_task_state(p, match_state))
17158                                 return 0;
17159                         cpu_relax();
17160                 }
17161 @@ -1425,7 +1565,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
17162                 running = task_running(rq, p);
17163                 queued = task_on_rq_queued(p);
17164                 ncsw = 0;
17165 -               if (!match_state || p->state == match_state)
17166 +               if (!match_state || p->state == match_state ||
17167 +                   p->saved_state == match_state)
17168                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
17169                 task_rq_unlock(rq, p, &rf);
17170  
17171 @@ -1680,10 +1821,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
17172  {
17173         activate_task(rq, p, en_flags);
17174         p->on_rq = TASK_ON_RQ_QUEUED;
17175 -
17176 -       /* if a worker is waking up, notify workqueue */
17177 -       if (p->flags & PF_WQ_WORKER)
17178 -               wq_worker_waking_up(p, cpu_of(rq));
17179  }
17180  
17181  /*
17182 @@ -2018,8 +2155,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
17183          */
17184         smp_mb__before_spinlock();
17185         raw_spin_lock_irqsave(&p->pi_lock, flags);
17186 -       if (!(p->state & state))
17187 +       if (!(p->state & state)) {
17188 +               /*
17189 +                * The task might be running due to a spinlock sleeper
17190 +                * wakeup. Check the saved state and set it to running
17191 +                * if the wakeup condition is true.
17192 +                */
17193 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
17194 +                       if (p->saved_state & state) {
17195 +                               p->saved_state = TASK_RUNNING;
17196 +                               success = 1;
17197 +                       }
17198 +               }
17199                 goto out;
17200 +       }
17201 +
17202 +       /*
17203 +        * If this is a regular wakeup, then we can unconditionally
17204 +        * clear the saved state of a "lock sleeper".
17205 +        */
17206 +       if (!(wake_flags & WF_LOCK_SLEEPER))
17207 +               p->saved_state = TASK_RUNNING;
17208  
17209         trace_sched_waking(p);
17210  
17211 @@ -2102,53 +2258,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
17212  }
17213  
17214  /**
17215 - * try_to_wake_up_local - try to wake up a local task with rq lock held
17216 - * @p: the thread to be awakened
17217 - * @cookie: context's cookie for pinning
17218 - *
17219 - * Put @p on the run-queue if it's not already there. The caller must
17220 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
17221 - * the current task.
17222 - */
17223 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
17224 -{
17225 -       struct rq *rq = task_rq(p);
17226 -
17227 -       if (WARN_ON_ONCE(rq != this_rq()) ||
17228 -           WARN_ON_ONCE(p == current))
17229 -               return;
17230 -
17231 -       lockdep_assert_held(&rq->lock);
17232 -
17233 -       if (!raw_spin_trylock(&p->pi_lock)) {
17234 -               /*
17235 -                * This is OK, because current is on_cpu, which avoids it being
17236 -                * picked for load-balance and preemption/IRQs are still
17237 -                * disabled avoiding further scheduler activity on it and we've
17238 -                * not yet picked a replacement task.
17239 -                */
17240 -               lockdep_unpin_lock(&rq->lock, cookie);
17241 -               raw_spin_unlock(&rq->lock);
17242 -               raw_spin_lock(&p->pi_lock);
17243 -               raw_spin_lock(&rq->lock);
17244 -               lockdep_repin_lock(&rq->lock, cookie);
17245 -       }
17246 -
17247 -       if (!(p->state & TASK_NORMAL))
17248 -               goto out;
17249 -
17250 -       trace_sched_waking(p);
17251 -
17252 -       if (!task_on_rq_queued(p))
17253 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
17254 -
17255 -       ttwu_do_wakeup(rq, p, 0, cookie);
17256 -       ttwu_stat(p, smp_processor_id(), 0);
17257 -out:
17258 -       raw_spin_unlock(&p->pi_lock);
17259 -}
17260 -
17261 -/**
17262   * wake_up_process - Wake up a specific process
17263   * @p: The process to be woken up.
17264   *
17265 @@ -2166,6 +2275,18 @@ int wake_up_process(struct task_struct *p)
17266  }
17267  EXPORT_SYMBOL(wake_up_process);
17268  
17269 +/**
17270 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
17271 + * @p: The process to be woken up.
17272 + *
17273 + * Same as wake_up_process() above, but with wake_flags=WF_LOCK_SLEEPER to
17274 + * indicate the nature of the wakeup.
17275 + */
17276 +int wake_up_lock_sleeper(struct task_struct *p)
17277 +{
17278 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
17279 +}
17280 +
17281  int wake_up_state(struct task_struct *p, unsigned int state)
17282  {
17283         return try_to_wake_up(p, state, 0);
17284 @@ -2442,6 +2563,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
17285         p->on_cpu = 0;
17286  #endif
17287         init_task_preempt_count(p);
17288 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
17289 +       task_thread_info(p)->preempt_lazy_count = 0;
17290 +#endif
17291  #ifdef CONFIG_SMP
17292         plist_node_init(&p->pushable_tasks, MAX_PRIO);
17293         RB_CLEAR_NODE(&p->pushable_dl_tasks);
17294 @@ -2770,21 +2894,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
17295         finish_arch_post_lock_switch();
17296  
17297         fire_sched_in_preempt_notifiers(current);
17298 +       /*
17299 +        * We use mmdrop_delayed() here so we don't have to do the
17300 +        * full __mmdrop() when we are the last user.
17301 +        */
17302         if (mm)
17303 -               mmdrop(mm);
17304 +               mmdrop_delayed(mm);
17305         if (unlikely(prev_state == TASK_DEAD)) {
17306                 if (prev->sched_class->task_dead)
17307                         prev->sched_class->task_dead(prev);
17308  
17309 -               /*
17310 -                * Remove function-return probe instances associated with this
17311 -                * task and put them back on the free list.
17312 -                */
17313 -               kprobe_flush_task(prev);
17314 -
17315 -               /* Task is done with its stack. */
17316 -               put_task_stack(prev);
17317 -
17318                 put_task_struct(prev);
17319         }
17320  
17321 @@ -3252,6 +3371,77 @@ static inline void schedule_debug(struct task_struct *prev)
17322         schedstat_inc(this_rq()->sched_count);
17323  }
17324  
17325 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
17326 +
17327 +void migrate_disable(void)
17328 +{
17329 +       struct task_struct *p = current;
17330 +
17331 +       if (in_atomic() || irqs_disabled()) {
17332 +#ifdef CONFIG_SCHED_DEBUG
17333 +               p->migrate_disable_atomic++;
17334 +#endif
17335 +               return;
17336 +       }
17337 +
17338 +#ifdef CONFIG_SCHED_DEBUG
17339 +       if (unlikely(p->migrate_disable_atomic)) {
17340 +               tracing_off();
17341 +               WARN_ON_ONCE(1);
17342 +       }
17343 +#endif
17344 +
17345 +       if (p->migrate_disable) {
17346 +               p->migrate_disable++;
17347 +               return;
17348 +       }
17349 +
17350 +       preempt_disable();
17351 +       preempt_lazy_disable();
17352 +       pin_current_cpu();
17353 +       p->migrate_disable = 1;
17354 +       preempt_enable();
17355 +}
17356 +EXPORT_SYMBOL(migrate_disable);
17357 +
17358 +void migrate_enable(void)
17359 +{
17360 +       struct task_struct *p = current;
17361 +
17362 +       if (in_atomic() || irqs_disabled()) {
17363 +#ifdef CONFIG_SCHED_DEBUG
17364 +               p->migrate_disable_atomic--;
17365 +#endif
17366 +               return;
17367 +       }
17368 +
17369 +#ifdef CONFIG_SCHED_DEBUG
17370 +       if (unlikely(p->migrate_disable_atomic)) {
17371 +               tracing_off();
17372 +               WARN_ON_ONCE(1);
17373 +       }
17374 +#endif
17375 +       WARN_ON_ONCE(p->migrate_disable <= 0);
17376 +
17377 +       if (p->migrate_disable > 1) {
17378 +               p->migrate_disable--;
17379 +               return;
17380 +       }
17381 +
17382 +       preempt_disable();
17383 +       /*
17384 +        * Clearing migrate_disable causes tsk_cpus_allowed to
17385 +        * show the tasks original cpu affinity.
17386 +        * show the task's original CPU affinity.
17387 +       p->migrate_disable = 0;
17388 +
17389 +       unpin_current_cpu();
17390 +       preempt_enable();
17391 +       preempt_lazy_enable();
17392 +}
17393 +EXPORT_SYMBOL(migrate_enable);
17394 +#endif
17395 +
17396  /*
17397   * Pick up the highest-prio task:
17398   */
17399 @@ -3368,19 +3558,6 @@ static void __sched notrace __schedule(bool preempt)
17400                 } else {
17401                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
17402                         prev->on_rq = 0;
17403 -
17404 -                       /*
17405 -                        * If a worker went to sleep, notify and ask workqueue
17406 -                        * whether it wants to wake up a task to maintain
17407 -                        * concurrency.
17408 -                        */
17409 -                       if (prev->flags & PF_WQ_WORKER) {
17410 -                               struct task_struct *to_wakeup;
17411 -
17412 -                               to_wakeup = wq_worker_sleeping(prev);
17413 -                               if (to_wakeup)
17414 -                                       try_to_wake_up_local(to_wakeup, cookie);
17415 -                       }
17416                 }
17417                 switch_count = &prev->nvcsw;
17418         }
17419 @@ -3390,6 +3567,7 @@ static void __sched notrace __schedule(bool preempt)
17420  
17421         next = pick_next_task(rq, prev, cookie);
17422         clear_tsk_need_resched(prev);
17423 +       clear_tsk_need_resched_lazy(prev);
17424         clear_preempt_need_resched();
17425         rq->clock_skip_update = 0;
17426  
17427 @@ -3437,9 +3615,20 @@ void __noreturn do_task_dead(void)
17428  
17429  static inline void sched_submit_work(struct task_struct *tsk)
17430  {
17431 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
17432 +       if (!tsk->state)
17433                 return;
17434         /*
17435 +        * If a worker went to sleep, notify and ask workqueue whether
17436 +        * it wants to wake up a task to maintain concurrency.
17437 +        */
17438 +       if (tsk->flags & PF_WQ_WORKER)
17439 +               wq_worker_sleeping(tsk);
17440 +
17441 +
17442 +       if (tsk_is_pi_blocked(tsk))
17443 +               return;
17444 +
17445 +       /*
17446          * If we are going to sleep and we have plugged IO queued,
17447          * make sure to submit it to avoid deadlocks.
17448          */
17449 @@ -3447,6 +3636,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
17450                 blk_schedule_flush_plug(tsk);
17451  }
17452  
17453 +static void sched_update_worker(struct task_struct *tsk)
17454 +{
17455 +       if (tsk->flags & PF_WQ_WORKER)
17456 +               wq_worker_running(tsk);
17457 +}
17458 +
17459  asmlinkage __visible void __sched schedule(void)
17460  {
17461         struct task_struct *tsk = current;
17462 @@ -3457,6 +3652,7 @@ asmlinkage __visible void __sched schedule(void)
17463                 __schedule(false);
17464                 sched_preempt_enable_no_resched();
17465         } while (need_resched());
17466 +       sched_update_worker(tsk);
17467  }
17468  EXPORT_SYMBOL(schedule);
17469  
17470 @@ -3520,6 +3716,30 @@ static void __sched notrace preempt_schedule_common(void)
17471         } while (need_resched());
17472  }
17473  
17474 +#ifdef CONFIG_PREEMPT_LAZY
17475 +/*
17476 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it
17477 + * was set by an RT task. Otherwise we try to avoid being scheduled out as
17478 + * long as the preempt_lazy_count counter is > 0.
17479 + */
17480 +static __always_inline int preemptible_lazy(void)
17481 +{
17482 +       if (test_thread_flag(TIF_NEED_RESCHED))
17483 +               return 1;
17484 +       if (current_thread_info()->preempt_lazy_count)
17485 +               return 0;
17486 +       return 1;
17487 +}
17488 +
17489 +#else
17490 +
17491 +static inline int preemptible_lazy(void)
17492 +{
17493 +       return 1;
17494 +}
17495 +
17496 +#endif
17497 +
17498  #ifdef CONFIG_PREEMPT
17499  /*
17500   * this is the entry point to schedule() from in-kernel preemption
17501 @@ -3534,7 +3754,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
17502          */
17503         if (likely(!preemptible()))
17504                 return;
17505 -
17506 +       if (!preemptible_lazy())
17507 +               return;
17508         preempt_schedule_common();
17509  }
17510  NOKPROBE_SYMBOL(preempt_schedule);
17511 @@ -3561,6 +3782,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
17512         if (likely(!preemptible()))
17513                 return;
17514  
17515 +       if (!preemptible_lazy())
17516 +               return;
17517 +
17518         do {
17519                 /*
17520                  * Because the function tracer can trace preempt_count_sub()
17521 @@ -3583,7 +3807,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
17522                  * an infinite recursion.
17523                  */
17524                 prev_ctx = exception_enter();
17525 +               /*
17526 +                * The add/subtract must not be traced by the function
17527 +                * tracer. But we still want to account for the
17528 +                * preempt-off latency tracer. Since the _notrace versions
17529 +                * of add/subtract skip the latency-tracer accounting,
17530 +                * we must force it manually.
17531 +                */
17532 +               start_critical_timings();
17533                 __schedule(true);
17534 +               stop_critical_timings();
17535                 exception_exit(prev_ctx);
17536  
17537                 preempt_latency_stop(1);
17538 @@ -4939,6 +5172,7 @@ int __cond_resched_lock(spinlock_t *lock)
17539  }
17540  EXPORT_SYMBOL(__cond_resched_lock);
17541  
17542 +#ifndef CONFIG_PREEMPT_RT_FULL
17543  int __sched __cond_resched_softirq(void)
17544  {
17545         BUG_ON(!in_softirq());
17546 @@ -4952,6 +5186,7 @@ int __sched __cond_resched_softirq(void)
17547         return 0;
17548  }
17549  EXPORT_SYMBOL(__cond_resched_softirq);
17550 +#endif
17551  
17552  /**
17553   * yield - yield the current processor to other threads.
17554 @@ -5315,7 +5550,9 @@ void init_idle(struct task_struct *idle, int cpu)
17555  
17556         /* Set the preempt count _outside_ the spinlocks! */
17557         init_idle_preempt_count(idle, cpu);
17558 -
17559 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
17560 +       task_thread_info(idle)->preempt_lazy_count = 0;
17561 +#endif
17562         /*
17563          * The idle tasks have their own, simple scheduling class:
17564          */
17565 @@ -5458,6 +5695,8 @@ void sched_setnuma(struct task_struct *p, int nid)
17566  #endif /* CONFIG_NUMA_BALANCING */
17567  
17568  #ifdef CONFIG_HOTPLUG_CPU
17569 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
17570 +
17571  /*
17572   * Ensures that the idle task is using init_mm right before its cpu goes
17573   * offline.
17574 @@ -5472,7 +5711,12 @@ void idle_task_exit(void)
17575                 switch_mm_irqs_off(mm, &init_mm, current);
17576                 finish_arch_post_lock_switch();
17577         }
17578 -       mmdrop(mm);
17579 +       /*
17580 +        * Defer the cleanup to a live CPU. On RT we can neither
17581 +        * call mmdrop() nor mmdrop_delayed() from here.
17582 +        */
17583 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
17584 +
17585  }
17586  
17587  /*
17588 @@ -7418,6 +7662,10 @@ int sched_cpu_dying(unsigned int cpu)
17589         update_max_interval();
17590         nohz_balance_exit_idle(cpu);
17591         hrtick_clear(rq);
17592 +       if (per_cpu(idle_last_mm, cpu)) {
17593 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
17594 +               per_cpu(idle_last_mm, cpu) = NULL;
17595 +       }
17596         return 0;
17597  }
17598  #endif
17599 @@ -7698,7 +7946,7 @@ void __init sched_init(void)
17600  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
17601  static inline int preempt_count_equals(int preempt_offset)
17602  {
17603 -       int nested = preempt_count() + rcu_preempt_depth();
17604 +       int nested = preempt_count() + sched_rcu_preempt_depth();
17605  
17606         return (nested == preempt_offset);
17607  }
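The migrate_disable()/migrate_enable() pair introduced above pins the current task to its CPU without disabling preemption: the section stays preemptible and may even block on RT's sleeping spinlocks, it just cannot be migrated. A hypothetical usage sketch (demo_* names invented; the declarations are assumed to come from linux/preempt.h, where the RT series places them):

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/printk.h>

static void demo_cpu_local_setup(void)
{
        int cpu;

        /*
         * Stay on this CPU for the whole section. The task may still
         * be preempted or sleep on an RT lock, but it will not be
         * migrated, so "cpu" keeps naming the CPU we actually run on.
         */
        migrate_disable();
        cpu = smp_processor_id();
        pr_info("demo: slow CPU-local setup on CPU %d\n", cpu);
        /* ... e.g. program a per-CPU hardware resource for this cpu ... */
        migrate_enable();
}

Compared with get_cpu()/put_cpu(), the section adds no preempt-off latency, at the price of the nesting counter and the pin_current_cpu() hotplug handshake handled above.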
17608 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
17609 index c95c5122b105..e00accf92a4b 100644
17610 --- a/kernel/sched/deadline.c
17611 +++ b/kernel/sched/deadline.c
17612 @@ -687,6 +687,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
17613  
17614         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17615         timer->function = dl_task_timer;
17616 +       timer->irqsafe = 1;
17617  }
17618  
17619  static
17620 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
17621 index fa178b62ea79..935224123441 100644
17622 --- a/kernel/sched/debug.c
17623 +++ b/kernel/sched/debug.c
17624 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
17625         P(rt_throttled);
17626         PN(rt_time);
17627         PN(rt_runtime);
17628 +#ifdef CONFIG_SMP
17629 +       P(rt_nr_migratory);
17630 +#endif
17631  
17632  #undef PN
17633  #undef P
17634 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
17635  #endif
17636         P(policy);
17637         P(prio);
17638 +#ifdef CONFIG_PREEMPT_RT_FULL
17639 +       P(migrate_disable);
17640 +#endif
17641 +       P(nr_cpus_allowed);
17642  #undef PN_SCHEDSTAT
17643  #undef PN
17644  #undef __PN
17645 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
17646 index c242944f5cbd..4aeb2e2e41bc 100644
17647 --- a/kernel/sched/fair.c
17648 +++ b/kernel/sched/fair.c
17649 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
17650         ideal_runtime = sched_slice(cfs_rq, curr);
17651         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
17652         if (delta_exec > ideal_runtime) {
17653 -               resched_curr(rq_of(cfs_rq));
17654 +               resched_curr_lazy(rq_of(cfs_rq));
17655                 /*
17656                  * The current task ran long enough, ensure it doesn't get
17657                  * re-elected due to buddy favours.
17658 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
17659                 return;
17660  
17661         if (delta > ideal_runtime)
17662 -               resched_curr(rq_of(cfs_rq));
17663 +               resched_curr_lazy(rq_of(cfs_rq));
17664  }
17665  
17666  static void
17667 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
17668          * validating it and just reschedule.
17669          */
17670         if (queued) {
17671 -               resched_curr(rq_of(cfs_rq));
17672 +               resched_curr_lazy(rq_of(cfs_rq));
17673                 return;
17674         }
17675         /*
17676 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
17677          * hierarchy can be throttled
17678          */
17679         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
17680 -               resched_curr(rq_of(cfs_rq));
17681 +               resched_curr_lazy(rq_of(cfs_rq));
17682  }
17683  
17684  static __always_inline
17685 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
17686  
17687                 if (delta < 0) {
17688                         if (rq->curr == p)
17689 -                               resched_curr(rq);
17690 +                               resched_curr_lazy(rq);
17691                         return;
17692                 }
17693                 hrtick_start(rq, delta);
17694 @@ -5905,7 +5905,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
17695         return;
17696  
17697  preempt:
17698 -       resched_curr(rq);
17699 +       resched_curr_lazy(rq);
17700         /*
17701          * Only set the backward buddy when the current task is still
17702          * on the rq. This can happen when a wakeup gets interleaved
17703 @@ -8631,7 +8631,7 @@ static void task_fork_fair(struct task_struct *p)
17704                  * 'current' within the tree based on its new key value.
17705                  */
17706                 swap(curr->vruntime, se->vruntime);
17707 -               resched_curr(rq);
17708 +               resched_curr_lazy(rq);
17709         }
17710  
17711         se->vruntime -= cfs_rq->min_vruntime;
17712 @@ -8655,7 +8655,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
17713          */
17714         if (rq->curr == p) {
17715                 if (p->prio > oldprio)
17716 -                       resched_curr(rq);
17717 +                       resched_curr_lazy(rq);
17718         } else
17719                 check_preempt_curr(rq, p, 0);
17720  }
17721 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
17722 index 69631fa46c2f..6d28fcd08872 100644
17723 --- a/kernel/sched/features.h
17724 +++ b/kernel/sched/features.h
17725 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
17726   */
17727  SCHED_FEAT(NONTASK_CAPACITY, true)
17728  
17729 +#ifdef CONFIG_PREEMPT_RT_FULL
17730 +SCHED_FEAT(TTWU_QUEUE, false)
17731 +# ifdef CONFIG_PREEMPT_LAZY
17732 +SCHED_FEAT(PREEMPT_LAZY, true)
17733 +# endif
17734 +#else
17735 +
17736  /*
17737   * Queue remote wakeups on the target CPU and process them
17738   * using the scheduler IPI. Reduces rq->lock contention/bounces.
17739   */
17740  SCHED_FEAT(TTWU_QUEUE, true)
17741 +#endif
17742  
17743  #ifdef HAVE_RT_PUSH_IPI
17744  /*
17745 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
17746 index f139f22ce30d..b0691f4e7d49 100644
17747 --- a/kernel/sched/rt.c
17748 +++ b/kernel/sched/rt.c
17749 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
17750  
17751         hrtimer_init(&rt_b->rt_period_timer,
17752                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17753 +       rt_b->rt_period_timer.irqsafe = 1;
17754         rt_b->rt_period_timer.function = sched_rt_period_timer;
17755  }
17756  
17757 @@ -101,6 +102,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
17758         rt_rq->push_cpu = nr_cpu_ids;
17759         raw_spin_lock_init(&rt_rq->push_lock);
17760         init_irq_work(&rt_rq->push_work, push_irq_work_func);
17761 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
17762  #endif
17763  #endif /* CONFIG_SMP */
17764         /* We start is dequeued state, because no RT tasks are queued */
17765 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
17766 index 055f935d4421..19324ac27026 100644
17767 --- a/kernel/sched/sched.h
17768 +++ b/kernel/sched/sched.h
17769 @@ -1163,6 +1163,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
17770  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
17771  #define WF_FORK                0x02            /* child wakeup after fork */
17772  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
17773 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
17774  
17775  /*
17776   * To aid in avoiding the subversion of "niceness" due to uneven distribution
17777 @@ -1346,6 +1347,15 @@ extern void init_sched_fair_class(void);
17778  extern void resched_curr(struct rq *rq);
17779  extern void resched_cpu(int cpu);
17780  
17781 +#ifdef CONFIG_PREEMPT_LAZY
17782 +extern void resched_curr_lazy(struct rq *rq);
17783 +#else
17784 +static inline void resched_curr_lazy(struct rq *rq)
17785 +{
17786 +       resched_curr(rq);
17787 +}
17788 +#endif
17789 +
17790  extern struct rt_bandwidth def_rt_bandwidth;
17791  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
17792  
17793 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
17794 index 82f0dff90030..ef027ff3250a 100644
17795 --- a/kernel/sched/swait.c
17796 +++ b/kernel/sched/swait.c
17797 @@ -1,5 +1,6 @@
17798  #include <linux/sched.h>
17799  #include <linux/swait.h>
17800 +#include <linux/suspend.h>
17801  
17802  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
17803                              struct lock_class_key *key)
17804 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
17805  }
17806  EXPORT_SYMBOL(swake_up_locked);
17807  
17808 +void swake_up_all_locked(struct swait_queue_head *q)
17809 +{
17810 +       struct swait_queue *curr;
17811 +       int wakes = 0;
17812 +
17813 +       while (!list_empty(&q->task_list)) {
17814 +
17815 +               curr = list_first_entry(&q->task_list, typeof(*curr),
17816 +                                       task_list);
17817 +               wake_up_process(curr->task);
17818 +               list_del_init(&curr->task_list);
17819 +               wakes++;
17820 +       }
17821 +       if (pm_in_action)
17822 +               return;
17823 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
17824 +}
17825 +EXPORT_SYMBOL(swake_up_all_locked);
17826 +
17827  void swake_up(struct swait_queue_head *q)
17828  {
17829         unsigned long flags;
17830 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
17831         if (!swait_active(q))
17832                 return;
17833  
17834 +       WARN_ON(irqs_disabled());
17835         raw_spin_lock_irq(&q->lock);
17836         list_splice_init(&q->task_list, &tmp);
17837         while (!list_empty(&tmp)) {
17838 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
17839 new file mode 100644
17840 index 000000000000..1950f40ca725
17841 --- /dev/null
17842 +++ b/kernel/sched/swork.c
17843 @@ -0,0 +1,173 @@
17844 +/*
17845 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
17846 + *
17847 + * Provides a framework for enqueuing callbacks from irq context
17848 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
17849 + */
17850 +
17851 +#include <linux/swait.h>
17852 +#include <linux/swork.h>
17853 +#include <linux/kthread.h>
17854 +#include <linux/slab.h>
17855 +#include <linux/spinlock.h>
17856 +#include <linux/export.h>
17857 +
17858 +#define SWORK_EVENT_PENDING     (1 << 0)
17859 +
17860 +static DEFINE_MUTEX(worker_mutex);
17861 +static struct sworker *glob_worker;
17862 +
17863 +struct sworker {
17864 +       struct list_head events;
17865 +       struct swait_queue_head wq;
17866 +
17867 +       raw_spinlock_t lock;
17868 +
17869 +       struct task_struct *task;
17870 +       int refs;
17871 +};
17872 +
17873 +static bool swork_readable(struct sworker *worker)
17874 +{
17875 +       bool r;
17876 +
17877 +       if (kthread_should_stop())
17878 +               return true;
17879 +
17880 +       raw_spin_lock_irq(&worker->lock);
17881 +       r = !list_empty(&worker->events);
17882 +       raw_spin_unlock_irq(&worker->lock);
17883 +
17884 +       return r;
17885 +}
17886 +
17887 +static int swork_kthread(void *arg)
17888 +{
17889 +       struct sworker *worker = arg;
17890 +
17891 +       for (;;) {
17892 +               swait_event_interruptible(worker->wq,
17893 +                                       swork_readable(worker));
17894 +               if (kthread_should_stop())
17895 +                       break;
17896 +
17897 +               raw_spin_lock_irq(&worker->lock);
17898 +               while (!list_empty(&worker->events)) {
17899 +                       struct swork_event *sev;
17900 +
17901 +                       sev = list_first_entry(&worker->events,
17902 +                                       struct swork_event, item);
17903 +                       list_del(&sev->item);
17904 +                       raw_spin_unlock_irq(&worker->lock);
17905 +
17906 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
17907 +                                                        &sev->flags));
17908 +                       sev->func(sev);
17909 +                       raw_spin_lock_irq(&worker->lock);
17910 +               }
17911 +               raw_spin_unlock_irq(&worker->lock);
17912 +       }
17913 +       return 0;
17914 +}
17915 +
17916 +static struct sworker *swork_create(void)
17917 +{
17918 +       struct sworker *worker;
17919 +
17920 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
17921 +       if (!worker)
17922 +               return ERR_PTR(-ENOMEM);
17923 +
17924 +       INIT_LIST_HEAD(&worker->events);
17925 +       raw_spin_lock_init(&worker->lock);
17926 +       init_swait_queue_head(&worker->wq);
17927 +
17928 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
17929 +       if (IS_ERR(worker->task)) {
17930 +               kfree(worker);
17931 +               return ERR_PTR(-ENOMEM);
17932 +       }
17933 +
17934 +       return worker;
17935 +}
17936 +
17937 +static void swork_destroy(struct sworker *worker)
17938 +{
17939 +       kthread_stop(worker->task);
17940 +
17941 +       WARN_ON(!list_empty(&worker->events));
17942 +       kfree(worker);
17943 +}
17944 +
17945 +/**
17946 + * swork_queue - queue swork
17947 + *
17948 + * Returns %false if @sev was already queued, %true otherwise.
17949 + *
17950 + * The work is queued and processed on an arbitrary CPU.
17951 + */
17952 +bool swork_queue(struct swork_event *sev)
17953 +{
17954 +       unsigned long flags;
17955 +
17956 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
17957 +               return false;
17958 +
17959 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
17960 +       list_add_tail(&sev->item, &glob_worker->events);
17961 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
17962 +
17963 +       swake_up(&glob_worker->wq);
17964 +       return true;
17965 +}
17966 +EXPORT_SYMBOL_GPL(swork_queue);
17967 +
17968 +/**
17969 + * swork_get - get an instance of the sworker
17970 + *
17971 + * Returns a negative error code if the initialization of the worker
17972 + * failed, %0 otherwise.
17973 + *
17974 + */
17975 +int swork_get(void)
17976 +{
17977 +       struct sworker *worker;
17978 +
17979 +       mutex_lock(&worker_mutex);
17980 +       if (!glob_worker) {
17981 +               worker = swork_create();
17982 +               if (IS_ERR(worker)) {
17983 +                       mutex_unlock(&worker_mutex);
17984 +                       return -ENOMEM;
17985 +               }
17986 +
17987 +               glob_worker = worker;
17988 +       }
17989 +
17990 +       glob_worker->refs++;
17991 +       mutex_unlock(&worker_mutex);
17992 +
17993 +       return 0;
17994 +}
17995 +EXPORT_SYMBOL_GPL(swork_get);
17996 +
17997 +/**
17998 + * swork_put - puts an instance of the sworker
17999 + *
18000 + * Will destroy the sworker thread. This function must not be called until all
18001 + * queued events have been completed.
18002 + */
18003 +void swork_put(void)
18004 +{
18005 +       mutex_lock(&worker_mutex);
18006 +
18007 +       glob_worker->refs--;
18008 +       if (glob_worker->refs > 0)
18009 +               goto out;
18010 +
18011 +       swork_destroy(glob_worker);
18012 +       glob_worker = NULL;
18013 +out:
18014 +       mutex_unlock(&worker_mutex);
18015 +}
18016 +EXPORT_SYMBOL_GPL(swork_put);
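A typical swork user takes a reference on the shared worker during initialization, queues events from (hard) IRQ context, and drops the reference on teardown. A hypothetical sketch, assuming the struct swork_event layout and the INIT_SWORK() helper that this patch series adds in include/linux/swork.h:

#include <linux/swork.h>
#include <linux/interrupt.h>
#include <linux/printk.h>

static struct swork_event demo_event;

/* Runs in the "kswork" kthread, so it may sleep and take mutexes. */
static void demo_event_fn(struct swork_event *sev)
{
        pr_info("demo: deferred callback executed\n");
}

static int demo_init(void)
{
        int ret;

        ret = swork_get();      /* creates or references the global worker */
        if (ret)
                return ret;
        INIT_SWORK(&demo_event, demo_event_fn);
        return 0;
}

static irqreturn_t demo_irq(int irq, void *data)
{
        /* RT-safe: only a raw spinlock and an swait wakeup are involved. */
        swork_queue(&demo_event);
        return IRQ_HANDLED;
}

static void demo_exit(void)
{
        swork_put();            /* last reference destroys the worker */
}

The SWORK_EVENT_PENDING bit ensures an event is queued at most once until its callback is about to run, so demo_irq() can fire repeatedly without corrupting the event list.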
18017 diff --git a/kernel/signal.c b/kernel/signal.c
18018 index 0b1415720a15..c884647951f7 100644
18019 --- a/kernel/signal.c
18020 +++ b/kernel/signal.c
18021 @@ -14,6 +14,7 @@
18022  #include <linux/export.h>
18023  #include <linux/init.h>
18024  #include <linux/sched.h>
18025 +#include <linux/sched/rt.h>
18026  #include <linux/fs.h>
18027  #include <linux/tty.h>
18028  #include <linux/binfmts.h>
18029 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
18030         return false;
18031  }
18032  
18033 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
18034 +{
18035 +       struct sigqueue *q = t->sigqueue_cache;
18036 +
18037 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
18038 +               return NULL;
18039 +       return q;
18040 +}
18041 +
18042 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
18043 +{
18044 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
18045 +               return 0;
18046 +       return 1;
18047 +}
18048 +
18049  /*
18050   * allocate a new signal queue record
18051   * - this may be called without locks if and only if t == current, otherwise an
18052   *   appropriate lock must be held to stop the target task from exiting
18053   */
18054  static struct sigqueue *
18055 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
18056 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
18057 +                   int override_rlimit, int fromslab)
18058  {
18059         struct sigqueue *q = NULL;
18060         struct user_struct *user;
18061 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
18062         if (override_rlimit ||
18063             atomic_read(&user->sigpending) <=
18064                         task_rlimit(t, RLIMIT_SIGPENDING)) {
18065 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
18066 +               if (!fromslab)
18067 +                       q = get_task_cache(t);
18068 +               if (!q)
18069 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
18070         } else {
18071                 print_dropped_signal(sig);
18072         }
18073 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
18074         return q;
18075  }
18076  
18077 +static struct sigqueue *
18078 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
18079 +                int override_rlimit)
18080 +{
18081 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
18082 +}
18083 +
18084  static void __sigqueue_free(struct sigqueue *q)
18085  {
18086         if (q->flags & SIGQUEUE_PREALLOC)
18087 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
18088         kmem_cache_free(sigqueue_cachep, q);
18089  }
18090  
18091 +static void sigqueue_free_current(struct sigqueue *q)
18092 +{
18093 +       struct user_struct *up;
18094 +
18095 +       if (q->flags & SIGQUEUE_PREALLOC)
18096 +               return;
18097 +
18098 +       up = q->user;
18099 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
18100 +               atomic_dec(&up->sigpending);
18101 +               free_uid(up);
18102 +       } else
18103 +                 __sigqueue_free(q);
18104 +}
18105 +
18106  void flush_sigqueue(struct sigpending *queue)
18107  {
18108         struct sigqueue *q;
18109 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
18110  }
18111  
18112  /*
18113 + * Called from __exit_signal. Flush tsk->pending and
18114 + * tsk->sigqueue_cache
18115 + */
18116 +void flush_task_sigqueue(struct task_struct *tsk)
18117 +{
18118 +       struct sigqueue *q;
18119 +
18120 +       flush_sigqueue(&tsk->pending);
18121 +
18122 +       q = get_task_cache(tsk);
18123 +       if (q)
18124 +               kmem_cache_free(sigqueue_cachep, q);
18125 +}
18126 +
18127 +/*
18128   * Flush all pending signals for this kthread.
18129   */
18130  void flush_signals(struct task_struct *t)
18131 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
18132  still_pending:
18133                 list_del_init(&first->list);
18134                 copy_siginfo(info, &first->info);
18135 -               __sigqueue_free(first);
18136 +               sigqueue_free_current(first);
18137         } else {
18138                 /*
18139                  * Ok, it wasn't in the queue.  This must be
18140 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
18141  {
18142         int signr;
18143  
18144 +       WARN_ON_ONCE(tsk != current);
18145 +
18146         /* We only dequeue private signals from ourselves, we don't let
18147          * signalfd steal them
18148          */
18149 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
18150   * We don't want to have recursive SIGSEGV's etc, for example,
18151   * that is why we also clear SIGNAL_UNKILLABLE.
18152   */
18153 -int
18154 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18155 +static int
18156 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18157  {
18158         unsigned long int flags;
18159         int ret, blocked, ignored;
18160 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18161         return ret;
18162  }
18163  
18164 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18165 +{
18166 +/*
18167 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
18168 + * since it can not enable preemption, and the signal code's spin_locks
18169 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
18170 + * send the signal on exit of the trap.
18171 + */
18172 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
18173 +       if (in_atomic()) {
18174 +               if (WARN_ON_ONCE(t != current))
18175 +                       return 0;
18176 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
18177 +                       return 0;
18178 +
18179 +               if (is_si_special(info)) {
18180 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
18181 +                       t->forced_info.si_signo = sig;
18182 +                       t->forced_info.si_errno = 0;
18183 +                       t->forced_info.si_code = SI_KERNEL;
18184 +                       t->forced_info.si_pid = 0;
18185 +                       t->forced_info.si_uid = 0;
18186 +               } else {
18187 +                       t->forced_info = *info;
18188 +               }
18189 +
18190 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
18191 +               return 0;
18192 +       }
18193 +#endif
18194 +       return do_force_sig_info(sig, info, t);
18195 +}
18196 +
18197  /*
18198   * Nuke all other threads in the group.
18199   */
18200 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
18201                  * Disable interrupts early to avoid deadlocks.
18202                  * See rcu_read_unlock() comment header for details.
18203                  */
18204 -               local_irq_save(*flags);
18205 +               local_irq_save_nort(*flags);
18206                 rcu_read_lock();
18207                 sighand = rcu_dereference(tsk->sighand);
18208                 if (unlikely(sighand == NULL)) {
18209                         rcu_read_unlock();
18210 -                       local_irq_restore(*flags);
18211 +                       local_irq_restore_nort(*flags);
18212                         break;
18213                 }
18214                 /*
18215 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
18216                 }
18217                 spin_unlock(&sighand->siglock);
18218                 rcu_read_unlock();
18219 -               local_irq_restore(*flags);
18220 +               local_irq_restore_nort(*flags);
18221         }
18222  
18223         return sighand;
18224 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
18225   */
18226  struct sigqueue *sigqueue_alloc(void)
18227  {
18228 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
18229 +       /* Preallocated sigqueue objects always come from the slab cache! */
18230 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
18231  
18232         if (q)
18233                 q->flags |= SIGQUEUE_PREALLOC;
18234 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
18235                 if (gstop_done && ptrace_reparented(current))
18236                         do_notify_parent_cldstop(current, false, why);
18237  
18238 -               /*
18239 -                * Don't want to allow preemption here, because
18240 -                * sys_ptrace() needs this task to be inactive.
18241 -                *
18242 -                * XXX: implement read_unlock_no_resched().
18243 -                */
18244 -               preempt_disable();
18245                 read_unlock(&tasklist_lock);
18246 -               preempt_enable_no_resched();
18247                 freezable_schedule();
18248         } else {
18249                 /*
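
The force_sig_info() hunk above defers signal delivery whenever the trap context is atomic: the siginfo is stashed in the task struct and TIF_NOTIFY_RESUME makes the exit path deliver it once sleeping locks are safe again. Below is a minimal user-space sketch of that "stash the work, set a flag, act at the next safe point" pattern; all identifiers in it (pending_work, raise_deferred, run_deferred) are invented for illustration and are not kernel APIs.

/*
 * Hedged sketch: defer work raised in a context that must not block until a
 * later safe point, mirroring the TIF_NOTIFY_RESUME idea above. User-space
 * analogue only; every identifier here is invented for the example.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pending_work {
	atomic_bool	raised;		/* analogue of TIF_NOTIFY_RESUME */
	int		payload;	/* analogue of t->forced_info    */
};

static struct pending_work pw;

/* Called from a context that may not sleep: only store and flag. */
static void raise_deferred(int payload)
{
	pw.payload = payload;
	atomic_store_explicit(&pw.raised, true, memory_order_release);
}

/* Called at the next safe point (analogue of the exit-to-user path). */
static void run_deferred(void)
{
	if (atomic_exchange_explicit(&pw.raised, false, memory_order_acq_rel))
		printf("delivering deferred work: %d\n", pw.payload);
}

int main(void)
{
	raise_deferred(42);	/* "atomic" context        */
	run_deferred();		/* "return from trap" path */
	return 0;
}
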
18250 diff --git a/kernel/softirq.c b/kernel/softirq.c
18251 index 744fa611cae0..819bd7cf5ad0 100644
18252 --- a/kernel/softirq.c
18253 +++ b/kernel/softirq.c
18254 @@ -21,10 +21,12 @@
18255  #include <linux/freezer.h>
18256  #include <linux/kthread.h>
18257  #include <linux/rcupdate.h>
18258 +#include <linux/delay.h>
18259  #include <linux/ftrace.h>
18260  #include <linux/smp.h>
18261  #include <linux/smpboot.h>
18262  #include <linux/tick.h>
18263 +#include <linux/locallock.h>
18264  #include <linux/irq.h>
18265  
18266  #define CREATE_TRACE_POINTS
18267 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
18268  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
18269  
18270  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
18271 +#ifdef CONFIG_PREEMPT_RT_FULL
18272 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
18273 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
18274 +#endif
18275  
18276  const char * const softirq_to_name[NR_SOFTIRQS] = {
18277         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
18278         "TASKLET", "SCHED", "HRTIMER", "RCU"
18279  };
18280  
18281 +#ifdef CONFIG_NO_HZ_COMMON
18282 +# ifdef CONFIG_PREEMPT_RT_FULL
18283 +
18284 +struct softirq_runner {
18285 +       struct task_struct *runner[NR_SOFTIRQS];
18286 +};
18287 +
18288 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
18289 +
18290 +static inline void softirq_set_runner(unsigned int sirq)
18291 +{
18292 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18293 +
18294 +       sr->runner[sirq] = current;
18295 +}
18296 +
18297 +static inline void softirq_clr_runner(unsigned int sirq)
18298 +{
18299 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18300 +
18301 +       sr->runner[sirq] = NULL;
18302 +}
18303 +
18304 +/*
18305 + * On preempt-rt a softirq running context might be blocked on a
18306 + * lock. There might be no other runnable task on this CPU because the
18307 + * lock owner runs on some other CPU. So we have to go into idle with
18308 + * the pending bit set. Therefore we need to check this, otherwise we
18309 + * warn about false positives, which confuse users and defeat the
18310 + * whole purpose of this test.
18311 + *
18312 + * This code is called with interrupts disabled.
18313 + */
18314 +void softirq_check_pending_idle(void)
18315 +{
18316 +       static int rate_limit;
18317 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18318 +       u32 warnpending;
18319 +       int i;
18320 +
18321 +       if (rate_limit >= 10)
18322 +               return;
18323 +
18324 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
18325 +       for (i = 0; i < NR_SOFTIRQS; i++) {
18326 +               struct task_struct *tsk = sr->runner[i];
18327 +
18328 +               /*
18329 +                * The wakeup code in rtmutex.c wakes up the task
18330 +                * _before_ it sets pi_blocked_on to NULL under
18331 +                * tsk->pi_lock. So we need to check for both: state
18332 +                * and pi_blocked_on.
18333 +                */
18334 +               if (tsk) {
18335 +                       raw_spin_lock(&tsk->pi_lock);
18336 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
18337 +                               /* Clear all bits pending in that task */
18338 +                               warnpending &= ~(tsk->softirqs_raised);
18339 +                               warnpending &= ~(1 << i);
18340 +                       }
18341 +                       raw_spin_unlock(&tsk->pi_lock);
18342 +               }
18343 +       }
18344 +
18345 +       if (warnpending) {
18346 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18347 +                      warnpending);
18348 +               rate_limit++;
18349 +       }
18350 +}
18351 +# else
18352 +/*
18353 + * On !PREEMPT_RT we just printk rate limited:
18354 + */
18355 +void softirq_check_pending_idle(void)
18356 +{
18357 +       static int rate_limit;
18358 +
18359 +       if (rate_limit < 10 &&
18360 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18361 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18362 +                      local_softirq_pending());
18363 +               rate_limit++;
18364 +       }
18365 +}
18366 +# endif
18367 +
18368 +#else /* !CONFIG_NO_HZ_COMMON */
18369 +static inline void softirq_set_runner(unsigned int sirq) { }
18370 +static inline void softirq_clr_runner(unsigned int sirq) { }
18371 +#endif
18372 +
18373  /*
18374   * we cannot loop indefinitely here to avoid userspace starvation,
18375   * but we also don't want to introduce a worst case 1/HZ latency
18376 @@ -77,6 +175,38 @@ static void wakeup_softirqd(void)
18377                 wake_up_process(tsk);
18378  }
18379  
18380 +#ifdef CONFIG_PREEMPT_RT_FULL
18381 +static void wakeup_timer_softirqd(void)
18382 +{
18383 +       /* Interrupts are disabled: no need to stop preemption */
18384 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
18385 +
18386 +       if (tsk && tsk->state != TASK_RUNNING)
18387 +               wake_up_process(tsk);
18388 +}
18389 +#endif
18390 +
18391 +static void handle_softirq(unsigned int vec_nr)
18392 +{
18393 +       struct softirq_action *h = softirq_vec + vec_nr;
18394 +       int prev_count;
18395 +
18396 +       prev_count = preempt_count();
18397 +
18398 +       kstat_incr_softirqs_this_cpu(vec_nr);
18399 +
18400 +       trace_softirq_entry(vec_nr);
18401 +       h->action(h);
18402 +       trace_softirq_exit(vec_nr);
18403 +       if (unlikely(prev_count != preempt_count())) {
18404 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18405 +                      vec_nr, softirq_to_name[vec_nr], h->action,
18406 +                      prev_count, preempt_count());
18407 +               preempt_count_set(prev_count);
18408 +       }
18409 +}
18410 +
18411 +#ifndef CONFIG_PREEMPT_RT_FULL
18412  /*
18413   * If ksoftirqd is scheduled, we do not want to process pending softirqs
18414   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
18415 @@ -88,6 +218,47 @@ static bool ksoftirqd_running(void)
18416         return tsk && (tsk->state == TASK_RUNNING);
18417  }
18418  
18419 +static inline int ksoftirqd_softirq_pending(void)
18420 +{
18421 +       return local_softirq_pending();
18422 +}
18423 +
18424 +static void handle_pending_softirqs(u32 pending)
18425 +{
18426 +       struct softirq_action *h = softirq_vec;
18427 +       int softirq_bit;
18428 +
18429 +       local_irq_enable();
18430 +
18431 +       h = softirq_vec;
18432 +
18433 +       while ((softirq_bit = ffs(pending))) {
18434 +               unsigned int vec_nr;
18435 +
18436 +               h += softirq_bit - 1;
18437 +               vec_nr = h - softirq_vec;
18438 +               handle_softirq(vec_nr);
18439 +
18440 +               h++;
18441 +               pending >>= softirq_bit;
18442 +       }
18443 +
18444 +       rcu_bh_qs();
18445 +       local_irq_disable();
18446 +}
18447 +
18448 +static void run_ksoftirqd(unsigned int cpu)
18449 +{
18450 +       local_irq_disable();
18451 +       if (ksoftirqd_softirq_pending()) {
18452 +               __do_softirq();
18453 +               local_irq_enable();
18454 +               cond_resched_rcu_qs();
18455 +               return;
18456 +       }
18457 +       local_irq_enable();
18458 +}
18459 +
18460  /*
18461   * preempt_count and SOFTIRQ_OFFSET usage:
18462   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
18463 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
18464         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
18465         unsigned long old_flags = current->flags;
18466         int max_restart = MAX_SOFTIRQ_RESTART;
18467 -       struct softirq_action *h;
18468         bool in_hardirq;
18469         __u32 pending;
18470 -       int softirq_bit;
18471  
18472         /*
18473          * Mask out PF_MEMALLOC as current task context is borrowed for the
18474 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
18475         /* Reset the pending bitmask before enabling irqs */
18476         set_softirq_pending(0);
18477  
18478 -       local_irq_enable();
18479 -
18480 -       h = softirq_vec;
18481 -
18482 -       while ((softirq_bit = ffs(pending))) {
18483 -               unsigned int vec_nr;
18484 -               int prev_count;
18485 -
18486 -               h += softirq_bit - 1;
18487 -
18488 -               vec_nr = h - softirq_vec;
18489 -               prev_count = preempt_count();
18490 -
18491 -               kstat_incr_softirqs_this_cpu(vec_nr);
18492 -
18493 -               trace_softirq_entry(vec_nr);
18494 -               h->action(h);
18495 -               trace_softirq_exit(vec_nr);
18496 -               if (unlikely(prev_count != preempt_count())) {
18497 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18498 -                              vec_nr, softirq_to_name[vec_nr], h->action,
18499 -                              prev_count, preempt_count());
18500 -                       preempt_count_set(prev_count);
18501 -               }
18502 -               h++;
18503 -               pending >>= softirq_bit;
18504 -       }
18505 -
18506 -       rcu_bh_qs();
18507 -       local_irq_disable();
18508 +       handle_pending_softirqs(pending);
18509  
18510         pending = local_softirq_pending();
18511         if (pending) {
18512 @@ -331,6 +471,309 @@ asmlinkage __visible void do_softirq(void)
18513  }
18514  
18515  /*
18516 + * This function must run with irqs disabled!
18517 + */
18518 +void raise_softirq_irqoff(unsigned int nr)
18519 +{
18520 +       __raise_softirq_irqoff(nr);
18521 +
18522 +       /*
18523 +        * If we're in an interrupt or softirq, we're done
18524 +        * (this also catches softirq-disabled code). We will
18525 +        * actually run the softirq once we return from
18526 +        * the irq or softirq.
18527 +        *
18528 +        * Otherwise we wake up ksoftirqd to make sure we
18529 +        * schedule the softirq soon.
18530 +        */
18531 +       if (!in_interrupt())
18532 +               wakeup_softirqd();
18533 +}
18534 +
18535 +void __raise_softirq_irqoff(unsigned int nr)
18536 +{
18537 +       trace_softirq_raise(nr);
18538 +       or_softirq_pending(1UL << nr);
18539 +}
18540 +
18541 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
18542 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
18543 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
18544 +
18545 +#else /* !PREEMPT_RT_FULL */
18546 +
18547 +/*
18548 + * On RT we serialize softirq execution with a cpu local lock per softirq
18549 + */
18550 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
18551 +
18552 +void __init softirq_early_init(void)
18553 +{
18554 +       int i;
18555 +
18556 +       for (i = 0; i < NR_SOFTIRQS; i++)
18557 +               local_irq_lock_init(local_softirq_locks[i]);
18558 +}
18559 +
18560 +static void lock_softirq(int which)
18561 +{
18562 +       local_lock(local_softirq_locks[which]);
18563 +}
18564 +
18565 +static void unlock_softirq(int which)
18566 +{
18567 +       local_unlock(local_softirq_locks[which]);
18568 +}
18569 +
18570 +static void do_single_softirq(int which)
18571 +{
18572 +       unsigned long old_flags = current->flags;
18573 +
18574 +       current->flags &= ~PF_MEMALLOC;
18575 +       vtime_account_irq_enter(current);
18576 +       current->flags |= PF_IN_SOFTIRQ;
18577 +       lockdep_softirq_enter();
18578 +       local_irq_enable();
18579 +       handle_softirq(which);
18580 +       local_irq_disable();
18581 +       lockdep_softirq_exit();
18582 +       current->flags &= ~PF_IN_SOFTIRQ;
18583 +       vtime_account_irq_enter(current);
18584 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
18585 +}
18586 +
18587 +/*
18588 + * Called with interrupts disabled. Process softirqs which were raised
18589 + * in current context (or on behalf of ksoftirqd).
18590 + */
18591 +static void do_current_softirqs(void)
18592 +{
18593 +       while (current->softirqs_raised) {
18594 +               int i = __ffs(current->softirqs_raised);
18595 +               unsigned int pending, mask = (1U << i);
18596 +
18597 +               current->softirqs_raised &= ~mask;
18598 +               local_irq_enable();
18599 +
18600 +               /*
18601 +                * If the lock is contended, we boost the owner to
18602 +                * process the softirq or leave the critical section
18603 +                * now.
18604 +                */
18605 +               lock_softirq(i);
18606 +               local_irq_disable();
18607 +               softirq_set_runner(i);
18608 +               /*
18609 +                * Check with the local_softirq_pending() bits,
18610 +                * Check with the local_softirq_pending() bits
18611 +                * whether we still need to process this or if someone
18612 +                */
18613 +               pending = local_softirq_pending();
18614 +               if (pending & mask) {
18615 +                       set_softirq_pending(pending & ~mask);
18616 +                       do_single_softirq(i);
18617 +               }
18618 +               softirq_clr_runner(i);
18619 +               WARN_ON(current->softirq_nestcnt != 1);
18620 +               local_irq_enable();
18621 +               unlock_softirq(i);
18622 +               local_irq_disable();
18623 +       }
18624 +}
18625 +
18626 +void __local_bh_disable(void)
18627 +{
18628 +       if (++current->softirq_nestcnt == 1)
18629 +               migrate_disable();
18630 +}
18631 +EXPORT_SYMBOL(__local_bh_disable);
18632 +
18633 +void __local_bh_enable(void)
18634 +{
18635 +       if (WARN_ON(current->softirq_nestcnt == 0))
18636 +               return;
18637 +
18638 +       local_irq_disable();
18639 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
18640 +               do_current_softirqs();
18641 +       local_irq_enable();
18642 +
18643 +       if (--current->softirq_nestcnt == 0)
18644 +               migrate_enable();
18645 +}
18646 +EXPORT_SYMBOL(__local_bh_enable);
18647 +
18648 +void _local_bh_enable(void)
18649 +{
18650 +       if (WARN_ON(current->softirq_nestcnt == 0))
18651 +               return;
18652 +       if (--current->softirq_nestcnt == 0)
18653 +               migrate_enable();
18654 +}
18655 +EXPORT_SYMBOL(_local_bh_enable);
18656 +
18657 +int in_serving_softirq(void)
18658 +{
18659 +       return current->flags & PF_IN_SOFTIRQ;
18660 +}
18661 +EXPORT_SYMBOL(in_serving_softirq);
18662 +
18663 +/* Called with preemption disabled */
18664 +static void run_ksoftirqd(unsigned int cpu)
18665 +{
18666 +       local_irq_disable();
18667 +       current->softirq_nestcnt++;
18668 +
18669 +       do_current_softirqs();
18670 +       current->softirq_nestcnt--;
18671 +       local_irq_enable();
18672 +       cond_resched_rcu_qs();
18673 +}
18674 +
18675 +/*
18676 + * Called from netif_rx_ni(). Preemption enabled, but migration
18677 + * disabled. So the cpu can't go away under us.
18678 + */
18679 +void thread_do_softirq(void)
18680 +{
18681 +       if (!in_serving_softirq() && current->softirqs_raised) {
18682 +               current->softirq_nestcnt++;
18683 +               do_current_softirqs();
18684 +               current->softirq_nestcnt--;
18685 +       }
18686 +}
18687 +
18688 +static void do_raise_softirq_irqoff(unsigned int nr)
18689 +{
18690 +       unsigned int mask;
18691 +
18692 +       mask = 1UL << nr;
18693 +
18694 +       trace_softirq_raise(nr);
18695 +       or_softirq_pending(mask);
18696 +
18697 +       /*
18698 +        * If we are not in a hard interrupt and inside a bh disabled
18699 +        * region, we simply raise the flag on current. local_bh_enable()
18700 +        * will make sure that the softirq is executed. Otherwise we
18701 +        * delegate it to ksoftirqd.
18702 +        */
18703 +       if (!in_irq() && current->softirq_nestcnt)
18704 +               current->softirqs_raised |= mask;
18705 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
18706 +               return;
18707 +
18708 +       if (mask & TIMER_SOFTIRQS)
18709 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18710 +       else
18711 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18712 +}
18713 +
18714 +static void wakeup_proper_softirq(unsigned int nr)
18715 +{
18716 +       if ((1UL << nr) & TIMER_SOFTIRQS)
18717 +               wakeup_timer_softirqd();
18718 +       else
18719 +               wakeup_softirqd();
18720 +}
18721 +
18722 +void __raise_softirq_irqoff(unsigned int nr)
18723 +{
18724 +       do_raise_softirq_irqoff(nr);
18725 +       if (!in_irq() && !current->softirq_nestcnt)
18726 +               wakeup_proper_softirq(nr);
18727 +}
18728 +
18729 +/*
18730 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
18731 + */
18732 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
18733 +{
18734 +       unsigned int mask;
18735 +
18736 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
18737 +                        !__this_cpu_read(ktimer_softirqd)))
18738 +               return;
18739 +       mask = 1UL << nr;
18740 +
18741 +       trace_softirq_raise(nr);
18742 +       or_softirq_pending(mask);
18743 +       if (mask & TIMER_SOFTIRQS)
18744 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18745 +       else
18746 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18747 +       wakeup_proper_softirq(nr);
18748 +}
18749 +
18750 +/*
18751 + * This function must run with irqs disabled!
18752 + */
18753 +void raise_softirq_irqoff(unsigned int nr)
18754 +{
18755 +       do_raise_softirq_irqoff(nr);
18756 +
18757 +       /*
18758 +        * If we're in a hard interrupt we let the irq return code deal
18759 +        * with the wakeup of ksoftirqd.
18760 +        */
18761 +       if (in_irq())
18762 +               return;
18763 +       /*
18764 +        * If we are in thread context but outside of a bh disabled
18765 +        * region, we need to wake ksoftirqd as well.
18766 +        *
18767 +        * CHECKME: Some of the places which do that could be wrapped
18768 +        * into local_bh_disable/enable pairs. Though it's unclear
18769 +        * whether this is worth the effort. To find those places just
18770 +        * raise a WARN() if the condition is met.
18771 +        */
18772 +       if (!current->softirq_nestcnt)
18773 +               wakeup_proper_softirq(nr);
18774 +}
18775 +
18776 +static inline int ksoftirqd_softirq_pending(void)
18777 +{
18778 +       return current->softirqs_raised;
18779 +}
18780 +
18781 +static inline void local_bh_disable_nort(void) { }
18782 +static inline void _local_bh_enable_nort(void) { }
18783 +
18784 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
18785 +{
18786 +       /* Take over all but timer pending softirqs when starting */
18787 +       local_irq_disable();
18788 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
18789 +       local_irq_enable();
18790 +}
18791 +
18792 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
18793 +{
18794 +       struct sched_param param = { .sched_priority = 1 };
18795 +
18796 +       sched_setscheduler(current, SCHED_FIFO, &param);
18797 +
18798 +       /* Take over timer pending softirqs when starting */
18799 +       local_irq_disable();
18800 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
18801 +       local_irq_enable();
18802 +}
18803 +
18804 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
18805 +                                                   bool online)
18806 +{
18807 +       struct sched_param param = { .sched_priority = 0 };
18808 +
18809 +       sched_setscheduler(current, SCHED_NORMAL, &param);
18810 +}
18811 +
18812 +static int ktimer_softirqd_should_run(unsigned int cpu)
18813 +{
18814 +       return current->softirqs_raised;
18815 +}
18816 +
18817 +#endif /* PREEMPT_RT_FULL */
18818 +/*
18819   * Enter an interrupt context.
18820   */
18821  void irq_enter(void)
18822 @@ -341,9 +784,9 @@ void irq_enter(void)
18823                  * Prevent raise_softirq from needlessly waking up ksoftirqd
18824                  * here, as softirq will be serviced on return from interrupt.
18825                  */
18826 -               local_bh_disable();
18827 +               local_bh_disable_nort();
18828                 tick_irq_enter();
18829 -               _local_bh_enable();
18830 +               _local_bh_enable_nort();
18831         }
18832  
18833         __irq_enter();
18834 @@ -351,6 +794,7 @@ void irq_enter(void)
18835  
18836  static inline void invoke_softirq(void)
18837  {
18838 +#ifndef CONFIG_PREEMPT_RT_FULL
18839         if (ksoftirqd_running())
18840                 return;
18841  
18842 @@ -373,6 +817,18 @@ static inline void invoke_softirq(void)
18843         } else {
18844                 wakeup_softirqd();
18845         }
18846 +#else /* PREEMPT_RT_FULL */
18847 +       unsigned long flags;
18848 +
18849 +       local_irq_save(flags);
18850 +       if (__this_cpu_read(ksoftirqd) &&
18851 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
18852 +               wakeup_softirqd();
18853 +       if (__this_cpu_read(ktimer_softirqd) &&
18854 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
18855 +               wakeup_timer_softirqd();
18856 +       local_irq_restore(flags);
18857 +#endif
18858  }
18859  
18860  static inline void tick_irq_exit(void)
18861 @@ -409,26 +865,6 @@ void irq_exit(void)
18862         trace_hardirq_exit(); /* must be last! */
18863  }
18864  
18865 -/*
18866 - * This function must run with irqs disabled!
18867 - */
18868 -inline void raise_softirq_irqoff(unsigned int nr)
18869 -{
18870 -       __raise_softirq_irqoff(nr);
18871 -
18872 -       /*
18873 -        * If we're in an interrupt or softirq, we're done
18874 -        * (this also catches softirq-disabled code). We will
18875 -        * actually run the softirq once we return from
18876 -        * the irq or softirq.
18877 -        *
18878 -        * Otherwise we wake up ksoftirqd to make sure we
18879 -        * schedule the softirq soon.
18880 -        */
18881 -       if (!in_interrupt())
18882 -               wakeup_softirqd();
18883 -}
18884 -
18885  void raise_softirq(unsigned int nr)
18886  {
18887         unsigned long flags;
18888 @@ -438,12 +874,6 @@ void raise_softirq(unsigned int nr)
18889         local_irq_restore(flags);
18890  }
18891  
18892 -void __raise_softirq_irqoff(unsigned int nr)
18893 -{
18894 -       trace_softirq_raise(nr);
18895 -       or_softirq_pending(1UL << nr);
18896 -}
18897 -
18898  void open_softirq(int nr, void (*action)(struct softirq_action *))
18899  {
18900         softirq_vec[nr].action = action;
18901 @@ -460,15 +890,45 @@ struct tasklet_head {
18902  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
18903  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
18904  
18905 +static inline void
18906 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
18907 +{
18908 +       if (tasklet_trylock(t)) {
18909 +again:
18910 +               /* We may have been preempted before tasklet_trylock
18911 +                * and __tasklet_action may have already run.
18912 +                * So double check the sched bit while the tasklet
18913 +                * is locked before adding it to the list.
18914 +                */
18915 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
18916 +                       t->next = NULL;
18917 +                       *head->tail = t;
18918 +                       head->tail = &(t->next);
18919 +                       raise_softirq_irqoff(nr);
18920 +                       tasklet_unlock(t);
18921 +               } else {
18922 +                       /* This is subtle. If we hit the corner case above,
18923 +                        * it is possible that we get preempted right here,
18924 +                        * and another task has successfully called
18925 +                        * tasklet_schedule(), then this function, and
18926 +                        * failed on the trylock. Thus we must be sure
18927 +                        * before releasing the tasklet lock, that the
18928 +                        * SCHED_BIT is clear. Otherwise the tasklet
18929 +                        * may get its SCHED_BIT set, but not added to the
18930 +                        * list
18931 +                        */
18932 +                       if (!tasklet_tryunlock(t))
18933 +                               goto again;
18934 +               }
18935 +       }
18936 +}
18937 +
18938  void __tasklet_schedule(struct tasklet_struct *t)
18939  {
18940         unsigned long flags;
18941  
18942         local_irq_save(flags);
18943 -       t->next = NULL;
18944 -       *__this_cpu_read(tasklet_vec.tail) = t;
18945 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
18946 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
18947 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
18948         local_irq_restore(flags);
18949  }
18950  EXPORT_SYMBOL(__tasklet_schedule);
18951 @@ -478,10 +938,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
18952         unsigned long flags;
18953  
18954         local_irq_save(flags);
18955 -       t->next = NULL;
18956 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
18957 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
18958 -       raise_softirq_irqoff(HI_SOFTIRQ);
18959 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
18960         local_irq_restore(flags);
18961  }
18962  EXPORT_SYMBOL(__tasklet_hi_schedule);
18963 @@ -490,82 +947,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
18964  {
18965         BUG_ON(!irqs_disabled());
18966  
18967 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
18968 -       __this_cpu_write(tasklet_hi_vec.head, t);
18969 -       __raise_softirq_irqoff(HI_SOFTIRQ);
18970 +       __tasklet_hi_schedule(t);
18971  }
18972  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
18973  
18974 -static __latent_entropy void tasklet_action(struct softirq_action *a)
18975 +void  tasklet_enable(struct tasklet_struct *t)
18976  {
18977 -       struct tasklet_struct *list;
18978 +       if (!atomic_dec_and_test(&t->count))
18979 +               return;
18980 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
18981 +               tasklet_schedule(t);
18982 +}
18983 +EXPORT_SYMBOL(tasklet_enable);
18984  
18985 -       local_irq_disable();
18986 -       list = __this_cpu_read(tasklet_vec.head);
18987 -       __this_cpu_write(tasklet_vec.head, NULL);
18988 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
18989 -       local_irq_enable();
18990 +static void __tasklet_action(struct softirq_action *a,
18991 +                            struct tasklet_struct *list)
18992 +{
18993 +       int loops = 1000000;
18994  
18995         while (list) {
18996                 struct tasklet_struct *t = list;
18997  
18998                 list = list->next;
18999  
19000 -               if (tasklet_trylock(t)) {
19001 -                       if (!atomic_read(&t->count)) {
19002 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
19003 -                                                       &t->state))
19004 -                                       BUG();
19005 -                               t->func(t->data);
19006 -                               tasklet_unlock(t);
19007 -                               continue;
19008 -                       }
19009 -                       tasklet_unlock(t);
19010 +               /*
19011 +                * Should always succeed - after a tasklet got on the
19012 +                * list (after getting the SCHED bit set from 0 to 1),
19013 +                * nothing but the tasklet softirq it got queued to can
19014 +                * lock it:
19015 +                */
19016 +               if (!tasklet_trylock(t)) {
19017 +                       WARN_ON(1);
19018 +                       continue;
19019                 }
19020  
19021 -               local_irq_disable();
19022                 t->next = NULL;
19023 -               *__this_cpu_read(tasklet_vec.tail) = t;
19024 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
19025 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
19026 -               local_irq_enable();
19027 +
19028 +               /*
19029 +                * If we cannot handle the tasklet because it's disabled,
19030 +                * mark it as pending. tasklet_enable() will later
19031 +                * re-schedule the tasklet.
19032 +                */
19033 +               if (unlikely(atomic_read(&t->count))) {
19034 +out_disabled:
19035 +                       /* implicit unlock: */
19036 +                       wmb();
19037 +                       t->state = TASKLET_STATEF_PENDING;
19038 +                       continue;
19039 +               }
19040 +
19041 +               /*
19042 +                * From this point on the tasklet might be rescheduled
19043 +                * on another CPU, but it can only be added to another
19044 +                * CPU's tasklet list if we unlock the tasklet (which we
19045 +                * don't do yet).
19046 +                */
19047 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
19048 +                       WARN_ON(1);
19049 +
19050 +again:
19051 +               t->func(t->data);
19052 +
19053 +               /*
19054 +                * Try to unlock the tasklet. We must use cmpxchg, because
19055 +                * another CPU might have scheduled or disabled the tasklet.
19056 +                * We only allow the STATE_RUN -> 0 transition here.
19057 +                */
19058 +               while (!tasklet_tryunlock(t)) {
19059 +                       /*
19060 +                        * If it got disabled meanwhile, bail out:
19061 +                        */
19062 +                       if (atomic_read(&t->count))
19063 +                               goto out_disabled;
19064 +                       /*
19065 +                        * If it got scheduled meanwhile, re-execute
19066 +                        * the tasklet function:
19067 +                        */
19068 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
19069 +                               goto again;
19070 +                       if (!--loops) {
19071 +                               printk("hm, tasklet state: %08lx\n", t->state);
19072 +                               WARN_ON(1);
19073 +                               tasklet_unlock(t);
19074 +                               break;
19075 +                       }
19076 +               }
19077         }
19078  }
19079  
19080 +static void tasklet_action(struct softirq_action *a)
19081 +{
19082 +       struct tasklet_struct *list;
19083 +
19084 +       local_irq_disable();
19085 +
19086 +       list = __this_cpu_read(tasklet_vec.head);
19087 +       __this_cpu_write(tasklet_vec.head, NULL);
19088 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
19089 +
19090 +       local_irq_enable();
19091 +
19092 +       __tasklet_action(a, list);
19093 +}
19094 +
19095  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
19096  {
19097         struct tasklet_struct *list;
19098  
19099         local_irq_disable();
19100 +
19101         list = __this_cpu_read(tasklet_hi_vec.head);
19102         __this_cpu_write(tasklet_hi_vec.head, NULL);
19103         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
19104 +
19105         local_irq_enable();
19106  
19107 -       while (list) {
19108 -               struct tasklet_struct *t = list;
19109 -
19110 -               list = list->next;
19111 -
19112 -               if (tasklet_trylock(t)) {
19113 -                       if (!atomic_read(&t->count)) {
19114 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
19115 -                                                       &t->state))
19116 -                                       BUG();
19117 -                               t->func(t->data);
19118 -                               tasklet_unlock(t);
19119 -                               continue;
19120 -                       }
19121 -                       tasklet_unlock(t);
19122 -               }
19123 -
19124 -               local_irq_disable();
19125 -               t->next = NULL;
19126 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
19127 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
19128 -               __raise_softirq_irqoff(HI_SOFTIRQ);
19129 -               local_irq_enable();
19130 -       }
19131 +       __tasklet_action(a, list);
19132  }
19133  
19134  void tasklet_init(struct tasklet_struct *t,
19135 @@ -586,7 +1083,7 @@ void tasklet_kill(struct tasklet_struct *t)
19136  
19137         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
19138                 do {
19139 -                       yield();
19140 +                       msleep(1);
19141                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
19142         }
19143         tasklet_unlock_wait(t);
19144 @@ -660,25 +1157,26 @@ void __init softirq_init(void)
19145         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
19146  }
19147  
19148 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
19149 +void tasklet_unlock_wait(struct tasklet_struct *t)
19150 +{
19151 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
19152 +               /*
19153 +                * Hack for now to avoid this busy-loop:
19154 +                */
19155 +#ifdef CONFIG_PREEMPT_RT_FULL
19156 +               msleep(1);
19157 +#else
19158 +               barrier();
19159 +#endif
19160 +       }
19161 +}
19162 +EXPORT_SYMBOL(tasklet_unlock_wait);
19163 +#endif
19164 +
19165  static int ksoftirqd_should_run(unsigned int cpu)
19166  {
19167 -       return local_softirq_pending();
19168 -}
19169 -
19170 -static void run_ksoftirqd(unsigned int cpu)
19171 -{
19172 -       local_irq_disable();
19173 -       if (local_softirq_pending()) {
19174 -               /*
19175 -                * We can safely run softirq on inline stack, as we are not deep
19176 -                * in the task stack here.
19177 -                */
19178 -               __do_softirq();
19179 -               local_irq_enable();
19180 -               cond_resched_rcu_qs();
19181 -               return;
19182 -       }
19183 -       local_irq_enable();
19184 +       return ksoftirqd_softirq_pending();
19185  }
19186  
19187  #ifdef CONFIG_HOTPLUG_CPU
19188 @@ -745,17 +1243,31 @@ static int takeover_tasklets(unsigned int cpu)
19189  
19190  static struct smp_hotplug_thread softirq_threads = {
19191         .store                  = &ksoftirqd,
19192 +       .setup                  = ksoftirqd_set_sched_params,
19193         .thread_should_run      = ksoftirqd_should_run,
19194         .thread_fn              = run_ksoftirqd,
19195         .thread_comm            = "ksoftirqd/%u",
19196  };
19197  
19198 +#ifdef CONFIG_PREEMPT_RT_FULL
19199 +static struct smp_hotplug_thread softirq_timer_threads = {
19200 +       .store                  = &ktimer_softirqd,
19201 +       .setup                  = ktimer_softirqd_set_sched_params,
19202 +       .cleanup                = ktimer_softirqd_clr_sched_params,
19203 +       .thread_should_run      = ktimer_softirqd_should_run,
19204 +       .thread_fn              = run_ksoftirqd,
19205 +       .thread_comm            = "ktimersoftd/%u",
19206 +};
19207 +#endif
19208 +
19209  static __init int spawn_ksoftirqd(void)
19210  {
19211         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
19212                                   takeover_tasklets);
19213         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
19214 -
19215 +#ifdef CONFIG_PREEMPT_RT_FULL
19216 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
19217 +#endif
19218         return 0;
19219  }
19220  early_initcall(spawn_ksoftirqd);
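
The softirq rework above gives PREEMPT_RT_FULL a per-CPU local lock per softirq vector and lets a task process only the bits it raised (do_current_softirqs()), while the timer and hrtimer vectors move to a separate ktimersoftd thread. The sketch below shows only the per-vector "take the lock, run the handler for that bit" dispatch loop in user space; it is simplified (the real loop also re-checks local_softirq_pending() under the lock), pthread mutexes stand in for the local locks, and every name is illustrative.

/*
 * Hedged user-space sketch of the dispatch loop in do_current_softirqs():
 * pick the lowest raised bit, take that vector's lock, run its handler.
 * pthread mutexes stand in for the per-CPU local_softirq_locks; not kernel
 * code, and deliberately simplified.
 */
#include <pthread.h>
#include <stdio.h>
#include <strings.h>			/* ffs() */

#define NR_VECS 10

static pthread_mutex_t vec_lock[NR_VECS];
static unsigned int pending;		/* analogue of current->softirqs_raised */

static void handle_vec(int i)
{
	printf("running softirq vector %d\n", i);
}

static void do_current_softirqs_sketch(void)
{
	while (pending) {
		int i = ffs(pending) - 1;
		unsigned int mask = 1U << i;

		pending &= ~mask;
		pthread_mutex_lock(&vec_lock[i]);	/* lock_softirq(i)     */
		handle_vec(i);				/* do_single_softirq() */
		pthread_mutex_unlock(&vec_lock[i]);	/* unlock_softirq(i)   */
	}
}

int main(void)
{
	for (int i = 0; i < NR_VECS; i++)
		pthread_mutex_init(&vec_lock[i], NULL);

	pending = (1U << 1) | (1U << 7);	/* pretend two vectors were raised */
	do_current_softirqs_sketch();
	return 0;
}
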
19221 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
19222 index ec9ab2f01489..8b89dbedeaff 100644
19223 --- a/kernel/stop_machine.c
19224 +++ b/kernel/stop_machine.c
19225 @@ -36,7 +36,7 @@ struct cpu_stop_done {
19226  struct cpu_stopper {
19227         struct task_struct      *thread;
19228  
19229 -       spinlock_t              lock;
19230 +       raw_spinlock_t          lock;
19231         bool                    enabled;        /* is this stopper enabled? */
19232         struct list_head        works;          /* list of pending works */
19233  
19234 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
19235         unsigned long flags;
19236         bool enabled;
19237  
19238 -       spin_lock_irqsave(&stopper->lock, flags);
19239 +       raw_spin_lock_irqsave(&stopper->lock, flags);
19240         enabled = stopper->enabled;
19241         if (enabled)
19242                 __cpu_stop_queue_work(stopper, work);
19243         else if (work->done)
19244                 cpu_stop_signal_done(work->done);
19245 -       spin_unlock_irqrestore(&stopper->lock, flags);
19246  
19247 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
19248         return enabled;
19249  }
19250  
19251 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
19252         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
19253         int err;
19254  retry:
19255 -       spin_lock_irq(&stopper1->lock);
19256 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
19257 +       raw_spin_lock_irq(&stopper1->lock);
19258 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
19259  
19260         err = -ENOENT;
19261         if (!stopper1->enabled || !stopper2->enabled)
19262 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
19263         __cpu_stop_queue_work(stopper1, work1);
19264         __cpu_stop_queue_work(stopper2, work2);
19265  unlock:
19266 -       spin_unlock(&stopper2->lock);
19267 -       spin_unlock_irq(&stopper1->lock);
19268 +       raw_spin_unlock(&stopper2->lock);
19269 +       raw_spin_unlock_irq(&stopper1->lock);
19270  
19271         if (unlikely(err == -EDEADLK)) {
19272                 while (stop_cpus_in_progress)
19273 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
19274         unsigned long flags;
19275         int run;
19276  
19277 -       spin_lock_irqsave(&stopper->lock, flags);
19278 +       raw_spin_lock_irqsave(&stopper->lock, flags);
19279         run = !list_empty(&stopper->works);
19280 -       spin_unlock_irqrestore(&stopper->lock, flags);
19281 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
19282         return run;
19283  }
19284  
19285 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
19286  
19287  repeat:
19288         work = NULL;
19289 -       spin_lock_irq(&stopper->lock);
19290 +       raw_spin_lock_irq(&stopper->lock);
19291         if (!list_empty(&stopper->works)) {
19292                 work = list_first_entry(&stopper->works,
19293                                         struct cpu_stop_work, list);
19294                 list_del_init(&work->list);
19295         }
19296 -       spin_unlock_irq(&stopper->lock);
19297 +       raw_spin_unlock_irq(&stopper->lock);
19298  
19299         if (work) {
19300                 cpu_stop_fn_t fn = work->fn;
19301 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
19302                 struct cpu_stop_done *done = work->done;
19303                 int ret;
19304  
19305 +               /* XXX */
19306 +
19307                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
19308                 preempt_count_inc();
19309                 ret = fn(arg);
19310 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
19311         for_each_possible_cpu(cpu) {
19312                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
19313  
19314 -               spin_lock_init(&stopper->lock);
19315 +               raw_spin_lock_init(&stopper->lock);
19316                 INIT_LIST_HEAD(&stopper->works);
19317         }
19318  
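
The stop_machine.c hunks convert the stopper lock from spinlock_t to raw_spinlock_t: on PREEMPT_RT a spinlock_t becomes a sleeping rtmutex, but the stopper work list is manipulated from contexts that must stay non-preemptible, so it needs a lock that really spins with interrupts off. A small kernel-style fragment of the resulting pattern is sketched below; the struct is abbreviated, the function name is invented, and it assumes <linux/spinlock.h> and <linux/list.h>, so it is an in-kernel sketch only.

/*
 * Hedged kernel-style fragment of the conversion above: the declaration and
 * the lock/unlock calls change, the locking pattern itself stays the same.
 */
struct cpu_stopper_sketch {
	raw_spinlock_t		lock;		/* was: spinlock_t lock; */
	struct list_head	works;
};

static void queue_stop_work_sketch(struct cpu_stopper_sketch *stopper,
				   struct list_head *work)
{
	unsigned long flags;

	/* was: spin_lock_irqsave() / spin_unlock_irqrestore() */
	raw_spin_lock_irqsave(&stopper->lock, flags);
	list_add_tail(work, &stopper->works);
	raw_spin_unlock_irqrestore(&stopper->lock, flags);
}
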
19319 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
19320 index bb5ec425dfe0..8338b14ed3a3 100644
19321 --- a/kernel/time/hrtimer.c
19322 +++ b/kernel/time/hrtimer.c
19323 @@ -53,6 +53,7 @@
19324  #include <asm/uaccess.h>
19325  
19326  #include <trace/events/timer.h>
19327 +#include <trace/events/hist.h>
19328  
19329  #include "tick-internal.h"
19330  
19331 @@ -695,6 +696,29 @@ static void hrtimer_switch_to_hres(void)
19332         retrigger_next_event(NULL);
19333  }
19334  
19335 +#ifdef CONFIG_PREEMPT_RT_FULL
19336 +
19337 +static struct swork_event clock_set_delay_work;
19338 +
19339 +static void run_clock_set_delay(struct swork_event *event)
19340 +{
19341 +       clock_was_set();
19342 +}
19343 +
19344 +void clock_was_set_delayed(void)
19345 +{
19346 +       swork_queue(&clock_set_delay_work);
19347 +}
19348 +
19349 +static __init int create_clock_set_delay_thread(void)
19350 +{
19351 +       WARN_ON(swork_get());
19352 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
19353 +       return 0;
19354 +}
19355 +early_initcall(create_clock_set_delay_thread);
19356 +#else /* PREEMPT_RT_FULL */
19357 +
19358  static void clock_was_set_work(struct work_struct *work)
19359  {
19360         clock_was_set();
19361 @@ -710,6 +734,7 @@ void clock_was_set_delayed(void)
19362  {
19363         schedule_work(&hrtimer_work);
19364  }
19365 +#endif
19366  
19367  #else
19368  
19369 @@ -719,11 +744,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
19370  static inline void hrtimer_switch_to_hres(void) { }
19371  static inline void
19372  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
19373 -static inline int hrtimer_reprogram(struct hrtimer *timer,
19374 -                                   struct hrtimer_clock_base *base)
19375 -{
19376 -       return 0;
19377 -}
19378 +static inline void hrtimer_reprogram(struct hrtimer *timer,
19379 +                                    struct hrtimer_clock_base *base) { }
19380  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
19381  static inline void retrigger_next_event(void *arg) { }
19382  
19383 @@ -855,6 +877,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
19384  }
19385  EXPORT_SYMBOL_GPL(hrtimer_forward);
19386  
19387 +#ifdef CONFIG_PREEMPT_RT_BASE
19388 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
19389 +
19390 +/**
19391 + * hrtimer_wait_for_timer - Wait for a running timer
19392 + *
19393 + * @timer:     timer to wait for
19394 + *
19395 + * The function waits on the waitqueue of the timer base in case the
19396 + * timer's callback function is currently being executed. The
19397 + * waitqueue is woken up after the timer callback function has
19398 + * finished execution.
19399 + */
19400 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
19401 +{
19402 +       struct hrtimer_clock_base *base = timer->base;
19403 +
19404 +       if (base && base->cpu_base && !timer->irqsafe)
19405 +               wait_event(base->cpu_base->wait,
19406 +                               !(hrtimer_callback_running(timer)));
19407 +}
19408 +
19409 +#else
19410 +# define wake_up_timer_waiters(b)      do { } while (0)
19411 +#endif
19412 +
19413  /*
19414   * enqueue_hrtimer - internal function to (re)start a timer
19415   *
19416 @@ -896,6 +944,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
19417         if (!(state & HRTIMER_STATE_ENQUEUED))
19418                 return;
19419  
19420 +       if (unlikely(!list_empty(&timer->cb_entry))) {
19421 +               list_del_init(&timer->cb_entry);
19422 +               return;
19423 +       }
19424 +
19425         if (!timerqueue_del(&base->active, &timer->node))
19426                 cpu_base->active_bases &= ~(1 << base->index);
19427  
19428 @@ -991,7 +1044,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
19429         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
19430  
19431         timer_stats_hrtimer_set_start_info(timer);
19432 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19433 +       {
19434 +               ktime_t now = new_base->get_time();
19435  
19436 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
19437 +                       timer->praecox = now;
19438 +               else
19439 +                       timer->praecox = ktime_set(0, 0);
19440 +       }
19441 +#endif
19442         leftmost = enqueue_hrtimer(timer, new_base);
19443         if (!leftmost)
19444                 goto unlock;
19445 @@ -1063,7 +1125,7 @@ int hrtimer_cancel(struct hrtimer *timer)
19446  
19447                 if (ret >= 0)
19448                         return ret;
19449 -               cpu_relax();
19450 +               hrtimer_wait_for_timer(timer);
19451         }
19452  }
19453  EXPORT_SYMBOL_GPL(hrtimer_cancel);
19454 @@ -1127,6 +1189,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
19455  
19456         base = hrtimer_clockid_to_base(clock_id);
19457         timer->base = &cpu_base->clock_base[base];
19458 +       INIT_LIST_HEAD(&timer->cb_entry);
19459         timerqueue_init(&timer->node);
19460  
19461  #ifdef CONFIG_TIMER_STATS
19462 @@ -1167,6 +1230,7 @@ bool hrtimer_active(const struct hrtimer *timer)
19463                 seq = raw_read_seqcount_begin(&cpu_base->seq);
19464  
19465                 if (timer->state != HRTIMER_STATE_INACTIVE ||
19466 +                   cpu_base->running_soft == timer ||
19467                     cpu_base->running == timer)
19468                         return true;
19469  
19470 @@ -1265,10 +1329,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
19471         cpu_base->running = NULL;
19472  }
19473  
19474 +#ifdef CONFIG_PREEMPT_RT_BASE
19475 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
19476 +                                struct hrtimer_clock_base *base)
19477 +{
19478 +       int leftmost;
19479 +
19480 +       if (restart != HRTIMER_NORESTART &&
19481 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
19482 +
19483 +               leftmost = enqueue_hrtimer(timer, base);
19484 +               if (!leftmost)
19485 +                       return;
19486 +#ifdef CONFIG_HIGH_RES_TIMERS
19487 +               if (!hrtimer_is_hres_active(timer)) {
19488 +                       /*
19489 +                        * Kick to reschedule the next tick to handle the new timer
19490 +                        * on dynticks target.
19491 +                        */
19492 +                       if (base->cpu_base->nohz_active)
19493 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
19494 +               } else {
19495 +
19496 +                       hrtimer_reprogram(timer, base);
19497 +               }
19498 +#endif
19499 +       }
19500 +}
19501 +
19502 +/*
19503 + * The changes in mainline which removed the callback modes from
19504 + * hrtimer are not yet working with -rt. The non wakeup_process()
19505 + * based callbacks which involve sleeping locks need to be treated
19506 + * seperately.
19507 + */
19508 +static void hrtimer_rt_run_pending(void)
19509 +{
19510 +       enum hrtimer_restart (*fn)(struct hrtimer *);
19511 +       struct hrtimer_cpu_base *cpu_base;
19512 +       struct hrtimer_clock_base *base;
19513 +       struct hrtimer *timer;
19514 +       int index, restart;
19515 +
19516 +       local_irq_disable();
19517 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
19518 +
19519 +       raw_spin_lock(&cpu_base->lock);
19520 +
19521 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
19522 +               base = &cpu_base->clock_base[index];
19523 +
19524 +               while (!list_empty(&base->expired)) {
19525 +                       timer = list_first_entry(&base->expired,
19526 +                                                struct hrtimer, cb_entry);
19527 +
19528 +                       /*
19529 +                        * Same as the __run_hrtimer() function above,
19530 +                        * except that we run with interrupts enabled.
19531 +                        */
19532 +                       debug_deactivate(timer);
19533 +                       cpu_base->running_soft = timer;
19534 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19535 +
19536 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
19537 +                       timer_stats_account_hrtimer(timer);
19538 +                       fn = timer->function;
19539 +
19540 +                       raw_spin_unlock_irq(&cpu_base->lock);
19541 +                       restart = fn(timer);
19542 +                       raw_spin_lock_irq(&cpu_base->lock);
19543 +
19544 +                       hrtimer_rt_reprogram(restart, timer, base);
19545 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19546 +
19547 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
19548 +                       cpu_base->running_soft = NULL;
19549 +               }
19550 +       }
19551 +
19552 +       raw_spin_unlock_irq(&cpu_base->lock);
19553 +
19554 +       wake_up_timer_waiters(cpu_base);
19555 +}
19556 +
19557 +static int hrtimer_rt_defer(struct hrtimer *timer)
19558 +{
19559 +       if (timer->irqsafe)
19560 +               return 0;
19561 +
19562 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
19563 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
19564 +       return 1;
19565 +}
19566 +
19567 +#else
19568 +
19569 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
19570 +
19571 +#endif
19572 +
19573 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
19574 +
19575  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19576  {
19577         struct hrtimer_clock_base *base = cpu_base->clock_base;
19578         unsigned int active = cpu_base->active_bases;
19579 +       int raise = 0;
19580  
19581         for (; active; base++, active >>= 1) {
19582                 struct timerqueue_node *node;
19583 @@ -1284,6 +1450,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19584  
19585                         timer = container_of(node, struct hrtimer, node);
19586  
19587 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
19588 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
19589 +                               timer->praecox : hrtimer_get_expires(timer),
19590 +                               basenow)),
19591 +                           current,
19592 +                           timer->function == hrtimer_wakeup ?
19593 +                           container_of(timer, struct hrtimer_sleeper,
19594 +                               timer)->task : NULL);
19595 +
19596                         /*
19597                          * The immediate goal for using the softexpires is
19598                          * minimizing wakeups, not running timers at the
19599 @@ -1299,9 +1474,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19600                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
19601                                 break;
19602  
19603 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
19604 +                       if (!hrtimer_rt_defer(timer))
19605 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
19606 +                       else
19607 +                               raise = 1;
19608                 }
19609         }
19610 +       if (raise)
19611 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
19612  }
19613  
19614  #ifdef CONFIG_HIGH_RES_TIMERS
19615 @@ -1464,16 +1644,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
19616  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
19617  {
19618         sl->timer.function = hrtimer_wakeup;
19619 +       sl->timer.irqsafe = 1;
19620         sl->task = task;
19621  }
19622  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
19623  
19624 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
19625 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
19626 +                               unsigned long state)
19627  {
19628         hrtimer_init_sleeper(t, current);
19629  
19630         do {
19631 -               set_current_state(TASK_INTERRUPTIBLE);
19632 +               set_current_state(state);
19633                 hrtimer_start_expires(&t->timer, mode);
19634  
19635                 if (likely(t->task))
19636 @@ -1515,7 +1697,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
19637                                 HRTIMER_MODE_ABS);
19638         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
19639  
19640 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
19641 +       /* cpu_chill() does not care about restart state. */
19642 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
19643                 goto out;
19644  
19645         rmtp = restart->nanosleep.rmtp;
19646 @@ -1532,8 +1715,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
19647         return ret;
19648  }
19649  
19650 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19651 -                      const enum hrtimer_mode mode, const clockid_t clockid)
19652 +static long
19653 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19654 +                   const enum hrtimer_mode mode, const clockid_t clockid,
19655 +                   unsigned long state)
19656  {
19657         struct restart_block *restart;
19658         struct hrtimer_sleeper t;
19659 @@ -1546,7 +1731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19660  
19661         hrtimer_init_on_stack(&t.timer, clockid, mode);
19662         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
19663 -       if (do_nanosleep(&t, mode))
19664 +       if (do_nanosleep(&t, mode, state))
19665                 goto out;
19666  
19667         /* Absolute timers do not update the rmtp value and restart: */
19668 @@ -1573,6 +1758,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19669         return ret;
19670  }
19671  
19672 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19673 +                      const enum hrtimer_mode mode, const clockid_t clockid)
19674 +{
19675 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
19676 +}
19677 +
19678  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
19679                 struct timespec __user *, rmtp)
19680  {
19681 @@ -1587,6 +1778,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
19682         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
19683  }
19684  
19685 +#ifdef CONFIG_PREEMPT_RT_FULL
19686 +/*
19687 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
19688 + */
19689 +void cpu_chill(void)
19690 +{
19691 +       struct timespec tu = {
19692 +               .tv_nsec = NSEC_PER_MSEC,
19693 +       };
19694 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
19695 +
19696 +       current->flags |= PF_NOFREEZE;
19697 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
19698 +                           TASK_UNINTERRUPTIBLE);
19699 +       if (!freeze_flag)
19700 +               current->flags &= ~PF_NOFREEZE;
19701 +}
19702 +EXPORT_SYMBOL(cpu_chill);
19703 +#endif
19704 +
19705  /*
19706   * Functions related to boot-time initialization:
19707   */
19708 @@ -1598,10 +1809,14 @@ int hrtimers_prepare_cpu(unsigned int cpu)
19709         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
19710                 cpu_base->clock_base[i].cpu_base = cpu_base;
19711                 timerqueue_init_head(&cpu_base->clock_base[i].active);
19712 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
19713         }
19714  
19715         cpu_base->cpu = cpu;
19716         hrtimer_init_hres(cpu_base);
19717 +#ifdef CONFIG_PREEMPT_RT_BASE
19718 +       init_waitqueue_head(&cpu_base->wait);
19719 +#endif
19720         return 0;
19721  }
19722  
19723 @@ -1671,9 +1886,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
19724  
19725  #endif /* CONFIG_HOTPLUG_CPU */
19726  
19727 +#ifdef CONFIG_PREEMPT_RT_BASE
19728 +
19729 +static void run_hrtimer_softirq(struct softirq_action *h)
19730 +{
19731 +       hrtimer_rt_run_pending();
19732 +}
19733 +
19734 +static void hrtimers_open_softirq(void)
19735 +{
19736 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
19737 +}
19738 +
19739 +#else
19740 +static void hrtimers_open_softirq(void) { }
19741 +#endif
19742 +
19743  void __init hrtimers_init(void)
19744  {
19745         hrtimers_prepare_cpu(smp_processor_id());
19746 +       hrtimers_open_softirq();
19747  }
19748  
19749  /**
19750 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
19751 index 1d5c7204ddc9..184de6751180 100644
19752 --- a/kernel/time/itimer.c
19753 +++ b/kernel/time/itimer.c
19754 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
19755                 /* We are sharing ->siglock with it_real_fn() */
19756                 if (hrtimer_try_to_cancel(timer) < 0) {
19757                         spin_unlock_irq(&tsk->sighand->siglock);
19758 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
19759                         goto again;
19760                 }
19761                 expires = timeval_to_ktime(value->it_value);
19762 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
19763 index 555e21f7b966..a5d6435fabbb 100644
19764 --- a/kernel/time/jiffies.c
19765 +++ b/kernel/time/jiffies.c
19766 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
19767         .max_cycles     = 10,
19768  };
19769  
19770 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
19771 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
19772 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
19773  
19774  #if (BITS_PER_LONG < 64)
19775  u64 get_jiffies_64(void)
19776 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
19777         u64 ret;
19778  
19779         do {
19780 -               seq = read_seqbegin(&jiffies_lock);
19781 +               seq = read_seqcount_begin(&jiffies_seq);
19782                 ret = jiffies_64;
19783 -       } while (read_seqretry(&jiffies_lock, seq));
19784 +       } while (read_seqcount_retry(&jiffies_seq, seq));
19785         return ret;
19786  }
19787  EXPORT_SYMBOL(get_jiffies_64);
19788 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
19789 index 6df8927c58a5..05b7391bf9bd 100644
19790 --- a/kernel/time/ntp.c
19791 +++ b/kernel/time/ntp.c
19792 @@ -17,6 +17,7 @@
19793  #include <linux/module.h>
19794  #include <linux/rtc.h>
19795  #include <linux/math64.h>
19796 +#include <linux/swork.h>
19797  
19798  #include "ntp_internal.h"
19799  #include "timekeeping_internal.h"
19800 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
19801                            &sync_cmos_work, timespec64_to_jiffies(&next));
19802  }
19803  
19804 +#ifdef CONFIG_PREEMPT_RT_FULL
19805 +
19806 +static void run_clock_set_delay(struct swork_event *event)
19807 +{
19808 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19809 +}
19810 +
19811 +static struct swork_event ntp_cmos_swork;
19812 +
19813 +void ntp_notify_cmos_timer(void)
19814 +{
19815 +       swork_queue(&ntp_cmos_swork);
19816 +}
19817 +
19818 +static __init int create_cmos_delay_thread(void)
19819 +{
19820 +       WARN_ON(swork_get());
19821 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
19822 +       return 0;
19823 +}
19824 +early_initcall(create_cmos_delay_thread);
19825 +
19826 +#else
19827 +
19828  void ntp_notify_cmos_timer(void)
19829  {
19830         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19831  }
19832 +#endif /* CONFIG_PREEMPT_RT_FULL */
19833  
19834  #else
19835  void ntp_notify_cmos_timer(void) { }
19836 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
19837 index 39008d78927a..633f4eaca9e7 100644
19838 --- a/kernel/time/posix-cpu-timers.c
19839 +++ b/kernel/time/posix-cpu-timers.c
19840 @@ -3,6 +3,7 @@
19841   */
19842  
19843  #include <linux/sched.h>
19844 +#include <linux/sched/rt.h>
19845  #include <linux/posix-timers.h>
19846  #include <linux/errno.h>
19847  #include <linux/math64.h>
19848 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
19849         /*
19850          * Disarm any old timer after extracting its expiry time.
19851          */
19852 -       WARN_ON_ONCE(!irqs_disabled());
19853 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19854  
19855         ret = 0;
19856         old_incr = timer->it.cpu.incr;
19857 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
19858         /*
19859          * Now re-arm for the new expiry time.
19860          */
19861 -       WARN_ON_ONCE(!irqs_disabled());
19862 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19863         arm_timer(timer);
19864         unlock_task_sighand(p, &flags);
19865  
19866 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
19867   * already updated our counts.  We need to check if any timers fire now.
19868   * Interrupts are disabled.
19869   */
19870 -void run_posix_cpu_timers(struct task_struct *tsk)
19871 +static void __run_posix_cpu_timers(struct task_struct *tsk)
19872  {
19873         LIST_HEAD(firing);
19874         struct k_itimer *timer, *next;
19875         unsigned long flags;
19876  
19877 -       WARN_ON_ONCE(!irqs_disabled());
19878 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19879  
19880         /*
19881          * The fast path checks that there are no expired thread or thread
19882 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
19883         }
19884  }
19885  
19886 +#ifdef CONFIG_PREEMPT_RT_BASE
19887 +#include <linux/kthread.h>
19888 +#include <linux/cpu.h>
19889 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
19890 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
19891 +
19892 +static int posix_cpu_timers_thread(void *data)
19893 +{
19894 +       int cpu = (long)data;
19895 +
19896 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
19897 +
19898 +       while (!kthread_should_stop()) {
19899 +               struct task_struct *tsk = NULL;
19900 +               struct task_struct *next = NULL;
19901 +
19902 +               if (cpu_is_offline(cpu))
19903 +                       goto wait_to_die;
19904 +
19905 +               /* grab task list */
19906 +               raw_local_irq_disable();
19907 +               tsk = per_cpu(posix_timer_tasklist, cpu);
19908 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
19909 +               raw_local_irq_enable();
19910 +
19911 +               /* it's possible the list is empty, just return */
19912 +               if (!tsk) {
19913 +                       set_current_state(TASK_INTERRUPTIBLE);
19914 +                       schedule();
19915 +                       __set_current_state(TASK_RUNNING);
19916 +                       continue;
19917 +               }
19918 +
19919 +               /* Process task list */
19920 +               while (1) {
19921 +                       /* save next */
19922 +                       next = tsk->posix_timer_list;
19923 +
19924 +                       /* run the task's timers, clear its list
19925 +                        * pointer and drop the reference
19926 +                        */
19927 +                       __run_posix_cpu_timers(tsk);
19928 +                       tsk->posix_timer_list = NULL;
19929 +                       put_task_struct(tsk);
19930 +
19931 +                       /* check if this is the last on the list */
19932 +                       if (next == tsk)
19933 +                               break;
19934 +                       tsk = next;
19935 +               }
19936 +       }
19937 +       return 0;
19938 +
19939 +wait_to_die:
19940 +       /* Wait for kthread_stop */
19941 +       set_current_state(TASK_INTERRUPTIBLE);
19942 +       while (!kthread_should_stop()) {
19943 +               schedule();
19944 +               set_current_state(TASK_INTERRUPTIBLE);
19945 +       }
19946 +       __set_current_state(TASK_RUNNING);
19947 +       return 0;
19948 +}
19949 +
19950 +static inline int __fastpath_timer_check(struct task_struct *tsk)
19951 +{
19952 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
19953 +       if (unlikely(tsk->exit_state))
19954 +               return 0;
19955 +
19956 +       if (!task_cputime_zero(&tsk->cputime_expires))
19957 +                       return 1;
19958 +
19959 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
19960 +                       return 1;
19961 +
19962 +       return 0;
19963 +}
19964 +
19965 +void run_posix_cpu_timers(struct task_struct *tsk)
19966 +{
19967 +       unsigned long cpu = smp_processor_id();
19968 +       struct task_struct *tasklist;
19969 +
19970 +       BUG_ON(!irqs_disabled());
19971 +       if(!per_cpu(posix_timer_task, cpu))
19972 +               return;
19973 +       /* get per-cpu references */
19974 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
19975 +
19976 +       /* check to see if we're already queued */
19977 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
19978 +               get_task_struct(tsk);
19979 +               if (tasklist) {
19980 +                       tsk->posix_timer_list = tasklist;
19981 +               } else {
19982 +                       /*
19983 +                        * The list is terminated by a self-pointing
19984 +                        * task_struct
19985 +                        */
19986 +                       tsk->posix_timer_list = tsk;
19987 +               }
19988 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
19989 +
19990 +               wake_up_process(per_cpu(posix_timer_task, cpu));
19991 +       }
19992 +}
19993 +
19994 +/*
19995 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
19996 + * Here we can start up the necessary posix timer thread for the new CPU.
19997 + */
19998 +static int posix_cpu_thread_call(struct notifier_block *nfb,
19999 +                                unsigned long action, void *hcpu)
20000 +{
20001 +       int cpu = (long)hcpu;
20002 +       struct task_struct *p;
20003 +       struct sched_param param;
20004 +
20005 +       switch (action) {
20006 +       case CPU_UP_PREPARE:
20007 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
20008 +                                       "posixcputmr/%d",cpu);
20009 +               if (IS_ERR(p))
20010 +                       return NOTIFY_BAD;
20011 +               p->flags |= PF_NOFREEZE;
20012 +               kthread_bind(p, cpu);
20013 +               /* Must be high prio to avoid getting starved */
20014 +               param.sched_priority = MAX_RT_PRIO-1;
20015 +               sched_setscheduler(p, SCHED_FIFO, &param);
20016 +               per_cpu(posix_timer_task,cpu) = p;
20017 +               break;
20018 +       case CPU_ONLINE:
20019 +               /* Strictly unnecessary, as the first user will wake it. */
20020 +               wake_up_process(per_cpu(posix_timer_task,cpu));
20021 +               break;
20022 +#ifdef CONFIG_HOTPLUG_CPU
20023 +       case CPU_UP_CANCELED:
20024 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
20025 +               kthread_bind(per_cpu(posix_timer_task, cpu),
20026 +                            cpumask_any(cpu_online_mask));
20027 +               kthread_stop(per_cpu(posix_timer_task,cpu));
20028 +               per_cpu(posix_timer_task,cpu) = NULL;
20029 +               break;
20030 +       case CPU_DEAD:
20031 +               kthread_stop(per_cpu(posix_timer_task,cpu));
20032 +               per_cpu(posix_timer_task,cpu) = NULL;
20033 +               break;
20034 +#endif
20035 +       }
20036 +       return NOTIFY_OK;
20037 +}
20038 +
20039 +/* Register at highest priority so that task migration (migrate_all_tasks)
20040 + * happens before everything else.
20041 + */
20042 +static struct notifier_block posix_cpu_thread_notifier = {
20043 +       .notifier_call = posix_cpu_thread_call,
20044 +       .priority = 10
20045 +};
20046 +
20047 +static int __init posix_cpu_thread_init(void)
20048 +{
20049 +       void *hcpu = (void *)(long)smp_processor_id();
20050 +       /* Start one for boot CPU. */
20051 +       unsigned long cpu;
20052 +
20053 +       /* init the per-cpu posix_timer_tasklist pointers */
20054 +       for_each_possible_cpu(cpu)
20055 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
20056 +
20057 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
20058 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
20059 +       register_cpu_notifier(&posix_cpu_thread_notifier);
20060 +       return 0;
20061 +}
20062 +early_initcall(posix_cpu_thread_init);
20063 +#else /* CONFIG_PREEMPT_RT_BASE */
20064 +void run_posix_cpu_timers(struct task_struct *tsk)
20065 +{
20066 +       __run_posix_cpu_timers(tsk);
20067 +}
20068 +#endif /* CONFIG_PREEMPT_RT_BASE */
20069 +
20070  /*
20071   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
20072   * The tsk->sighand->siglock must be held by the caller.
20073 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
20074 index f2826c35e918..464a98155a0e 100644
20075 --- a/kernel/time/posix-timers.c
20076 +++ b/kernel/time/posix-timers.c
20077 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
20078  static struct pid *good_sigevent(sigevent_t * event)
20079  {
20080         struct task_struct *rtn = current->group_leader;
20081 +       int sig = event->sigev_signo;
20082  
20083         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
20084                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
20085 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
20086                 return NULL;
20087  
20088         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
20089 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
20090 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
20091 +            sig_kernel_coredump(sig)))
20092                 return NULL;
20093  
20094         return task_pid(rtn);
20095 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
20096         return overrun;
20097  }
20098  
20099 +/*
20100 + * Protected by RCU!
20101 + */
20102 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
20103 +{
20104 +#ifdef CONFIG_PREEMPT_RT_FULL
20105 +       if (kc->timer_set == common_timer_set)
20106 +               hrtimer_wait_for_timer(&timr->it.real.timer);
20107 +       else
20108 +               /* FIXME: Whacky hack for posix-cpu-timers */
20109 +               schedule_timeout(1);
20110 +#endif
20111 +}
20112 +
20113  /* Set a POSIX.1b interval timer. */
20114  /* timr->it_lock is taken. */
20115  static int
20116 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
20117         if (!timr)
20118                 return -EINVAL;
20119  
20120 +       rcu_read_lock();
20121         kc = clockid_to_kclock(timr->it_clock);
20122         if (WARN_ON_ONCE(!kc || !kc->timer_set))
20123                 error = -EINVAL;
20124 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
20125  
20126         unlock_timer(timr, flag);
20127         if (error == TIMER_RETRY) {
20128 +               timer_wait_for_callback(kc, timr);
20129                 rtn = NULL;     // We already got the old time...
20130 +               rcu_read_unlock();
20131                 goto retry;
20132         }
20133 +       rcu_read_unlock();
20134  
20135         if (old_setting && !error &&
20136             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
20137 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
20138         if (!timer)
20139                 return -EINVAL;
20140  
20141 +       rcu_read_lock();
20142         if (timer_delete_hook(timer) == TIMER_RETRY) {
20143                 unlock_timer(timer, flags);
20144 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
20145 +                                       timer);
20146 +               rcu_read_unlock();
20147                 goto retry_delete;
20148         }
20149 +       rcu_read_unlock();
20150  
20151         spin_lock(&current->sighand->siglock);
20152         list_del(&timer->list);
20153 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
20154  retry_delete:
20155         spin_lock_irqsave(&timer->it_lock, flags);
20156  
20157 -       if (timer_delete_hook(timer) == TIMER_RETRY) {
20158 +       /* On RT we can race with a deletion */
20159 +       if (!timer->it_signal) {
20160                 unlock_timer(timer, flags);
20161 +               return;
20162 +       }
20163 +
20164 +       if (timer_delete_hook(timer) == TIMER_RETRY) {
20165 +               rcu_read_lock();
20166 +               unlock_timer(timer, flags);
20167 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
20168 +                                       timer);
20169 +               rcu_read_unlock();
20170                 goto retry_delete;
20171         }
20172         list_del(&timer->list);
20173 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
20174 index 690b797f522e..fe8ba1619879 100644
20175 --- a/kernel/time/tick-broadcast-hrtimer.c
20176 +++ b/kernel/time/tick-broadcast-hrtimer.c
20177 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
20178  {
20179         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
20180         bctimer.function = bc_handler;
20181 +       bctimer.irqsafe = true;
20182         clockevents_register_device(&ce_broadcast_hrtimer);
20183  }
20184 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
20185 index 4fcd99e12aa0..5a47f2e98faf 100644
20186 --- a/kernel/time/tick-common.c
20187 +++ b/kernel/time/tick-common.c
20188 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
20189  static void tick_periodic(int cpu)
20190  {
20191         if (tick_do_timer_cpu == cpu) {
20192 -               write_seqlock(&jiffies_lock);
20193 +               raw_spin_lock(&jiffies_lock);
20194 +               write_seqcount_begin(&jiffies_seq);
20195  
20196                 /* Keep track of the next tick event */
20197                 tick_next_period = ktime_add(tick_next_period, tick_period);
20198  
20199                 do_timer(1);
20200 -               write_sequnlock(&jiffies_lock);
20201 +               write_seqcount_end(&jiffies_seq);
20202 +               raw_spin_unlock(&jiffies_lock);
20203                 update_wall_time();
20204         }
20205  
20206 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
20207                 ktime_t next;
20208  
20209                 do {
20210 -                       seq = read_seqbegin(&jiffies_lock);
20211 +                       seq = read_seqcount_begin(&jiffies_seq);
20212                         next = tick_next_period;
20213 -               } while (read_seqretry(&jiffies_lock, seq));
20214 +               } while (read_seqcount_retry(&jiffies_seq, seq));
20215  
20216                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
20217  
20218 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
20219 index 3bcb61b52f6c..66d85482a96e 100644
20220 --- a/kernel/time/tick-sched.c
20221 +++ b/kernel/time/tick-sched.c
20222 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
20223                 return;
20224  
20225         /* Reevaluate with jiffies_lock held */
20226 -       write_seqlock(&jiffies_lock);
20227 +       raw_spin_lock(&jiffies_lock);
20228 +       write_seqcount_begin(&jiffies_seq);
20229  
20230         delta = ktime_sub(now, last_jiffies_update);
20231         if (delta.tv64 >= tick_period.tv64) {
20232 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
20233                 /* Keep the tick_next_period variable up to date */
20234                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
20235         } else {
20236 -               write_sequnlock(&jiffies_lock);
20237 +               write_seqcount_end(&jiffies_seq);
20238 +               raw_spin_unlock(&jiffies_lock);
20239                 return;
20240         }
20241 -       write_sequnlock(&jiffies_lock);
20242 +       write_seqcount_end(&jiffies_seq);
20243 +       raw_spin_unlock(&jiffies_lock);
20244         update_wall_time();
20245  }
20246  
20247 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
20248  {
20249         ktime_t period;
20250  
20251 -       write_seqlock(&jiffies_lock);
20252 +       raw_spin_lock(&jiffies_lock);
20253 +       write_seqcount_begin(&jiffies_seq);
20254         /* Did we start the jiffies update yet ? */
20255         if (last_jiffies_update.tv64 == 0)
20256                 last_jiffies_update = tick_next_period;
20257         period = last_jiffies_update;
20258 -       write_sequnlock(&jiffies_lock);
20259 +       write_seqcount_end(&jiffies_seq);
20260 +       raw_spin_unlock(&jiffies_lock);
20261         return period;
20262  }
20263  
20264 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
20265  
20266  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
20267         .func = nohz_full_kick_func,
20268 +       .flags = IRQ_WORK_HARD_IRQ,
20269  };
20270  
20271  /*
20272 @@ -673,10 +679,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
20273  
20274         /* Read jiffies and the time when jiffies were updated last */
20275         do {
20276 -               seq = read_seqbegin(&jiffies_lock);
20277 +               seq = read_seqcount_begin(&jiffies_seq);
20278                 basemono = last_jiffies_update.tv64;
20279                 basejiff = jiffies;
20280 -       } while (read_seqretry(&jiffies_lock, seq));
20281 +       } while (read_seqcount_retry(&jiffies_seq, seq));
20282         ts->last_jiffies = basejiff;
20283  
20284         if (rcu_needs_cpu(basemono, &next_rcu) ||
20285 @@ -877,14 +883,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
20286                 return false;
20287  
20288         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
20289 -               static int ratelimit;
20290 -
20291 -               if (ratelimit < 10 &&
20292 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
20293 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
20294 -                               (unsigned int) local_softirq_pending());
20295 -                       ratelimit++;
20296 -               }
20297 +               softirq_check_pending_idle();
20298                 return false;
20299         }
20300  
20301 @@ -1193,6 +1192,7 @@ void tick_setup_sched_timer(void)
20302          * Emulate tick processing via per-CPU hrtimers:
20303          */
20304         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
20305 +       ts->sched_timer.irqsafe = 1;
20306         ts->sched_timer.function = tick_sched_timer;
20307  
20308         /* Get the next period (per-CPU) */
20309 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
20310 index 46e312e9be38..fa75cf5d9253 100644
20311 --- a/kernel/time/timekeeping.c
20312 +++ b/kernel/time/timekeeping.c
20313 @@ -2328,8 +2328,10 @@ EXPORT_SYMBOL(hardpps);
20314   */
20315  void xtime_update(unsigned long ticks)
20316  {
20317 -       write_seqlock(&jiffies_lock);
20318 +       raw_spin_lock(&jiffies_lock);
20319 +       write_seqcount_begin(&jiffies_seq);
20320         do_timer(ticks);
20321 -       write_sequnlock(&jiffies_lock);
20322 +       write_seqcount_end(&jiffies_seq);
20323 +       raw_spin_unlock(&jiffies_lock);
20324         update_wall_time();
20325  }
20326 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
20327 index 704f595ce83f..763a3e5121ff 100644
20328 --- a/kernel/time/timekeeping.h
20329 +++ b/kernel/time/timekeeping.h
20330 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
20331  extern void do_timer(unsigned long ticks);
20332  extern void update_wall_time(void);
20333  
20334 -extern seqlock_t jiffies_lock;
20335 +extern raw_spinlock_t jiffies_lock;
20336 +extern seqcount_t jiffies_seq;
20337  
20338  #define CS_NAME_LEN    32
20339  
20340 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
20341 index c611c47de884..cdff4411f8f6 100644
20342 --- a/kernel/time/timer.c
20343 +++ b/kernel/time/timer.c
20344 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
20345  #endif
20346  
20347  struct timer_base {
20348 -       spinlock_t              lock;
20349 +       raw_spinlock_t          lock;
20350         struct timer_list       *running_timer;
20351 +#ifdef CONFIG_PREEMPT_RT_FULL
20352 +       struct swait_queue_head wait_for_running_timer;
20353 +#endif
20354         unsigned long           clk;
20355         unsigned long           next_expiry;
20356         unsigned int            cpu;
20357 @@ -203,6 +206,8 @@ struct timer_base {
20358         bool                    is_idle;
20359         DECLARE_BITMAP(pending_map, WHEEL_SIZE);
20360         struct hlist_head       vectors[WHEEL_SIZE];
20361 +       struct hlist_head       expired_lists[LVL_DEPTH];
20362 +       int                     expired_count;
20363  } ____cacheline_aligned;
20364  
20365  static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
20366 @@ -948,10 +953,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
20367  
20368                 if (!(tf & TIMER_MIGRATING)) {
20369                         base = get_timer_base(tf);
20370 -                       spin_lock_irqsave(&base->lock, *flags);
20371 +                       raw_spin_lock_irqsave(&base->lock, *flags);
20372                         if (timer->flags == tf)
20373                                 return base;
20374 -                       spin_unlock_irqrestore(&base->lock, *flags);
20375 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
20376                 }
20377                 cpu_relax();
20378         }
20379 @@ -1023,9 +1028,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
20380                         /* See the comment in lock_timer_base() */
20381                         timer->flags |= TIMER_MIGRATING;
20382  
20383 -                       spin_unlock(&base->lock);
20384 +                       raw_spin_unlock(&base->lock);
20385                         base = new_base;
20386 -                       spin_lock(&base->lock);
20387 +                       raw_spin_lock(&base->lock);
20388                         WRITE_ONCE(timer->flags,
20389                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
20390                 }
20391 @@ -1050,7 +1055,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
20392         }
20393  
20394  out_unlock:
20395 -       spin_unlock_irqrestore(&base->lock, flags);
20396 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20397  
20398         return ret;
20399  }
20400 @@ -1144,19 +1149,46 @@ void add_timer_on(struct timer_list *timer, int cpu)
20401         if (base != new_base) {
20402                 timer->flags |= TIMER_MIGRATING;
20403  
20404 -               spin_unlock(&base->lock);
20405 +               raw_spin_unlock(&base->lock);
20406                 base = new_base;
20407 -               spin_lock(&base->lock);
20408 +               raw_spin_lock(&base->lock);
20409                 WRITE_ONCE(timer->flags,
20410                            (timer->flags & ~TIMER_BASEMASK) | cpu);
20411         }
20412  
20413         debug_activate(timer, timer->expires);
20414         internal_add_timer(base, timer);
20415 -       spin_unlock_irqrestore(&base->lock, flags);
20416 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20417  }
20418  EXPORT_SYMBOL_GPL(add_timer_on);
20419  
20420 +#ifdef CONFIG_PREEMPT_RT_FULL
20421 +/*
20422 + * Wait for a running timer
20423 + */
20424 +static void wait_for_running_timer(struct timer_list *timer)
20425 +{
20426 +       struct timer_base *base;
20427 +       u32 tf = timer->flags;
20428 +
20429 +       if (tf & TIMER_MIGRATING)
20430 +               return;
20431 +
20432 +       base = get_timer_base(tf);
20433 +       swait_event(base->wait_for_running_timer,
20434 +                  base->running_timer != timer);
20435 +}
20436 +
20437 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
20438 +#else
20439 +static inline void wait_for_running_timer(struct timer_list *timer)
20440 +{
20441 +       cpu_relax();
20442 +}
20443 +
20444 +# define wakeup_timer_waiters(b)       do { } while (0)
20445 +#endif
20446 +
20447  /**
20448   * del_timer - deactivate a timer.
20449   * @timer: the timer to be deactivated
20450 @@ -1180,7 +1212,7 @@ int del_timer(struct timer_list *timer)
20451         if (timer_pending(timer)) {
20452                 base = lock_timer_base(timer, &flags);
20453                 ret = detach_if_pending(timer, base, true);
20454 -               spin_unlock_irqrestore(&base->lock, flags);
20455 +               raw_spin_unlock_irqrestore(&base->lock, flags);
20456         }
20457  
20458         return ret;
20459 @@ -1208,13 +1240,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
20460                 timer_stats_timer_clear_start_info(timer);
20461                 ret = detach_if_pending(timer, base, true);
20462         }
20463 -       spin_unlock_irqrestore(&base->lock, flags);
20464 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20465  
20466         return ret;
20467  }
20468  EXPORT_SYMBOL(try_to_del_timer_sync);
20469  
20470 -#ifdef CONFIG_SMP
20471 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
20472  /**
20473   * del_timer_sync - deactivate a timer and wait for the handler to finish.
20474   * @timer: the timer to be deactivated
20475 @@ -1274,7 +1306,7 @@ int del_timer_sync(struct timer_list *timer)
20476                 int ret = try_to_del_timer_sync(timer);
20477                 if (ret >= 0)
20478                         return ret;
20479 -               cpu_relax();
20480 +               wait_for_running_timer(timer);
20481         }
20482  }
20483  EXPORT_SYMBOL(del_timer_sync);
20484 @@ -1323,7 +1355,8 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
20485         }
20486  }
20487  
20488 -static void expire_timers(struct timer_base *base, struct hlist_head *head)
20489 +static inline void __expire_timers(struct timer_base *base,
20490 +                                  struct hlist_head *head)
20491  {
20492         while (!hlist_empty(head)) {
20493                 struct timer_list *timer;
20494 @@ -1339,33 +1372,53 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
20495                 fn = timer->function;
20496                 data = timer->data;
20497  
20498 -               if (timer->flags & TIMER_IRQSAFE) {
20499 -                       spin_unlock(&base->lock);
20500 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
20501 +                   timer->flags & TIMER_IRQSAFE) {
20502 +                       raw_spin_unlock(&base->lock);
20503                         call_timer_fn(timer, fn, data);
20504 -                       spin_lock(&base->lock);
20505 +                       base->running_timer = NULL;
20506 +                       raw_spin_lock(&base->lock);
20507                 } else {
20508 -                       spin_unlock_irq(&base->lock);
20509 +                       raw_spin_unlock_irq(&base->lock);
20510                         call_timer_fn(timer, fn, data);
20511 -                       spin_lock_irq(&base->lock);
20512 +                       base->running_timer = NULL;
20513 +                       raw_spin_lock_irq(&base->lock);
20514                 }
20515         }
20516  }
20517  
20518 -static int __collect_expired_timers(struct timer_base *base,
20519 -                                   struct hlist_head *heads)
20520 +static void expire_timers(struct timer_base *base)
20521 +{
20522 +       struct hlist_head *head;
20523 +
20524 +       while (base->expired_count--) {
20525 +               head = base->expired_lists + base->expired_count;
20526 +               __expire_timers(base, head);
20527 +       }
20528 +       base->expired_count = 0;
20529 +}
20530 +
20531 +static void __collect_expired_timers(struct timer_base *base)
20532  {
20533         unsigned long clk = base->clk;
20534         struct hlist_head *vec;
20535 -       int i, levels = 0;
20536 +       int i;
20537         unsigned int idx;
20538  
20539 +       /*
20540 +        * expire_timers() must be called at least once before we can
20541 +        * collect more timers
20542 +        */
20543 +       if (WARN_ON(base->expired_count))
20544 +               return;
20545 +
20546         for (i = 0; i < LVL_DEPTH; i++) {
20547                 idx = (clk & LVL_MASK) + i * LVL_SIZE;
20548  
20549                 if (__test_and_clear_bit(idx, base->pending_map)) {
20550                         vec = base->vectors + idx;
20551 -                       hlist_move_list(vec, heads++);
20552 -                       levels++;
20553 +                       hlist_move_list(vec,
20554 +                               &base->expired_lists[base->expired_count++]);
20555                 }
20556                 /* Is it time to look at the next level? */
20557                 if (clk & LVL_CLK_MASK)
20558 @@ -1373,7 +1426,6 @@ static int __collect_expired_timers(struct timer_base *base,
20559                 /* Shift clock for the next level granularity */
20560                 clk >>= LVL_CLK_SHIFT;
20561         }
20562 -       return levels;
20563  }
20564  
20565  #ifdef CONFIG_NO_HZ_COMMON
20566 @@ -1515,7 +1567,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
20567         if (cpu_is_offline(smp_processor_id()))
20568                 return expires;
20569  
20570 -       spin_lock(&base->lock);
20571 +       raw_spin_lock(&base->lock);
20572         nextevt = __next_timer_interrupt(base);
20573         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
20574         base->next_expiry = nextevt;
20575 @@ -1543,7 +1595,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
20576                 if ((expires - basem) > TICK_NSEC)
20577                         base->is_idle = true;
20578         }
20579 -       spin_unlock(&base->lock);
20580 +       raw_spin_unlock(&base->lock);
20581  
20582         return cmp_next_hrtimer_event(basem, expires);
20583  }
20584 @@ -1566,8 +1618,7 @@ void timer_clear_idle(void)
20585         base->is_idle = false;
20586  }
20587  
20588 -static int collect_expired_timers(struct timer_base *base,
20589 -                                 struct hlist_head *heads)
20590 +static void collect_expired_timers(struct timer_base *base)
20591  {
20592         /*
20593          * NOHZ optimization. After a long idle sleep we need to forward the
20594 @@ -1584,20 +1635,49 @@ static int collect_expired_timers(struct timer_base *base,
20595                 if (time_after(next, jiffies)) {
20596                         /* The call site will increment clock! */
20597                         base->clk = jiffies - 1;
20598 -                       return 0;
20599 +                       return;
20600                 }
20601                 base->clk = next;
20602         }
20603 -       return __collect_expired_timers(base, heads);
20604 +       __collect_expired_timers(base);
20605  }
20606  #else
20607 -static inline int collect_expired_timers(struct timer_base *base,
20608 -                                        struct hlist_head *heads)
20609 +static inline void collect_expired_timers(struct timer_base *base)
20610  {
20611 -       return __collect_expired_timers(base, heads);
20612 +       __collect_expired_timers(base);
20613  }
20614  #endif
20615  
20616 +static int find_expired_timers(struct timer_base *base)
20617 +{
20618 +       const unsigned long int end_clk = jiffies;
20619 +
20620 +       while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
20621 +               collect_expired_timers(base);
20622 +               base->clk++;
20623 +       }
20624 +
20625 +       return base->expired_count;
20626 +}
20627 +
20628 +/* Called from CPU tick routine to quickly collect expired timers */
20629 +static int tick_find_expired(struct timer_base *base)
20630 +{
20631 +       int count;
20632 +
20633 +       raw_spin_lock(&base->lock);
20634 +
20635 +       if (unlikely(time_after(jiffies, base->clk + HZ))) {
20636 +               /* defer to ktimersoftd; don't spend too long in irq context */
20637 +               count = -1;
20638 +       } else
20639 +               count = find_expired_timers(base);
20640 +
20641 +       raw_spin_unlock(&base->lock);
20642 +
20643 +       return count;
20644 +}
20645 +
20646  /*
20647   * Called from the timer interrupt handler to charge one tick to the current
20648   * process.  user_tick is 1 if the tick is user time, 0 for system.
20649 @@ -1608,13 +1688,13 @@ void update_process_times(int user_tick)
20650  
20651         /* Note: this timer irq context must be accounted for as well. */
20652         account_process_tick(p, user_tick);
20653 +       scheduler_tick();
20654         run_local_timers();
20655         rcu_check_callbacks(user_tick);
20656 -#ifdef CONFIG_IRQ_WORK
20657 +#if defined(CONFIG_IRQ_WORK)
20658         if (in_irq())
20659                 irq_work_tick();
20660  #endif
20661 -       scheduler_tick();
20662         run_posix_cpu_timers(p);
20663  }
20664  
20665 @@ -1624,24 +1704,13 @@ void update_process_times(int user_tick)
20666   */
20667  static inline void __run_timers(struct timer_base *base)
20668  {
20669 -       struct hlist_head heads[LVL_DEPTH];
20670 -       int levels;
20671 +       raw_spin_lock_irq(&base->lock);
20672  
20673 -       if (!time_after_eq(jiffies, base->clk))
20674 -               return;
20675 +       while (find_expired_timers(base))
20676 +               expire_timers(base);
20677  
20678 -       spin_lock_irq(&base->lock);
20679 -
20680 -       while (time_after_eq(jiffies, base->clk)) {
20681 -
20682 -               levels = collect_expired_timers(base, heads);
20683 -               base->clk++;
20684 -
20685 -               while (levels--)
20686 -                       expire_timers(base, heads + levels);
20687 -       }
20688 -       base->running_timer = NULL;
20689 -       spin_unlock_irq(&base->lock);
20690 +       raw_spin_unlock_irq(&base->lock);
20691 +       wakeup_timer_waiters(base);
20692  }
20693  
20694  /*
20695 @@ -1651,6 +1720,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
20696  {
20697         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
20698  
20699 +       irq_work_tick_soft();
20700 +
20701         __run_timers(base);
20702         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
20703                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
20704 @@ -1665,12 +1736,12 @@ void run_local_timers(void)
20705  
20706         hrtimer_run_queues();
20707         /* Raise the softirq only if required. */
20708 -       if (time_before(jiffies, base->clk)) {
20709 +       if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
20710                 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
20711                         return;
20712                 /* CPU is awake, so check the deferrable base. */
20713                 base++;
20714 -               if (time_before(jiffies, base->clk))
20715 +               if (time_before(jiffies, base->clk) || !tick_find_expired(base))
20716                         return;
20717         }
20718         raise_softirq(TIMER_SOFTIRQ);
20719 @@ -1836,16 +1907,17 @@ int timers_dead_cpu(unsigned int cpu)
20720                  * The caller is globally serialized and nobody else
20721                  * takes two locks at once, deadlock is not possible.
20722                  */
20723 -               spin_lock_irq(&new_base->lock);
20724 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20725 +               raw_spin_lock_irq(&new_base->lock);
20726 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20727  
20728                 BUG_ON(old_base->running_timer);
20729 +               BUG_ON(old_base->expired_count);
20730  
20731                 for (i = 0; i < WHEEL_SIZE; i++)
20732                         migrate_timer_list(new_base, old_base->vectors + i);
20733  
20734 -               spin_unlock(&old_base->lock);
20735 -               spin_unlock_irq(&new_base->lock);
20736 +               raw_spin_unlock(&old_base->lock);
20737 +               raw_spin_unlock_irq(&new_base->lock);
20738                 put_cpu_ptr(&timer_bases);
20739         }
20740         return 0;
20741 @@ -1861,8 +1933,12 @@ static void __init init_timer_cpu(int cpu)
20742         for (i = 0; i < NR_BASES; i++) {
20743                 base = per_cpu_ptr(&timer_bases[i], cpu);
20744                 base->cpu = cpu;
20745 -               spin_lock_init(&base->lock);
20746 +               raw_spin_lock_init(&base->lock);
20747                 base->clk = jiffies;
20748 +#ifdef CONFIG_PREEMPT_RT_FULL
20749 +               init_swait_queue_head(&base->wait_for_running_timer);
20750 +#endif
20751 +               base->expired_count = 0;
20752         }
20753  }
20754  
20755 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
20756 index 2a96b063d659..812e37237eb8 100644
20757 --- a/kernel/trace/Kconfig
20758 +++ b/kernel/trace/Kconfig
20759 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
20760           enabled. This option and the preempt-off timing option can be
20761           used together or separately.)
20762  
20763 +config INTERRUPT_OFF_HIST
20764 +       bool "Interrupts-off Latency Histogram"
20765 +       depends on IRQSOFF_TRACER
20766 +       help
20767 +         This option generates continuously updated histograms (one per cpu)
20768 +         of the duration of time periods with interrupts disabled. The
20769 +         histograms are disabled by default. To enable them, write a non-zero
20770 +         number to
20771 +
20772 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20773 +
20774 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
20775 +         per cpu) are generated that accumulate the duration of time periods
20776 +         when both interrupts and preemption are disabled. The histogram data
20777 +         will be located in the debug file system at
20778 +
20779 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
20780 +
20781  config PREEMPT_TRACER
20782         bool "Preemption-off Latency Tracer"
20783         default n
20784 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
20785           enabled. This option and the irqs-off timing option can be
20786           used together or separately.)
20787  
20788 +config PREEMPT_OFF_HIST
20789 +       bool "Preemption-off Latency Histogram"
20790 +       depends on PREEMPT_TRACER
20791 +       help
20792 +         This option generates continuously updated histograms (one per cpu)
20793 +         of the duration of time periods with preemption disabled. The
20794 +         histograms are disabled by default. To enable them, write a non-zero
20795 +         number to
20796 +
20797 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20798 +
20799 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
20800 +         per cpu) are generated that accumulate the duration of time periods
20801 +         when both interrupts and preemption are disabled. The histogram data
20802 +         will be located in the debug file system at
20803 +
20804 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
20805 +
20806  config SCHED_TRACER
20807         bool "Scheduling Latency Tracer"
20808         select GENERIC_TRACER
20809 @@ -251,6 +287,74 @@ config HWLAT_TRACER
20810          file. Every time a latency is greater than tracing_thresh, it will
20811          be recorded into the ring buffer.
20812  
20813 +config WAKEUP_LATENCY_HIST
20814 +       bool "Scheduling Latency Histogram"
20815 +       depends on SCHED_TRACER
20816 +       help
20817 +         This option generates continuously updated histograms (one per cpu)
20818 +         of the scheduling latency of the highest priority task.
20819 +         The histograms are disabled by default. To enable them, write a
20820 +         non-zero number to
20821 +
20822 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
20823 +
20824 +         Two different algorithms are used, one to determine the latency of
20825 +         processes that exclusively use the highest priority of the system and
20826 +         another one to determine the latency of processes that share the
20827 +         highest system priority with other processes. The former is used to
20828 +         improve hardware and system software, the latter to optimize the
20829 +         priority design of a given system. The histogram data will be
20830 +         located in the debug file system at
20831 +
20832 +             /sys/kernel/debug/tracing/latency_hist/wakeup
20833 +
20834 +         and
20835 +
20836 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
20837 +
20838 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20839 +         Histogram are selected, additional histogram data will be collected
20840 +         that contain, in addition to the wakeup latency, the timer latency, in
20841 +         case the wakeup was triggered by an expired timer. These histograms
20842 +         are available in the
20843 +
20844 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20845 +
20846 +         directory. They reflect the apparent interrupt and scheduling latency
20847 +         and are best suited to determine the worst-case latency of a given
20848 +         system. To enable these histograms, write a non-zero number to
20849 +
20850 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20851 +
20852 +config MISSED_TIMER_OFFSETS_HIST
20853 +       depends on HIGH_RES_TIMERS
20854 +       select GENERIC_TRACER
20855 +       bool "Missed Timer Offsets Histogram"
20856 +       help
20857 +         Generate a histogram of missed timer offsets in microseconds. The
20858 +         histograms are disabled by default. To enable them, write a non-zero
20859 +         number to
20860 +
20861 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
20862 +
20863 +         The histogram data will be located in the debug file system at
20864 +
20865 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
20866 +
20867 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20868 +         Histogram are selected, additional histogram data will be collected
20869 +         that contain, in addition to the wakeup latency, the timer latency, in
20870 +         case the wakeup was triggered by an expired timer. These histograms
20871 +         are available in the
20872 +
20873 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20874 +
20875 +         directory. They reflect the apparent interrupt and scheduling latency
20876 +         and are best suited to determine the worst-case latency of a given
20877 +         system. To enable these histograms, write a non-zero number to
20878 +
20879 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20880 +
20881  config ENABLE_DEFAULT_TRACERS
20882         bool "Trace process context switches and events"
20883         depends on !GENERIC_TRACER
20884 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
20885 index e57980845549..83af000b783c 100644
20886 --- a/kernel/trace/Makefile
20887 +++ b/kernel/trace/Makefile
20888 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20889  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
20890  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
20891  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
20892 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
20893 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
20894 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
20895 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
20896  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
20897  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
20898  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
20899 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
20900 new file mode 100644
20901 index 000000000000..7f6ee70dea41
20902 --- /dev/null
20903 +++ b/kernel/trace/latency_hist.c
20904 @@ -0,0 +1,1178 @@
20905 +/*
20906 + * kernel/trace/latency_hist.c
20907 + *
20908 + * Add support for histograms of preemption-off latency,
20909 + * interrupt-off latency and wakeup latency; it depends on
20910 + * Real-Time Preemption Support.
20911 + *
20912 + *  Copyright (C) 2005 MontaVista Software, Inc.
20913 + *  Yi Yang <yyang@ch.mvista.com>
20914 + *
20915 + *  Converted to work with the new latency tracer.
20916 + *  Copyright (C) 2008 Red Hat, Inc.
20917 + *    Steven Rostedt <srostedt@redhat.com>
20918 + *
20919 + */
20920 +#include <linux/module.h>
20921 +#include <linux/debugfs.h>
20922 +#include <linux/seq_file.h>
20923 +#include <linux/percpu.h>
20924 +#include <linux/kallsyms.h>
20925 +#include <linux/uaccess.h>
20926 +#include <linux/sched.h>
20927 +#include <linux/sched/rt.h>
20928 +#include <linux/slab.h>
20929 +#include <linux/atomic.h>
20930 +#include <asm/div64.h>
20931 +
20932 +#include "trace.h"
20933 +#include <trace/events/sched.h>
20934 +
20935 +#define NSECS_PER_USECS 1000L
20936 +
20937 +#define CREATE_TRACE_POINTS
20938 +#include <trace/events/hist.h>
20939 +
20940 +enum {
20941 +       IRQSOFF_LATENCY = 0,
20942 +       PREEMPTOFF_LATENCY,
20943 +       PREEMPTIRQSOFF_LATENCY,
20944 +       WAKEUP_LATENCY,
20945 +       WAKEUP_LATENCY_SHAREDPRIO,
20946 +       MISSED_TIMER_OFFSETS,
20947 +       TIMERANDWAKEUP_LATENCY,
20948 +       MAX_LATENCY_TYPE,
20949 +};
20950 +
20951 +#define MAX_ENTRY_NUM 10240
20952 +
20953 +struct hist_data {
20954 +       atomic_t hist_mode; /* 0 don't log, 1 log */
20955 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
20956 +       long min_lat;
20957 +       long max_lat;
20958 +       unsigned long long below_hist_bound_samples;
20959 +       unsigned long long above_hist_bound_samples;
20960 +       long long accumulate_lat;
20961 +       unsigned long long total_samples;
20962 +       unsigned long long hist_array[MAX_ENTRY_NUM];
20963 +};
20964 +
20965 +struct enable_data {
20966 +       int latency_type;
20967 +       int enabled;
20968 +};
20969 +
20970 +static char *latency_hist_dir_root = "latency_hist";
20971 +
20972 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20973 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
20974 +static char *irqsoff_hist_dir = "irqsoff";
20975 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
20976 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
20977 +#endif
20978 +
20979 +#ifdef CONFIG_PREEMPT_OFF_HIST
20980 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
20981 +static char *preemptoff_hist_dir = "preemptoff";
20982 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
20983 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
20984 +#endif
20985 +
20986 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
20987 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
20988 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
20989 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
20990 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
20991 +#endif
20992 +
20993 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
20994 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
20995 +static struct enable_data preemptirqsoff_enabled_data = {
20996 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
20997 +       .enabled = 0,
20998 +};
20999 +#endif
21000 +
21001 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21002 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21003 +struct maxlatproc_data {
21004 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
21005 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
21006 +       int pid;
21007 +       int current_pid;
21008 +       int prio;
21009 +       int current_prio;
21010 +       long latency;
21011 +       long timeroffset;
21012 +       cycle_t timestamp;
21013 +};
21014 +#endif
21015 +
21016 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21017 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
21018 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
21019 +static char *wakeup_latency_hist_dir = "wakeup";
21020 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
21021 +static notrace void probe_wakeup_latency_hist_start(void *v,
21022 +       struct task_struct *p);
21023 +static notrace void probe_wakeup_latency_hist_stop(void *v,
21024 +       bool preempt, struct task_struct *prev, struct task_struct *next);
21025 +static notrace void probe_sched_migrate_task(void *,
21026 +       struct task_struct *task, int cpu);
21027 +static struct enable_data wakeup_latency_enabled_data = {
21028 +       .latency_type = WAKEUP_LATENCY,
21029 +       .enabled = 0,
21030 +};
21031 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
21032 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
21033 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
21034 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
21035 +static unsigned long wakeup_pid;
21036 +#endif
21037 +
21038 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21039 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
21040 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
21041 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
21042 +       long long offset, struct task_struct *curr, struct task_struct *task);
21043 +static struct enable_data missed_timer_offsets_enabled_data = {
21044 +       .latency_type = MISSED_TIMER_OFFSETS,
21045 +       .enabled = 0,
21046 +};
21047 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
21048 +static unsigned long missed_timer_offsets_pid;
21049 +#endif
21050 +
21051 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21052 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21053 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
21054 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
21055 +static struct enable_data timerandwakeup_enabled_data = {
21056 +       .latency_type = TIMERANDWAKEUP_LATENCY,
21057 +       .enabled = 0,
21058 +};
21059 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
21060 +#endif
21061 +
21062 +void notrace latency_hist(int latency_type, int cpu, long latency,
21063 +                         long timeroffset, cycle_t stop,
21064 +                         struct task_struct *p)
21065 +{
21066 +       struct hist_data *my_hist;
21067 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21068 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21069 +       struct maxlatproc_data *mp = NULL;
21070 +#endif
21071 +
21072 +       if (!cpu_possible(cpu) || latency_type < 0 ||
21073 +           latency_type >= MAX_LATENCY_TYPE)
21074 +               return;
21075 +
21076 +       switch (latency_type) {
21077 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21078 +       case IRQSOFF_LATENCY:
21079 +               my_hist = &per_cpu(irqsoff_hist, cpu);
21080 +               break;
21081 +#endif
21082 +#ifdef CONFIG_PREEMPT_OFF_HIST
21083 +       case PREEMPTOFF_LATENCY:
21084 +               my_hist = &per_cpu(preemptoff_hist, cpu);
21085 +               break;
21086 +#endif
21087 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
21088 +       case PREEMPTIRQSOFF_LATENCY:
21089 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
21090 +               break;
21091 +#endif
21092 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21093 +       case WAKEUP_LATENCY:
21094 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
21095 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
21096 +               break;
21097 +       case WAKEUP_LATENCY_SHAREDPRIO:
21098 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
21099 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
21100 +               break;
21101 +#endif
21102 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21103 +       case MISSED_TIMER_OFFSETS:
21104 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
21105 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
21106 +               break;
21107 +#endif
21108 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21109 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21110 +       case TIMERANDWAKEUP_LATENCY:
21111 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
21112 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
21113 +               break;
21114 +#endif
21115 +
21116 +       default:
21117 +               return;
21118 +       }
21119 +
21120 +       latency += my_hist->offset;
21121 +
21122 +       if (atomic_read(&my_hist->hist_mode) == 0)
21123 +               return;
21124 +
21125 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
21126 +               if (latency < 0)
21127 +                       my_hist->below_hist_bound_samples++;
21128 +               else
21129 +                       my_hist->above_hist_bound_samples++;
21130 +       } else
21131 +               my_hist->hist_array[latency]++;
21132 +
21133 +       if (unlikely(latency > my_hist->max_lat ||
21134 +           my_hist->min_lat == LONG_MAX)) {
21135 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21136 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21137 +               if (latency_type == WAKEUP_LATENCY ||
21138 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
21139 +                   latency_type == MISSED_TIMER_OFFSETS ||
21140 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
21141 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
21142 +                       strncpy(mp->current_comm, current->comm,
21143 +                           sizeof(mp->current_comm));
21144 +                       mp->pid = task_pid_nr(p);
21145 +                       mp->current_pid = task_pid_nr(current);
21146 +                       mp->prio = p->prio;
21147 +                       mp->current_prio = current->prio;
21148 +                       mp->latency = latency;
21149 +                       mp->timeroffset = timeroffset;
21150 +                       mp->timestamp = stop;
21151 +               }
21152 +#endif
21153 +               my_hist->max_lat = latency;
21154 +       }
21155 +       if (unlikely(latency < my_hist->min_lat))
21156 +               my_hist->min_lat = latency;
21157 +       my_hist->total_samples++;
21158 +       my_hist->accumulate_lat += latency;
21159 +}
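
The bucketing above is one counter per microsecond: the offset-adjusted latency indexes hist_array directly, samples outside 0..MAX_ENTRY_NUM-1 go to the below/above bound counters, and min/max/accumulate feed the summary that l_start() prints later. A minimal userspace sketch of the same scheme, assuming a MAX_ENTRY_NUM of 10240 and plain C types (both assumptions, not taken from this hunk):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ENTRY_NUM 10240     /* assumed; the kernel constant is defined earlier in this file */

struct hist {
        uint64_t bucket[MAX_ENTRY_NUM];
        uint64_t below, above, samples;
        long min, max;
        long long sum;
};

static void hist_add(struct hist *h, long latency_us)
{
        if (latency_us < 0)
                h->below++;                     /* below the histogram range */
        else if (latency_us >= MAX_ENTRY_NUM)
                h->above++;                     /* above the histogram range */
        else
                h->bucket[latency_us]++;        /* one counter per microsecond */

        if (latency_us > h->max || h->min == LONG_MAX)
                h->max = latency_us;
        if (latency_us < h->min)
                h->min = latency_us;
        h->samples++;
        h->sum += latency_us;
}

int main(void)
{
        struct hist h = { .min = LONG_MAX, .max = LONG_MIN };

        hist_add(&h, 12);
        hist_add(&h, 3);
        hist_add(&h, -1);       /* out-of-range sample, counted in 'below' */
        printf("samples=%llu min=%ld max=%ld below=%llu\n",
               (unsigned long long)h.samples, h.min, h.max,
               (unsigned long long)h.below);
        return 0;
}

As in hist_reset() further down, min starts at LONG_MAX and max at LONG_MIN so the first sample initializes both.
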
21160 +
21161 +static void *l_start(struct seq_file *m, loff_t *pos)
21162 +{
21163 +       loff_t *index_ptr = NULL;
21164 +       loff_t index = *pos;
21165 +       struct hist_data *my_hist = m->private;
21166 +
21167 +       if (index == 0) {
21168 +               char minstr[32], avgstr[32], maxstr[32];
21169 +
21170 +               atomic_dec(&my_hist->hist_mode);
21171 +
21172 +               if (likely(my_hist->total_samples)) {
21173 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
21174 +                           my_hist->total_samples);
21175 +                       snprintf(minstr, sizeof(minstr), "%ld",
21176 +                           my_hist->min_lat - my_hist->offset);
21177 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
21178 +                           avg - my_hist->offset);
21179 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
21180 +                           my_hist->max_lat - my_hist->offset);
21181 +               } else {
21182 +                       strcpy(minstr, "<undef>");
21183 +                       strcpy(avgstr, minstr);
21184 +                       strcpy(maxstr, minstr);
21185 +               }
21186 +
21187 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
21188 +                          "#Average latency: %s microseconds\n"
21189 +                          "#Maximum latency: %s microseconds\n"
21190 +                          "#Total samples: %llu\n"
21191 +                          "#There are %llu samples lower than %ld"
21192 +                          " microseconds.\n"
21193 +                          "#There are %llu samples greater or equal"
21194 +                          " than %ld microseconds.\n"
21195 +                          "#usecs\t%16s\n",
21196 +                          minstr, avgstr, maxstr,
21197 +                          my_hist->total_samples,
21198 +                          my_hist->below_hist_bound_samples,
21199 +                          -my_hist->offset,
21200 +                          my_hist->above_hist_bound_samples,
21201 +                          MAX_ENTRY_NUM - my_hist->offset,
21202 +                          "samples");
21203 +       }
21204 +       if (index < MAX_ENTRY_NUM) {
21205 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
21206 +               if (index_ptr)
21207 +                       *index_ptr = index;
21208 +       }
21209 +
21210 +       return index_ptr;
21211 +}
21212 +
21213 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
21214 +{
21215 +       loff_t *index_ptr = p;
21216 +       struct hist_data *my_hist = m->private;
21217 +
21218 +       if (++*pos >= MAX_ENTRY_NUM) {
21219 +               atomic_inc(&my_hist->hist_mode);
21220 +               return NULL;
21221 +       }
21222 +       *index_ptr = *pos;
21223 +       return index_ptr;
21224 +}
21225 +
21226 +static void l_stop(struct seq_file *m, void *p)
21227 +{
21228 +       kfree(p);
21229 +}
21230 +
21231 +static int l_show(struct seq_file *m, void *p)
21232 +{
21233 +       int index = *(loff_t *) p;
21234 +       struct hist_data *my_hist = m->private;
21235 +
21236 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
21237 +           my_hist->hist_array[index]);
21238 +       return 0;
21239 +}
21240 +
21241 +static const struct seq_operations latency_hist_seq_op = {
21242 +       .start = l_start,
21243 +       .next  = l_next,
21244 +       .stop  = l_stop,
21245 +       .show  = l_show
21246 +};
21247 +
21248 +static int latency_hist_open(struct inode *inode, struct file *file)
21249 +{
21250 +       int ret;
21251 +
21252 +       ret = seq_open(file, &latency_hist_seq_op);
21253 +       if (!ret) {
21254 +               struct seq_file *seq = file->private_data;
21255 +               seq->private = inode->i_private;
21256 +       }
21257 +       return ret;
21258 +}
21259 +
21260 +static const struct file_operations latency_hist_fops = {
21261 +       .open = latency_hist_open,
21262 +       .read = seq_read,
21263 +       .llseek = seq_lseek,
21264 +       .release = seq_release,
21265 +};
21266 +
21267 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21268 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21269 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
21270 +{
21271 +       mp->comm[0] = mp->current_comm[0] = '\0';
21272 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
21273 +           mp->latency = mp->timeroffset = -1;
21274 +       mp->timestamp = 0;
21275 +}
21276 +#endif
21277 +
21278 +static void hist_reset(struct hist_data *hist)
21279 +{
21280 +       atomic_dec(&hist->hist_mode);
21281 +
21282 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
21283 +       hist->below_hist_bound_samples = 0ULL;
21284 +       hist->above_hist_bound_samples = 0ULL;
21285 +       hist->min_lat = LONG_MAX;
21286 +       hist->max_lat = LONG_MIN;
21287 +       hist->total_samples = 0ULL;
21288 +       hist->accumulate_lat = 0LL;
21289 +
21290 +       atomic_inc(&hist->hist_mode);
21291 +}
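
hist_mode acts as a gate: l_start() and hist_reset() decrement it to zero so latency_hist() stops accumulating while a histogram is read or cleared, and increment it again when done. A standalone sketch of that gate using C11 atomics, purely for illustration:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int hist_mode = 1;        /* 1 = accumulate, 0 = paused */

static void record_sample(void)
{
        if (atomic_load(&hist_mode) == 0)
                return;                 /* histogram paused, drop the sample */
        puts("sample recorded");
}

int main(void)
{
        atomic_fetch_sub(&hist_mode, 1);        /* pause, as hist_reset() does */
        record_sample();                        /* dropped */
        atomic_fetch_add(&hist_mode, 1);        /* resume */
        record_sample();                        /* recorded */
        return 0;
}
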
21292 +
21293 +static ssize_t
21294 +latency_hist_reset(struct file *file, const char __user *a,
21295 +                  size_t size, loff_t *off)
21296 +{
21297 +       int cpu;
21298 +       struct hist_data *hist = NULL;
21299 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21300 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21301 +       struct maxlatproc_data *mp = NULL;
21302 +#endif
21303 +       off_t latency_type = (off_t) file->private_data;
21304 +
21305 +       for_each_online_cpu(cpu) {
21306 +
21307 +               switch (latency_type) {
21308 +#ifdef CONFIG_PREEMPT_OFF_HIST
21309 +               case PREEMPTOFF_LATENCY:
21310 +                       hist = &per_cpu(preemptoff_hist, cpu);
21311 +                       break;
21312 +#endif
21313 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21314 +               case IRQSOFF_LATENCY:
21315 +                       hist = &per_cpu(irqsoff_hist, cpu);
21316 +                       break;
21317 +#endif
21318 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21319 +               case PREEMPTIRQSOFF_LATENCY:
21320 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
21321 +                       break;
21322 +#endif
21323 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21324 +               case WAKEUP_LATENCY:
21325 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
21326 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
21327 +                       break;
21328 +               case WAKEUP_LATENCY_SHAREDPRIO:
21329 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
21330 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
21331 +                       break;
21332 +#endif
21333 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21334 +               case MISSED_TIMER_OFFSETS:
21335 +                       hist = &per_cpu(missed_timer_offsets, cpu);
21336 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
21337 +                       break;
21338 +#endif
21339 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21340 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21341 +               case TIMERANDWAKEUP_LATENCY:
21342 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
21343 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
21344 +                       break;
21345 +#endif
21346 +               }
21347 +
21348 +               hist_reset(hist);
21349 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21350 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21351 +               if (latency_type == WAKEUP_LATENCY ||
21352 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
21353 +                   latency_type == MISSED_TIMER_OFFSETS ||
21354 +                   latency_type == TIMERANDWAKEUP_LATENCY)
21355 +                       clear_maxlatprocdata(mp);
21356 +#endif
21357 +       }
21358 +
21359 +       return size;
21360 +}
21361 +
21362 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21363 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21364 +static ssize_t
21365 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21366 +{
21367 +       char buf[64];
21368 +       int r;
21369 +       unsigned long *this_pid = file->private_data;
21370 +
21371 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
21372 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21373 +}
21374 +
21375 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
21376 +                     size_t cnt, loff_t *ppos)
21377 +{
21378 +       char buf[64];
21379 +       unsigned long pid;
21380 +       unsigned long *this_pid = file->private_data;
21381 +
21382 +       if (cnt >= sizeof(buf))
21383 +               return -EINVAL;
21384 +
21385 +       if (copy_from_user(&buf, ubuf, cnt))
21386 +               return -EFAULT;
21387 +
21388 +       buf[cnt] = '\0';
21389 +
21390 +       if (kstrtoul(buf, 10, &pid))
21391 +               return -EINVAL;
21392 +
21393 +       *this_pid = pid;
21394 +
21395 +       return cnt;
21396 +}
21397 +#endif
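
show_pid()/do_pid() back the per-histogram "pid" debugfs files: writing a PID restricts the wakeup and missed-timer histograms to that task, and writing 0 removes the filter (the probes only check the filter when it is non-zero). A hypothetical userspace helper; the debugfs path is an assumption based on latency_hist_init() below, not something this hunk states:

#include <stdio.h>

/* Write a PID filter into one of the "pid" files created by
 * latency_hist_init(); mount point and directory name are assumed. */
static int set_hist_pid(const char *path, long pid)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%ld\n", pid);
        return fclose(f);
}

int main(void)
{
        /* hypothetical location, e.g. under the tracing debugfs directory */
        return set_hist_pid("/sys/kernel/debug/tracing/latency_hist/wakeup/pid",
                            1234) ? 1 : 0;
}
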
21398 +
21399 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21400 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21401 +static ssize_t
21402 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21403 +{
21404 +       int r;
21405 +       struct maxlatproc_data *mp = file->private_data;
21406 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
21407 +       unsigned long long t;
21408 +       unsigned long usecs, secs;
21409 +       char *buf;
21410 +
21411 +       if (mp->pid == -1 || mp->current_pid == -1) {
21412 +               buf = "(none)\n";
21413 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
21414 +                   strlen(buf));
21415 +       }
21416 +
21417 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
21418 +       if (buf == NULL)
21419 +               return -ENOMEM;
21420 +
21421 +       t = ns2usecs(mp->timestamp);
21422 +       usecs = do_div(t, USEC_PER_SEC);
21423 +       secs = (unsigned long) t;
21424 +       r = snprintf(buf, strmaxlen,
21425 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
21426 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
21427 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
21428 +           secs, usecs);
21429 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21430 +       kfree(buf);
21431 +       return r;
21432 +}
21433 +#endif
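
show_maxlatproc() emits a single line describing the worst sample so far: the woken task's PID, its priority (inverted via MAX_RT_PRIO-1 so real-time priorities read the usual way), the latency and timer offset in microseconds and its comm, then the same PID/priority/comm triple for the task that was running at the time, and a seconds.microseconds timestamp. A hypothetical parser for that line, reading from stdin for illustration:

#include <stdio.h>

int main(void)
{
        char comm[17], current_comm[17];        /* TASK_COMM_LEN is 16 */
        int pid, prio, current_pid, current_prio;
        long latency, timeroffset;
        unsigned long secs, usecs;
        char line[256];

        if (!fgets(line, sizeof(line), stdin))
                return 1;
        /* "pid prio latency (timeroffset) comm <- cur_pid cur_prio cur_comm sec.usec" */
        if (sscanf(line, "%d %d %ld (%ld) %16s <- %d %d %16s %lu.%lu",
                   &pid, &prio, &latency, &timeroffset, comm,
                   &current_pid, &current_prio, current_comm,
                   &secs, &usecs) != 10)
                return 1;
        printf("task %s (pid %d) saw %ld us\n", comm, pid, latency);
        return 0;
}
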
21434 +
21435 +static ssize_t
21436 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21437 +{
21438 +       char buf[64];
21439 +       struct enable_data *ed = file->private_data;
21440 +       int r;
21441 +
21442 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
21443 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21444 +}
21445 +
21446 +static ssize_t
21447 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
21448 +{
21449 +       char buf[64];
21450 +       long enable;
21451 +       struct enable_data *ed = file->private_data;
21452 +
21453 +       if (cnt >= sizeof(buf))
21454 +               return -EINVAL;
21455 +
21456 +       if (copy_from_user(&buf, ubuf, cnt))
21457 +               return -EFAULT;
21458 +
21459 +       buf[cnt] = 0;
21460 +
21461 +       if (kstrtoul(buf, 10, &enable))
21462 +               return -EINVAL;
21463 +
21464 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
21465 +               return cnt;
21466 +
21467 +       if (enable) {
21468 +               int ret;
21469 +
21470 +               switch (ed->latency_type) {
21471 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21472 +               case PREEMPTIRQSOFF_LATENCY:
21473 +                       ret = register_trace_preemptirqsoff_hist(
21474 +                           probe_preemptirqsoff_hist, NULL);
21475 +                       if (ret) {
21476 +                               pr_info("preemptirqsoff trace: Couldn't assign "
21477 +                                   "probe_preemptirqsoff_hist "
21478 +                                   "to trace_preemptirqsoff_hist\n");
21479 +                               return ret;
21480 +                       }
21481 +                       break;
21482 +#endif
21483 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21484 +               case WAKEUP_LATENCY:
21485 +                       ret = register_trace_sched_wakeup(
21486 +                           probe_wakeup_latency_hist_start, NULL);
21487 +                       if (ret) {
21488 +                               pr_info("wakeup trace: Couldn't assign "
21489 +                                   "probe_wakeup_latency_hist_start "
21490 +                                   "to trace_sched_wakeup\n");
21491 +                               return ret;
21492 +                       }
21493 +                       ret = register_trace_sched_wakeup_new(
21494 +                           probe_wakeup_latency_hist_start, NULL);
21495 +                       if (ret) {
21496 +                               pr_info("wakeup trace: Couldn't assign "
21497 +                                   "probe_wakeup_latency_hist_start "
21498 +                                   "to trace_sched_wakeup_new\n");
21499 +                               unregister_trace_sched_wakeup(
21500 +                                   probe_wakeup_latency_hist_start, NULL);
21501 +                               return ret;
21502 +                       }
21503 +                       ret = register_trace_sched_switch(
21504 +                           probe_wakeup_latency_hist_stop, NULL);
21505 +                       if (ret) {
21506 +                               pr_info("wakeup trace: Couldn't assign "
21507 +                                   "probe_wakeup_latency_hist_stop "
21508 +                                   "to trace_sched_switch\n");
21509 +                               unregister_trace_sched_wakeup(
21510 +                                   probe_wakeup_latency_hist_start, NULL);
21511 +                               unregister_trace_sched_wakeup_new(
21512 +                                   probe_wakeup_latency_hist_start, NULL);
21513 +                               return ret;
21514 +                       }
21515 +                       ret = register_trace_sched_migrate_task(
21516 +                           probe_sched_migrate_task, NULL);
21517 +                       if (ret) {
21518 +                               pr_info("wakeup trace: Couldn't assign "
21519 +                                   "probe_sched_migrate_task "
21520 +                                   "to trace_sched_migrate_task\n");
21521 +                               unregister_trace_sched_wakeup(
21522 +                                   probe_wakeup_latency_hist_start, NULL);
21523 +                               unregister_trace_sched_wakeup_new(
21524 +                                   probe_wakeup_latency_hist_start, NULL);
21525 +                               unregister_trace_sched_switch(
21526 +                                   probe_wakeup_latency_hist_stop, NULL);
21527 +                               return ret;
21528 +                       }
21529 +                       break;
21530 +#endif
21531 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21532 +               case MISSED_TIMER_OFFSETS:
21533 +                       ret = register_trace_hrtimer_interrupt(
21534 +                           probe_hrtimer_interrupt, NULL);
21535 +                       if (ret) {
21536 +                               pr_info("missed timer trace: Couldn't assign "
21537 +                                   "probe_hrtimer_interrupt "
21538 +                                   "to trace_hrtimer_interrupt\n");
21539 +                               return ret;
21540 +                       }
21541 +                       break;
21542 +#endif
21543 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21544 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21545 +               case TIMERANDWAKEUP_LATENCY:
21546 +                       if (!wakeup_latency_enabled_data.enabled ||
21547 +                           !missed_timer_offsets_enabled_data.enabled)
21548 +                               return -EINVAL;
21549 +                       break;
21550 +#endif
21551 +               default:
21552 +                       break;
21553 +               }
21554 +       } else {
21555 +               switch (ed->latency_type) {
21556 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21557 +               case PREEMPTIRQSOFF_LATENCY:
21558 +                       {
21559 +                               int cpu;
21560 +
21561 +                               unregister_trace_preemptirqsoff_hist(
21562 +                                   probe_preemptirqsoff_hist, NULL);
21563 +                               for_each_online_cpu(cpu) {
21564 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21565 +                                       per_cpu(hist_irqsoff_counting,
21566 +                                           cpu) = 0;
21567 +#endif
21568 +#ifdef CONFIG_PREEMPT_OFF_HIST
21569 +                                       per_cpu(hist_preemptoff_counting,
21570 +                                           cpu) = 0;
21571 +#endif
21572 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21573 +                                       per_cpu(hist_preemptirqsoff_counting,
21574 +                                           cpu) = 0;
21575 +#endif
21576 +                               }
21577 +                       }
21578 +                       break;
21579 +#endif
21580 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21581 +               case WAKEUP_LATENCY:
21582 +                       {
21583 +                               int cpu;
21584 +
21585 +                               unregister_trace_sched_wakeup(
21586 +                                   probe_wakeup_latency_hist_start, NULL);
21587 +                               unregister_trace_sched_wakeup_new(
21588 +                                   probe_wakeup_latency_hist_start, NULL);
21589 +                               unregister_trace_sched_switch(
21590 +                                   probe_wakeup_latency_hist_stop, NULL);
21591 +                               unregister_trace_sched_migrate_task(
21592 +                                   probe_sched_migrate_task, NULL);
21593 +
21594 +                               for_each_online_cpu(cpu) {
21595 +                                       per_cpu(wakeup_task, cpu) = NULL;
21596 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
21597 +                               }
21598 +                       }
21599 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21600 +                       timerandwakeup_enabled_data.enabled = 0;
21601 +#endif
21602 +                       break;
21603 +#endif
21604 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21605 +               case MISSED_TIMER_OFFSETS:
21606 +                       unregister_trace_hrtimer_interrupt(
21607 +                           probe_hrtimer_interrupt, NULL);
21608 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21609 +                       timerandwakeup_enabled_data.enabled = 0;
21610 +#endif
21611 +                       break;
21612 +#endif
21613 +               default:
21614 +                       break;
21615 +               }
21616 +       }
21617 +       ed->enabled = enable;
21618 +       return cnt;
21619 +}
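
do_enable() registers the tracepoint probes for a latency type when "1" is written to its enable file and unregisters them on "0"; the WAKEUP_LATENCY branch shows the usual unwind-on-error pattern, where every probe that was already registered is unregistered again if a later registration fails. A self-contained toy version of that pattern; step_a/step_b/undo_a are placeholders, not kernel interfaces:

#include <stdio.h>

/* Stand-ins for the register_trace_* calls used above. */
static int step_a(void) { return 0; }
static int step_b(void) { return -1; }  /* pretend this registration fails */
static void undo_a(void) { puts("rolled back step_a"); }

static int enable_all(void)
{
        int ret = step_a();

        if (ret)
                return ret;

        ret = step_b();
        if (ret) {
                undo_a();               /* unwind what already succeeded */
                return ret;
        }
        return 0;
}

int main(void)
{
        return enable_all() ? 1 : 0;
}
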
21620 +
21621 +static const struct file_operations latency_hist_reset_fops = {
21622 +       .open = tracing_open_generic,
21623 +       .write = latency_hist_reset,
21624 +};
21625 +
21626 +static const struct file_operations enable_fops = {
21627 +       .open = tracing_open_generic,
21628 +       .read = show_enable,
21629 +       .write = do_enable,
21630 +};
21631 +
21632 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21633 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21634 +static const struct file_operations pid_fops = {
21635 +       .open = tracing_open_generic,
21636 +       .read = show_pid,
21637 +       .write = do_pid,
21638 +};
21639 +
21640 +static const struct file_operations maxlatproc_fops = {
21641 +       .open = tracing_open_generic,
21642 +       .read = show_maxlatproc,
21643 +};
21644 +#endif
21645 +
21646 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21647 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
21648 +       int starthist)
21649 +{
21650 +       int cpu = raw_smp_processor_id();
21651 +       int time_set = 0;
21652 +
21653 +       if (starthist) {
21654 +               cycle_t uninitialized_var(start);
21655 +
21656 +               if (!preempt_count() && !irqs_disabled())
21657 +                       return;
21658 +
21659 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21660 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
21661 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
21662 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
21663 +                       start = ftrace_now(cpu);
21664 +                       time_set++;
21665 +                       per_cpu(hist_irqsoff_start, cpu) = start;
21666 +               }
21667 +#endif
21668 +
21669 +#ifdef CONFIG_PREEMPT_OFF_HIST
21670 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
21671 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
21672 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
21673 +                       if (!(time_set++))
21674 +                               start = ftrace_now(cpu);
21675 +                       per_cpu(hist_preemptoff_start, cpu) = start;
21676 +               }
21677 +#endif
21678 +
21679 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21680 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
21681 +                   per_cpu(hist_preemptoff_counting, cpu) &&
21682 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
21683 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
21684 +                       if (!time_set)
21685 +                               start = ftrace_now(cpu);
21686 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
21687 +               }
21688 +#endif
21689 +       } else {
21690 +               cycle_t uninitialized_var(stop);
21691 +
21692 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21693 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
21694 +                   per_cpu(hist_irqsoff_counting, cpu)) {
21695 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
21696 +
21697 +                       stop = ftrace_now(cpu);
21698 +                       time_set++;
21699 +                       if (start) {
21700 +                               long latency = ((long) (stop - start)) /
21701 +                                   NSECS_PER_USECS;
21702 +
21703 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
21704 +                                   stop, NULL);
21705 +                       }
21706 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
21707 +               }
21708 +#endif
21709 +
21710 +#ifdef CONFIG_PREEMPT_OFF_HIST
21711 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
21712 +                   per_cpu(hist_preemptoff_counting, cpu)) {
21713 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
21714 +
21715 +                       if (!(time_set++))
21716 +                               stop = ftrace_now(cpu);
21717 +                       if (start) {
21718 +                               long latency = ((long) (stop - start)) /
21719 +                                   NSECS_PER_USECS;
21720 +
21721 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
21722 +                                   0, stop, NULL);
21723 +                       }
21724 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
21725 +               }
21726 +#endif
21727 +
21728 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21729 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
21730 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
21731 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
21732 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
21733 +
21734 +                       if (!time_set)
21735 +                               stop = ftrace_now(cpu);
21736 +                       if (start) {
21737 +                               long latency = ((long) (stop - start)) /
21738 +                                   NSECS_PER_USECS;
21739 +
21740 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
21741 +                                   latency, 0, stop, NULL);
21742 +                       }
21743 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
21744 +               }
21745 +#endif
21746 +       }
21747 +}
21748 +#endif
21749 +
21750 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21751 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
21752 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
21753 +       int cpu)
21754 +{
21755 +       int old_cpu = task_cpu(task);
21756 +
21757 +       if (cpu != old_cpu) {
21758 +               unsigned long flags;
21759 +               struct task_struct *cpu_wakeup_task;
21760 +
21761 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
21762 +
21763 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
21764 +               if (task == cpu_wakeup_task) {
21765 +                       put_task_struct(cpu_wakeup_task);
21766 +                       per_cpu(wakeup_task, old_cpu) = NULL;
21767 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
21768 +                       get_task_struct(cpu_wakeup_task);
21769 +               }
21770 +
21771 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21772 +       }
21773 +}
21774 +
21775 +static notrace void probe_wakeup_latency_hist_start(void *v,
21776 +       struct task_struct *p)
21777 +{
21778 +       unsigned long flags;
21779 +       struct task_struct *curr = current;
21780 +       int cpu = task_cpu(p);
21781 +       struct task_struct *cpu_wakeup_task;
21782 +
21783 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21784 +
21785 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21786 +
21787 +       if (wakeup_pid) {
21788 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21789 +                   p->prio == curr->prio)
21790 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21791 +               if (likely(wakeup_pid != task_pid_nr(p)))
21792 +                       goto out;
21793 +       } else {
21794 +               if (likely(!rt_task(p)) ||
21795 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
21796 +                   p->prio > curr->prio)
21797 +                       goto out;
21798 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21799 +                   p->prio == curr->prio)
21800 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21801 +       }
21802 +
21803 +       if (cpu_wakeup_task)
21804 +               put_task_struct(cpu_wakeup_task);
21805 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
21806 +       get_task_struct(cpu_wakeup_task);
21807 +       cpu_wakeup_task->preempt_timestamp_hist =
21808 +               ftrace_now(raw_smp_processor_id());
21809 +out:
21810 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21811 +}
21812 +
21813 +static notrace void probe_wakeup_latency_hist_stop(void *v,
21814 +       bool preempt, struct task_struct *prev, struct task_struct *next)
21815 +{
21816 +       unsigned long flags;
21817 +       int cpu = task_cpu(next);
21818 +       long latency;
21819 +       cycle_t stop;
21820 +       struct task_struct *cpu_wakeup_task;
21821 +
21822 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21823 +
21824 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21825 +
21826 +       if (cpu_wakeup_task == NULL)
21827 +               goto out;
21828 +
21829 +       /* Already running? */
21830 +       if (unlikely(current == cpu_wakeup_task))
21831 +               goto out_reset;
21832 +
21833 +       if (next != cpu_wakeup_task) {
21834 +               if (next->prio < cpu_wakeup_task->prio)
21835 +                       goto out_reset;
21836 +
21837 +               if (next->prio == cpu_wakeup_task->prio)
21838 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21839 +
21840 +               goto out;
21841 +       }
21842 +
21843 +       if (current->prio == cpu_wakeup_task->prio)
21844 +               per_cpu(wakeup_sharedprio, cpu) = 1;
21845 +
21846 +       /*
21847 +        * The task we are waiting for is about to be switched to.
21848 +        * Calculate latency and store it in histogram.
21849 +        */
21850 +       stop = ftrace_now(raw_smp_processor_id());
21851 +
21852 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
21853 +           NSECS_PER_USECS;
21854 +
21855 +       if (per_cpu(wakeup_sharedprio, cpu)) {
21856 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
21857 +                   next);
21858 +               per_cpu(wakeup_sharedprio, cpu) = 0;
21859 +       } else {
21860 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
21861 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21862 +               if (timerandwakeup_enabled_data.enabled) {
21863 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
21864 +                           next->timer_offset + latency, next->timer_offset,
21865 +                           stop, next);
21866 +               }
21867 +#endif
21868 +       }
21869 +
21870 +out_reset:
21871 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21872 +       next->timer_offset = 0;
21873 +#endif
21874 +       put_task_struct(cpu_wakeup_task);
21875 +       per_cpu(wakeup_task, cpu) = NULL;
21876 +out:
21877 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21878 +}
21879 +#endif
21880 +
21881 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21882 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
21883 +       long long latency_ns, struct task_struct *curr,
21884 +       struct task_struct *task)
21885 +{
21886 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
21887 +           (task->prio < curr->prio ||
21888 +           (task->prio == curr->prio &&
21889 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
21890 +               long latency;
21891 +               cycle_t now;
21892 +
21893 +               if (missed_timer_offsets_pid) {
21894 +                       if (likely(missed_timer_offsets_pid !=
21895 +                           task_pid_nr(task)))
21896 +                               return;
21897 +               }
21898 +
21899 +               now = ftrace_now(cpu);
21900 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
21901 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
21902 +                   task);
21903 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21904 +               task->timer_offset = latency;
21905 +#endif
21906 +       }
21907 +}
21908 +#endif
21909 +
21910 +static __init int latency_hist_init(void)
21911 +{
21912 +       struct dentry *latency_hist_root = NULL;
21913 +       struct dentry *dentry;
21914 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21915 +       struct dentry *dentry_sharedprio;
21916 +#endif
21917 +       struct dentry *entry;
21918 +       struct dentry *enable_root;
21919 +       int i = 0;
21920 +       struct hist_data *my_hist;
21921 +       char name[64];
21922 +       char *cpufmt = "CPU%d";
21923 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21924 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21925 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
21926 +       struct maxlatproc_data *mp = NULL;
21927 +#endif
21928 +
21929 +       dentry = tracing_init_dentry();
21930 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
21931 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
21932 +
21933 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21934 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
21935 +       for_each_possible_cpu(i) {
21936 +               sprintf(name, cpufmt, i);
21937 +               entry = debugfs_create_file(name, 0444, dentry,
21938 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
21939 +               my_hist = &per_cpu(irqsoff_hist, i);
21940 +               atomic_set(&my_hist->hist_mode, 1);
21941 +               my_hist->min_lat = LONG_MAX;
21942 +       }
21943 +       entry = debugfs_create_file("reset", 0644, dentry,
21944 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
21945 +#endif
21946 +
21947 +#ifdef CONFIG_PREEMPT_OFF_HIST
21948 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
21949 +           latency_hist_root);
21950 +       for_each_possible_cpu(i) {
21951 +               sprintf(name, cpufmt, i);
21952 +               entry = debugfs_create_file(name, 0444, dentry,
21953 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
21954 +               my_hist = &per_cpu(preemptoff_hist, i);
21955 +               atomic_set(&my_hist->hist_mode, 1);
21956 +               my_hist->min_lat = LONG_MAX;
21957 +       }
21958 +       entry = debugfs_create_file("reset", 0644, dentry,
21959 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
21960 +#endif
21961 +
21962 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21963 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
21964 +           latency_hist_root);
21965 +       for_each_possible_cpu(i) {
21966 +               sprintf(name, cpufmt, i);
21967 +               entry = debugfs_create_file(name, 0444, dentry,
21968 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
21969 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
21970 +               atomic_set(&my_hist->hist_mode, 1);
21971 +               my_hist->min_lat = LONG_MAX;
21972 +       }
21973 +       entry = debugfs_create_file("reset", 0644, dentry,
21974 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
21975 +#endif
21976 +
21977 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21978 +       entry = debugfs_create_file("preemptirqsoff", 0644,
21979 +           enable_root, (void *)&preemptirqsoff_enabled_data,
21980 +           &enable_fops);
21981 +#endif
21982 +
21983 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21984 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
21985 +           latency_hist_root);
21986 +       dentry_sharedprio = debugfs_create_dir(
21987 +           wakeup_latency_hist_dir_sharedprio, dentry);
21988 +       for_each_possible_cpu(i) {
21989 +               sprintf(name, cpufmt, i);
21990 +
21991 +               entry = debugfs_create_file(name, 0444, dentry,
21992 +                   &per_cpu(wakeup_latency_hist, i),
21993 +                   &latency_hist_fops);
21994 +               my_hist = &per_cpu(wakeup_latency_hist, i);
21995 +               atomic_set(&my_hist->hist_mode, 1);
21996 +               my_hist->min_lat = LONG_MAX;
21997 +
21998 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
21999 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
22000 +                   &latency_hist_fops);
22001 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
22002 +               atomic_set(&my_hist->hist_mode, 1);
22003 +               my_hist->min_lat = LONG_MAX;
22004 +
22005 +               sprintf(name, cpufmt_maxlatproc, i);
22006 +
22007 +               mp = &per_cpu(wakeup_maxlatproc, i);
22008 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22009 +                   &maxlatproc_fops);
22010 +               clear_maxlatprocdata(mp);
22011 +
22012 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
22013 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
22014 +                   &maxlatproc_fops);
22015 +               clear_maxlatprocdata(mp);
22016 +       }
22017 +       entry = debugfs_create_file("pid", 0644, dentry,
22018 +           (void *)&wakeup_pid, &pid_fops);
22019 +       entry = debugfs_create_file("reset", 0644, dentry,
22020 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
22021 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
22022 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
22023 +       entry = debugfs_create_file("wakeup", 0644,
22024 +           enable_root, (void *)&wakeup_latency_enabled_data,
22025 +           &enable_fops);
22026 +#endif
22027 +
22028 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22029 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
22030 +           latency_hist_root);
22031 +       for_each_possible_cpu(i) {
22032 +               sprintf(name, cpufmt, i);
22033 +               entry = debugfs_create_file(name, 0444, dentry,
22034 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
22035 +               my_hist = &per_cpu(missed_timer_offsets, i);
22036 +               atomic_set(&my_hist->hist_mode, 1);
22037 +               my_hist->min_lat = LONG_MAX;
22038 +
22039 +               sprintf(name, cpufmt_maxlatproc, i);
22040 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
22041 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22042 +                   &maxlatproc_fops);
22043 +               clear_maxlatprocdata(mp);
22044 +       }
22045 +       entry = debugfs_create_file("pid", 0644, dentry,
22046 +           (void *)&missed_timer_offsets_pid, &pid_fops);
22047 +       entry = debugfs_create_file("reset", 0644, dentry,
22048 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
22049 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
22050 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
22051 +           &enable_fops);
22052 +#endif
22053 +
22054 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22055 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22056 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
22057 +           latency_hist_root);
22058 +       for_each_possible_cpu(i) {
22059 +               sprintf(name, cpufmt, i);
22060 +               entry = debugfs_create_file(name, 0444, dentry,
22061 +                   &per_cpu(timerandwakeup_latency_hist, i),
22062 +                   &latency_hist_fops);
22063 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
22064 +               atomic_set(&my_hist->hist_mode, 1);
22065 +               my_hist->min_lat = LONG_MAX;
22066 +
22067 +               sprintf(name, cpufmt_maxlatproc, i);
22068 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
22069 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22070 +                   &maxlatproc_fops);
22071 +               clear_maxlatprocdata(mp);
22072 +       }
22073 +       entry = debugfs_create_file("reset", 0644, dentry,
22074 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
22075 +       entry = debugfs_create_file("timerandwakeup", 0644,
22076 +           enable_root, (void *)&timerandwakeup_enabled_data,
22077 +           &enable_fops);
22078 +#endif
22079 +       return 0;
22080 +}
22081 +
22082 +device_initcall(latency_hist_init);
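
latency_hist_init() builds, per histogram type, a directory containing one CPUx file per possible CPU, a reset file, optionally pid and max_latency-CPUx files, and a switch under enable/. Each CPUx file is a seq_file whose header lines start with '#' and whose data lines are "usecs<TAB>samples" pairs (see l_show() above). A minimal userspace reader; the full debugfs path is an assumption, since latency_hist_dir_root is defined earlier in this file and not visible in this hunk:

#include <stdio.h>

int main(void)
{
        /* assumed path; tracing_init_dentry() normally lands under
         * /sys/kernel/debug/tracing */
        const char *path =
            "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                long usecs;
                unsigned long long samples;

                if (line[0] == '#')     /* summary header printed by l_start() */
                        continue;
                if (sscanf(line, "%ld %llu", &usecs, &samples) == 2 && samples)
                        printf("%ld us: %llu samples\n", usecs, samples);
        }
        fclose(f);
        return 0;
}
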
22083 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
22084 index 90b66ed6f0e2..7d9897e41ded 100644
22085 --- a/kernel/trace/trace.c
22086 +++ b/kernel/trace/trace.c
22087 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
22088         struct task_struct *tsk = current;
22089  
22090         entry->preempt_count            = pc & 0xff;
22091 +       entry->preempt_lazy_count       = preempt_lazy_count();
22092         entry->pid                      = (tsk) ? tsk->pid : 0;
22093         entry->flags =
22094  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
22095 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
22096                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
22097                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
22098                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
22099 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
22100 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
22101 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
22102                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
22103 +
22104 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
22105  }
22106  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
22107  
22108 @@ -2892,14 +2896,17 @@ get_total_entries(struct trace_buffer *buf,
22109  
22110  static void print_lat_help_header(struct seq_file *m)
22111  {
22112 -       seq_puts(m, "#                  _------=> CPU#            \n"
22113 -                   "#                 / _-----=> irqs-off        \n"
22114 -                   "#                | / _----=> need-resched    \n"
22115 -                   "#                || / _---=> hardirq/softirq \n"
22116 -                   "#                ||| / _--=> preempt-depth   \n"
22117 -                   "#                |||| /     delay            \n"
22118 -                   "#  cmd     pid   ||||| time  |   caller      \n"
22119 -                   "#     \\   /      |||||  \\    |   /         \n");
22120 +       seq_puts(m, "#                  _--------=> CPU#              \n"
22121 +                   "#                 / _-------=> irqs-off          \n"
22122 +                   "#                | / _------=> need-resched      \n"
22123 +                   "#                || / _-----=> need-resched_lazy \n"
22124 +                   "#                ||| / _----=> hardirq/softirq   \n"
22125 +                   "#                |||| / _---=> preempt-depth     \n"
22126 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
22127 +                   "#                |||||| / _-=> migrate-disable   \n"
22128 +                   "#                ||||||| /     delay             \n"
22129 +                   "# cmd     pid    |||||||| time   |  caller       \n"
22130 +                   "#     \\   /      ||||||||   \\    |  /            \n");
22131  }
22132  
22133  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
22134 @@ -2925,11 +2932,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
22135         print_event_info(buf, m);
22136         seq_puts(m, "#                              _-----=> irqs-off\n"
22137                     "#                             / _----=> need-resched\n"
22138 -                   "#                            | / _---=> hardirq/softirq\n"
22139 -                   "#                            || / _--=> preempt-depth\n"
22140 -                   "#                            ||| /     delay\n"
22141 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
22142 -                   "#              | |       |   ||||       |         |\n");
22143 +                   "#                            |/  _-----=> need-resched_lazy\n"
22144 +                   "#                            || / _---=> hardirq/softirq\n"
22145 +                   "#                            ||| / _--=> preempt-depth\n"
22146 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
22147 +                   "#                            ||||| / _-=> migrate-disable   \n"
22148 +                   "#                            |||||| /    delay\n"
22149 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
22150 +                   "#              | |       |   |||||||      |         |\n");
22151  }
22152  
22153  void
22154 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
22155 index fd24b1f9ac43..852b2c81be25 100644
22156 --- a/kernel/trace/trace.h
22157 +++ b/kernel/trace/trace.h
22158 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
22159   *  NEED_RESCHED       - reschedule is requested
22160   *  HARDIRQ            - inside an interrupt handler
22161   *  SOFTIRQ            - inside a softirq handler
22162 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
22163   */
22164  enum trace_flag_type {
22165         TRACE_FLAG_IRQS_OFF             = 0x01,
22166 @@ -133,6 +134,7 @@ enum trace_flag_type {
22167         TRACE_FLAG_SOFTIRQ              = 0x10,
22168         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
22169         TRACE_FLAG_NMI                  = 0x40,
22170 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
22171  };
22172  
22173  #define TRACE_BUF_SIZE         1024
22174 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
22175 index 03c0a48c3ac4..0b85d516b491 100644
22176 --- a/kernel/trace/trace_events.c
22177 +++ b/kernel/trace/trace_events.c
22178 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
22179         __common_field(unsigned char, flags);
22180         __common_field(unsigned char, preempt_count);
22181         __common_field(int, pid);
22182 +       __common_field(unsigned short, migrate_disable);
22183 +       __common_field(unsigned short, padding);
22184  
22185         return ret;
22186  }
22187 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
22188 index 03cdff84d026..940bd10b4406 100644
22189 --- a/kernel/trace/trace_irqsoff.c
22190 +++ b/kernel/trace/trace_irqsoff.c
22191 @@ -13,6 +13,7 @@
22192  #include <linux/uaccess.h>
22193  #include <linux/module.h>
22194  #include <linux/ftrace.h>
22195 +#include <trace/events/hist.h>
22196  
22197  #include "trace.h"
22198  
22199 @@ -424,11 +425,13 @@ void start_critical_timings(void)
22200  {
22201         if (preempt_trace() || irq_trace())
22202                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22203 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
22204  }
22205  EXPORT_SYMBOL_GPL(start_critical_timings);
22206  
22207  void stop_critical_timings(void)
22208  {
22209 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
22210         if (preempt_trace() || irq_trace())
22211                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22212  }
22213 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
22214  #ifdef CONFIG_PROVE_LOCKING
22215  void time_hardirqs_on(unsigned long a0, unsigned long a1)
22216  {
22217 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
22218         if (!preempt_trace() && irq_trace())
22219                 stop_critical_timing(a0, a1);
22220  }
22221 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
22222  {
22223         if (!preempt_trace() && irq_trace())
22224                 start_critical_timing(a0, a1);
22225 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
22226  }
22227  
22228  #else /* !CONFIG_PROVE_LOCKING */
22229 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
22230   */
22231  void trace_hardirqs_on(void)
22232  {
22233 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
22234         if (!preempt_trace() && irq_trace())
22235                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22236  }
22237 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
22238  {
22239         if (!preempt_trace() && irq_trace())
22240                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22241 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
22242  }
22243  EXPORT_SYMBOL(trace_hardirqs_off);
22244  
22245  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
22246  {
22247 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
22248         if (!preempt_trace() && irq_trace())
22249                 stop_critical_timing(CALLER_ADDR0, caller_addr);
22250  }
22251 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
22252  {
22253         if (!preempt_trace() && irq_trace())
22254                 start_critical_timing(CALLER_ADDR0, caller_addr);
22255 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
22256  }
22257  EXPORT_SYMBOL(trace_hardirqs_off_caller);
22258  
22259 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
22260  #ifdef CONFIG_PREEMPT_TRACER
22261  void trace_preempt_on(unsigned long a0, unsigned long a1)
22262  {
22263 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
22264         if (preempt_trace() && !irq_trace())
22265                 stop_critical_timing(a0, a1);
22266  }
22267  
22268  void trace_preempt_off(unsigned long a0, unsigned long a1)
22269  {
22270 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
22271         if (preempt_trace() && !irq_trace())
22272                 start_critical_timing(a0, a1);
22273  }
22274 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
22275 index 3fc20422c166..65a6dde71a7d 100644
22276 --- a/kernel/trace/trace_output.c
22277 +++ b/kernel/trace/trace_output.c
22278 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22279  {
22280         char hardsoft_irq;
22281         char need_resched;
22282 +       char need_resched_lazy;
22283         char irqs_off;
22284         int hardirq;
22285         int softirq;
22286 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22287                 break;
22288         }
22289  
22290 +       need_resched_lazy =
22291 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
22292 +
22293         hardsoft_irq =
22294                 (nmi && hardirq)     ? 'Z' :
22295                 nmi                  ? 'z' :
22296 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22297                 softirq              ? 's' :
22298                                        '.' ;
22299  
22300 -       trace_seq_printf(s, "%c%c%c",
22301 -                        irqs_off, need_resched, hardsoft_irq);
22302 +       trace_seq_printf(s, "%c%c%c%c",
22303 +                        irqs_off, need_resched, need_resched_lazy,
22304 +                        hardsoft_irq);
22305  
22306         if (entry->preempt_count)
22307                 trace_seq_printf(s, "%x", entry->preempt_count);
22308         else
22309                 trace_seq_putc(s, '.');
22310  
22311 +       if (entry->preempt_lazy_count)
22312 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
22313 +       else
22314 +               trace_seq_putc(s, '.');
22315 +
22316 +       if (entry->migrate_disable)
22317 +               trace_seq_printf(s, "%x", entry->migrate_disable);
22318 +       else
22319 +               trace_seq_putc(s, '.');
22320 +
22321         return !trace_seq_has_overflowed(s);
22322  }
22323  
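
With the trace_output.c hunk above, the latency annotation of each trace line grows from three flag characters plus the preempt count to four characters (irqs-off, need-resched, need-resched-lazy, hardirq/softirq) followed by the preempt depth, preempt-lazy depth and migrate-disable count, each printed in hex or as '.' when zero. A small sketch decoding just the new lazy-resched bit from the raw flags byte, using the bit value added to trace.h in the hunk above:

#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED_LAZY 0x80       /* from the trace.h change above */

static char lazy_resched_char(unsigned char flags)
{
        return (flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
}

int main(void)
{
        printf("%c %c\n", lazy_resched_char(0x80), lazy_resched_char(0x00));
        return 0;
}
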
22324 diff --git a/kernel/user.c b/kernel/user.c
22325 index b069ccbfb0b0..1a2e88e98b5e 100644
22326 --- a/kernel/user.c
22327 +++ b/kernel/user.c
22328 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
22329         if (!up)
22330                 return;
22331  
22332 -       local_irq_save(flags);
22333 +       local_irq_save_nort(flags);
22334         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
22335                 free_user(up, flags);
22336         else
22337 -               local_irq_restore(flags);
22338 +               local_irq_restore_nort(flags);
22339  }
22340  
22341  struct user_struct *alloc_uid(kuid_t uid)
22342 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
22343 index 6d1020c03d41..70c6a2f79f7e 100644
22344 --- a/kernel/watchdog.c
22345 +++ b/kernel/watchdog.c
22346 @@ -315,6 +315,8 @@ static int is_softlockup(unsigned long touch_ts)
22347  
22348  #ifdef CONFIG_HARDLOCKUP_DETECTOR
22349  
22350 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
22351 +
22352  static struct perf_event_attr wd_hw_attr = {
22353         .type           = PERF_TYPE_HARDWARE,
22354         .config         = PERF_COUNT_HW_CPU_CYCLES,
22355 @@ -348,6 +350,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
22356                 /* only print hardlockups once */
22357                 if (__this_cpu_read(hard_watchdog_warn) == true)
22358                         return;
22359 +               /*
22360 +                * If early-printk is enabled then make sure we do not
22361 +                * lock up in printk() and kill console logging:
22362 +                */
22363 +               printk_kill();
22364 +
22365 +               raw_spin_lock(&watchdog_output_lock);
22366  
22367                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
22368                 print_modules();
22369 @@ -365,6 +374,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
22370                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
22371                         trigger_allbutself_cpu_backtrace();
22372  
22373 +               raw_spin_unlock(&watchdog_output_lock);
22374                 if (hardlockup_panic)
22375                         nmi_panic(regs, "Hard LOCKUP");
22376  
22377 @@ -512,6 +522,7 @@ static void watchdog_enable(unsigned int cpu)
22378         /* kick off the timer for the hardlockup detector */
22379         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22380         hrtimer->function = watchdog_timer_fn;
22381 +       hrtimer->irqsafe = 1;
22382  
22383         /* Enable the perf event */
22384         watchdog_nmi_enable(cpu);
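Two separate RT concerns in this file: the hardlockup report runs from perf/NMI context, so its serialization lock has to be a raw_spinlock_t (a plain spinlock_t is a sleeping rtmutex on RT and cannot be taken there), and the watchdog hrtimer is flagged irqsafe so it still expires from hard interrupt context rather than from the softirq-driven hrtimer path. A minimal sketch of the raw-lock side, with hypothetical names:

    static DEFINE_RAW_SPINLOCK(example_report_lock);       /* hypothetical */

    static void report_from_nmi(void)
    {
            /*
             * raw_spin_lock() keeps the classic disable-preemption
             * semantics even on RT, so it is usable from NMI context
             * where a sleeping lock would be fatal.
             */
            raw_spin_lock(&example_report_lock);
            pr_emerg("serialized diagnostic output\n");
            raw_spin_unlock(&example_report_lock);
    }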
22385 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
22386 index 479d840db286..24eba6620a45 100644
22387 --- a/kernel/workqueue.c
22388 +++ b/kernel/workqueue.c
22389 @@ -48,6 +48,8 @@
22390  #include <linux/nodemask.h>
22391  #include <linux/moduleparam.h>
22392  #include <linux/uaccess.h>
22393 +#include <linux/locallock.h>
22394 +#include <linux/delay.h>
22395  
22396  #include "workqueue_internal.h"
22397  
22398 @@ -121,11 +123,16 @@ enum {
22399   *    cpu or grabbing pool->lock is enough for read access.  If
22400   *    POOL_DISASSOCIATED is set, it's identical to L.
22401   *
22402 + *    On RT we need the extra protection via rt_lock_idle_list() for
22403 + *    the list manipulations against read access from
22404 + *    wq_worker_sleeping(). All other places are nicely serialized via
22405 + *    pool->lock.
22406 + *
22407   * A: pool->attach_mutex protected.
22408   *
22409   * PL: wq_pool_mutex protected.
22410   *
22411 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
22412 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
22413   *
22414   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
22415   *
22416 @@ -134,7 +141,7 @@ enum {
22417   *
22418   * WQ: wq->mutex protected.
22419   *
22420 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
22421 + * WR: wq->mutex protected for writes.  RCU protected for reads.
22422   *
22423   * MD: wq_mayday_lock protected.
22424   */
22425 @@ -185,7 +192,7 @@ struct worker_pool {
22426         atomic_t                nr_running ____cacheline_aligned_in_smp;
22427  
22428         /*
22429 -        * Destruction of pool is sched-RCU protected to allow dereferences
22430 +        * Destruction of pool is RCU protected to allow dereferences
22431          * from get_work_pool().
22432          */
22433         struct rcu_head         rcu;
22434 @@ -214,7 +221,7 @@ struct pool_workqueue {
22435         /*
22436          * Release of unbound pwq is punted to system_wq.  See put_pwq()
22437          * and pwq_unbound_release_workfn() for details.  pool_workqueue
22438 -        * itself is also sched-RCU protected so that the first pwq can be
22439 +        * itself is also RCU protected so that the first pwq can be
22440          * determined without grabbing wq->mutex.
22441          */
22442         struct work_struct      unbound_release_work;
22443 @@ -348,6 +355,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
22444  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
22445  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
22446  
22447 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
22448 +
22449  static int worker_thread(void *__worker);
22450  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22451  
22452 @@ -355,20 +364,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22453  #include <trace/events/workqueue.h>
22454  
22455  #define assert_rcu_or_pool_mutex()                                     \
22456 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22457 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22458                          !lockdep_is_held(&wq_pool_mutex),              \
22459 -                        "sched RCU or wq_pool_mutex should be held")
22460 +                        "RCU or wq_pool_mutex should be held")
22461  
22462  #define assert_rcu_or_wq_mutex(wq)                                     \
22463 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22464 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22465                          !lockdep_is_held(&wq->mutex),                  \
22466 -                        "sched RCU or wq->mutex should be held")
22467 +                        "RCU or wq->mutex should be held")
22468  
22469  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
22470 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22471 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22472                          !lockdep_is_held(&wq->mutex) &&                \
22473                          !lockdep_is_held(&wq_pool_mutex),              \
22474 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
22475 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
22476  
22477  #define for_each_cpu_worker_pool(pool, cpu)                            \
22478         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
22479 @@ -380,7 +389,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22480   * @pool: iteration cursor
22481   * @pi: integer used for iteration
22482   *
22483 - * This must be called either with wq_pool_mutex held or sched RCU read
22484 + * This must be called either with wq_pool_mutex held or RCU read
22485   * locked.  If the pool needs to be used beyond the locking in effect, the
22486   * caller is responsible for guaranteeing that the pool stays online.
22487   *
22488 @@ -412,7 +421,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22489   * @pwq: iteration cursor
22490   * @wq: the target workqueue
22491   *
22492 - * This must be called either with wq->mutex held or sched RCU read locked.
22493 + * This must be called either with wq->mutex held or RCU read locked.
22494   * If the pwq needs to be used beyond the locking in effect, the caller is
22495   * responsible for guaranteeing that the pwq stays online.
22496   *
22497 @@ -424,6 +433,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22498                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
22499                 else
22500  
22501 +#ifdef CONFIG_PREEMPT_RT_BASE
22502 +static inline void rt_lock_idle_list(struct worker_pool *pool)
22503 +{
22504 +       preempt_disable();
22505 +}
22506 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
22507 +{
22508 +       preempt_enable();
22509 +}
22510 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
22511 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
22512 +#else
22513 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
22514 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
22515 +static inline void sched_lock_idle_list(struct worker_pool *pool)
22516 +{
22517 +       spin_lock_irq(&pool->lock);
22518 +}
22519 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
22520 +{
22521 +       spin_unlock_irq(&pool->lock);
22522 +}
22523 +#endif
22524 +
22525 +
22526  #ifdef CONFIG_DEBUG_OBJECTS_WORK
22527  
22528  static struct debug_obj_descr work_debug_descr;
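These four helpers implement the locking comment added near the top of the file: every idle_list manipulation still happens under pool->lock, but on RT it is additionally bracketed by rt_lock_idle_list() (a preempt-disabled section), which is what the wakeup path relies on when it peeks at the list without pool->lock from inside the schedule() hook; on !RT the helpers flip roles and sched_lock_idle_list() falls back to taking pool->lock. Condensed from the hunks further down (not new code):

    /*
     * Writer side (worker_enter_idle / worker_leave_idle / destroy_worker),
     * always called with pool->lock held:
     *
     *      rt_lock_idle_list(pool);
     *      list_add(&worker->entry, &pool->idle_list);     (or list_del_init)
     *      rt_unlock_idle_list(pool);
     *
     * Reader side (wake_up_worker), on RT possibly without pool->lock when
     * reached via wq_worker_sleeping():
     *
     *      rt_lock_idle_list(pool);
     *      worker = first_idle_worker(pool);
     *      if (worker)
     *              wake_up_process(worker->task);
     *      rt_unlock_idle_list(pool);
     */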
22529 @@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
22530   * @wq: the target workqueue
22531   * @node: the node ID
22532   *
22533 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
22534 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
22535   * read locked.
22536   * If the pwq needs to be used beyond the locking in effect, the caller is
22537   * responsible for guaranteeing that the pwq stays online.
22538 @@ -692,8 +726,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
22539   * @work: the work item of interest
22540   *
22541   * Pools are created and destroyed under wq_pool_mutex, and allows read
22542 - * access under sched-RCU read lock.  As such, this function should be
22543 - * called under wq_pool_mutex or with preemption disabled.
22544 + * access under RCU read lock.  As such, this function should be
22545 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
22546   *
22547   * All fields of the returned pool are accessible as long as the above
22548   * mentioned locking is in effect.  If the returned pool needs to be used
22549 @@ -830,50 +864,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
22550   */
22551  static void wake_up_worker(struct worker_pool *pool)
22552  {
22553 -       struct worker *worker = first_idle_worker(pool);
22554 +       struct worker *worker;
22555 +
22556 +       rt_lock_idle_list(pool);
22557 +
22558 +       worker = first_idle_worker(pool);
22559  
22560         if (likely(worker))
22561                 wake_up_process(worker->task);
22562 +
22563 +       rt_unlock_idle_list(pool);
22564  }
22565  
22566  /**
22567 - * wq_worker_waking_up - a worker is waking up
22568 + * wq_worker_running - a worker is running again
22569   * @task: task waking up
22570 - * @cpu: CPU @task is waking up to
22571   *
22572 - * This function is called during try_to_wake_up() when a worker is
22573 - * being awoken.
22574 - *
22575 - * CONTEXT:
22576 - * spin_lock_irq(rq->lock)
22577 + * This function is called when a worker returns from schedule()
22578   */
22579 -void wq_worker_waking_up(struct task_struct *task, int cpu)
22580 +void wq_worker_running(struct task_struct *task)
22581  {
22582         struct worker *worker = kthread_data(task);
22583  
22584 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
22585 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
22586 +       if (!worker->sleeping)
22587 +               return;
22588 +       if (!(worker->flags & WORKER_NOT_RUNNING))
22589                 atomic_inc(&worker->pool->nr_running);
22590 -       }
22591 +       worker->sleeping = 0;
22592  }
22593  
22594  /**
22595   * wq_worker_sleeping - a worker is going to sleep
22596   * @task: task going to sleep
22597   *
22598 - * This function is called during schedule() when a busy worker is
22599 - * going to sleep.  Worker on the same cpu can be woken up by
22600 - * returning pointer to its task.
22601 - *
22602 - * CONTEXT:
22603 - * spin_lock_irq(rq->lock)
22604 - *
22605 - * Return:
22606 - * Worker task on @cpu to wake up, %NULL if none.
22607 + * This function is called from schedule() when a busy worker is
22608 + * going to sleep.
22609   */
22610 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
22611 +void wq_worker_sleeping(struct task_struct *task)
22612  {
22613 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
22614 +       struct worker *worker = kthread_data(task);
22615         struct worker_pool *pool;
22616  
22617         /*
22618 @@ -882,29 +911,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
22619          * checking NOT_RUNNING.
22620          */
22621         if (worker->flags & WORKER_NOT_RUNNING)
22622 -               return NULL;
22623 +               return;
22624  
22625         pool = worker->pool;
22626  
22627 -       /* this can only happen on the local cpu */
22628 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
22629 -               return NULL;
22630 +       if (WARN_ON_ONCE(worker->sleeping))
22631 +               return;
22632 +
22633 +       worker->sleeping = 1;
22634  
22635         /*
22636          * The counterpart of the following dec_and_test, implied mb,
22637          * worklist not empty test sequence is in insert_work().
22638          * Please read comment there.
22639 -        *
22640 -        * NOT_RUNNING is clear.  This means that we're bound to and
22641 -        * running on the local cpu w/ rq lock held and preemption
22642 -        * disabled, which in turn means that none else could be
22643 -        * manipulating idle_list, so dereferencing idle_list without pool
22644 -        * lock is safe.
22645          */
22646         if (atomic_dec_and_test(&pool->nr_running) &&
22647 -           !list_empty(&pool->worklist))
22648 -               to_wakeup = first_idle_worker(pool);
22649 -       return to_wakeup ? to_wakeup->task : NULL;
22650 +           !list_empty(&pool->worklist)) {
22651 +               sched_lock_idle_list(pool);
22652 +               wake_up_worker(pool);
22653 +               sched_unlock_idle_list(pool);
22654 +       }
22655  }
22656  
22657  /**
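The old wq_worker_waking_up()/wq_worker_sleeping() pair was invoked from inside try_to_wake_up() and schedule() with rq->lock held, which is why it had to stay lock-free; the replacement pair runs in plain task context around __schedule(), so wq_worker_sleeping() is now allowed to take pool->lock (via sched_lock_idle_list() on !RT). The matching kernel/sched/core.c changes are not part of this excerpt; presumably the callers look roughly like this:

    /* Sketch of the assumed scheduler-side callers: */
    static inline void sched_submit_work(struct task_struct *tsk)
    {
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_sleeping(tsk);        /* before __schedule() */
    }

    static inline void sched_update_worker(struct task_struct *tsk)
    {
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_running(tsk);         /* after __schedule() */
    }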
22658 @@ -1098,12 +1124,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
22659  {
22660         if (pwq) {
22661                 /*
22662 -                * As both pwqs and pools are sched-RCU protected, the
22663 +                * As both pwqs and pools are RCU protected, the
22664                  * following lock operations are safe.
22665                  */
22666 -               spin_lock_irq(&pwq->pool->lock);
22667 +               rcu_read_lock();
22668 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
22669                 put_pwq(pwq);
22670 -               spin_unlock_irq(&pwq->pool->lock);
22671 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
22672 +               rcu_read_unlock();
22673         }
22674  }
22675  
22676 @@ -1207,7 +1235,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22677         struct worker_pool *pool;
22678         struct pool_workqueue *pwq;
22679  
22680 -       local_irq_save(*flags);
22681 +       local_lock_irqsave(pendingb_lock, *flags);
22682  
22683         /* try to steal the timer if it exists */
22684         if (is_dwork) {
22685 @@ -1226,6 +1254,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22686         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
22687                 return 0;
22688  
22689 +       rcu_read_lock();
22690         /*
22691          * The queueing is in progress, or it is already queued. Try to
22692          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
22693 @@ -1264,14 +1293,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22694                 set_work_pool_and_keep_pending(work, pool->id);
22695  
22696                 spin_unlock(&pool->lock);
22697 +               rcu_read_unlock();
22698                 return 1;
22699         }
22700         spin_unlock(&pool->lock);
22701  fail:
22702 -       local_irq_restore(*flags);
22703 +       rcu_read_unlock();
22704 +       local_unlock_irqrestore(pendingb_lock, *flags);
22705         if (work_is_canceling(work))
22706                 return -ENOENT;
22707 -       cpu_relax();
22708 +       cpu_chill();
22709         return -EAGAIN;
22710  }
22711  
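Two RT changes meet in try_to_grab_pending(): the PENDING protection moves from raw interrupt disabling to the pendingb_lock local lock, and the busy-wait in the caller's retry loop becomes cpu_chill() instead of cpu_relax(), because spinning on a bit owned by a possibly lower-priority task can live-lock an RT system. cpu_chill() is defined elsewhere in this patch; its expected shape is roughly:

    #ifdef CONFIG_PREEMPT_RT_FULL
    extern void cpu_chill(void);    /* a short, freezable sleep */
    #else
    # define cpu_chill()    cpu_relax()
    #endif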
22712 @@ -1373,7 +1404,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22713          * queued or lose PENDING.  Grabbing PENDING and queueing should
22714          * happen with IRQ disabled.
22715          */
22716 -       WARN_ON_ONCE(!irqs_disabled());
22717 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
22718  
22719         debug_work_activate(work);
22720  
22721 @@ -1381,6 +1412,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22722         if (unlikely(wq->flags & __WQ_DRAINING) &&
22723             WARN_ON_ONCE(!is_chained_work(wq)))
22724                 return;
22725 +       rcu_read_lock();
22726  retry:
22727         if (req_cpu == WORK_CPU_UNBOUND)
22728                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
22729 @@ -1437,10 +1469,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22730         /* pwq determined, queue */
22731         trace_workqueue_queue_work(req_cpu, pwq, work);
22732  
22733 -       if (WARN_ON(!list_empty(&work->entry))) {
22734 -               spin_unlock(&pwq->pool->lock);
22735 -               return;
22736 -       }
22737 +       if (WARN_ON(!list_empty(&work->entry)))
22738 +               goto out;
22739  
22740         pwq->nr_in_flight[pwq->work_color]++;
22741         work_flags = work_color_to_flags(pwq->work_color);
22742 @@ -1458,7 +1488,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22743  
22744         insert_work(pwq, work, worklist, work_flags);
22745  
22746 +out:
22747         spin_unlock(&pwq->pool->lock);
22748 +       rcu_read_unlock();
22749  }
22750  
22751  /**
22752 @@ -1478,14 +1510,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
22753         bool ret = false;
22754         unsigned long flags;
22755  
22756 -       local_irq_save(flags);
22757 +       local_lock_irqsave(pendingb_lock, flags);
22758  
22759         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22760                 __queue_work(cpu, wq, work);
22761                 ret = true;
22762         }
22763  
22764 -       local_irq_restore(flags);
22765 +       local_unlock_irqrestore(pendingb_lock, flags);
22766         return ret;
22767  }
22768  EXPORT_SYMBOL(queue_work_on);
22769 @@ -1552,14 +1584,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
22770         unsigned long flags;
22771  
22772         /* read the comment in __queue_work() */
22773 -       local_irq_save(flags);
22774 +       local_lock_irqsave(pendingb_lock, flags);
22775  
22776         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22777                 __queue_delayed_work(cpu, wq, dwork, delay);
22778                 ret = true;
22779         }
22780  
22781 -       local_irq_restore(flags);
22782 +       local_unlock_irqrestore(pendingb_lock, flags);
22783         return ret;
22784  }
22785  EXPORT_SYMBOL(queue_delayed_work_on);
22786 @@ -1594,7 +1626,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
22787  
22788         if (likely(ret >= 0)) {
22789                 __queue_delayed_work(cpu, wq, dwork, delay);
22790 -               local_irq_restore(flags);
22791 +               local_unlock_irqrestore(pendingb_lock, flags);
22792         }
22793  
22794         /* -ENOENT from try_to_grab_pending() becomes %true */
22795 @@ -1627,7 +1659,9 @@ static void worker_enter_idle(struct worker *worker)
22796         worker->last_active = jiffies;
22797  
22798         /* idle_list is LIFO */
22799 +       rt_lock_idle_list(pool);
22800         list_add(&worker->entry, &pool->idle_list);
22801 +       rt_unlock_idle_list(pool);
22802  
22803         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
22804                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
22805 @@ -1660,7 +1694,9 @@ static void worker_leave_idle(struct worker *worker)
22806                 return;
22807         worker_clr_flags(worker, WORKER_IDLE);
22808         pool->nr_idle--;
22809 +       rt_lock_idle_list(pool);
22810         list_del_init(&worker->entry);
22811 +       rt_unlock_idle_list(pool);
22812  }
22813  
22814  static struct worker *alloc_worker(int node)
22815 @@ -1826,7 +1862,9 @@ static void destroy_worker(struct worker *worker)
22816         pool->nr_workers--;
22817         pool->nr_idle--;
22818  
22819 +       rt_lock_idle_list(pool);
22820         list_del_init(&worker->entry);
22821 +       rt_unlock_idle_list(pool);
22822         worker->flags |= WORKER_DIE;
22823         wake_up_process(worker->task);
22824  }
22825 @@ -2785,14 +2823,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
22826  
22827         might_sleep();
22828  
22829 -       local_irq_disable();
22830 +       rcu_read_lock();
22831         pool = get_work_pool(work);
22832         if (!pool) {
22833 -               local_irq_enable();
22834 +               rcu_read_unlock();
22835                 return false;
22836         }
22837  
22838 -       spin_lock(&pool->lock);
22839 +       spin_lock_irq(&pool->lock);
22840         /* see the comment in try_to_grab_pending() with the same code */
22841         pwq = get_work_pwq(work);
22842         if (pwq) {
22843 @@ -2821,10 +2859,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
22844         else
22845                 lock_map_acquire_read(&pwq->wq->lockdep_map);
22846         lock_map_release(&pwq->wq->lockdep_map);
22847 -
22848 +       rcu_read_unlock();
22849         return true;
22850  already_gone:
22851         spin_unlock_irq(&pool->lock);
22852 +       rcu_read_unlock();
22853         return false;
22854  }
22855  
22856 @@ -2911,7 +2950,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
22857  
22858         /* tell other tasks trying to grab @work to back off */
22859         mark_work_canceling(work);
22860 -       local_irq_restore(flags);
22861 +       local_unlock_irqrestore(pendingb_lock, flags);
22862  
22863         flush_work(work);
22864         clear_work_data(work);
22865 @@ -2966,10 +3005,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
22866   */
22867  bool flush_delayed_work(struct delayed_work *dwork)
22868  {
22869 -       local_irq_disable();
22870 +       local_lock_irq(pendingb_lock);
22871         if (del_timer_sync(&dwork->timer))
22872                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
22873 -       local_irq_enable();
22874 +       local_unlock_irq(pendingb_lock);
22875         return flush_work(&dwork->work);
22876  }
22877  EXPORT_SYMBOL(flush_delayed_work);
22878 @@ -2987,7 +3026,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
22879                 return false;
22880  
22881         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
22882 -       local_irq_restore(flags);
22883 +       local_unlock_irqrestore(pendingb_lock, flags);
22884         return ret;
22885  }
22886  
22887 @@ -3245,7 +3284,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
22888   * put_unbound_pool - put a worker_pool
22889   * @pool: worker_pool to put
22890   *
22891 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
22892 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
22893   * safe manner.  get_unbound_pool() calls this function on its failure path
22894   * and this function should be able to release pools which went through,
22895   * successfully or not, init_worker_pool().
22896 @@ -3299,8 +3338,8 @@ static void put_unbound_pool(struct worker_pool *pool)
22897         del_timer_sync(&pool->idle_timer);
22898         del_timer_sync(&pool->mayday_timer);
22899  
22900 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
22901 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
22902 +       /* RCU protected to allow dereferences from get_work_pool() */
22903 +       call_rcu(&pool->rcu, rcu_free_pool);
22904  }
22905  
22906  /**
22907 @@ -3407,14 +3446,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
22908         put_unbound_pool(pool);
22909         mutex_unlock(&wq_pool_mutex);
22910  
22911 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
22912 +       call_rcu(&pwq->rcu, rcu_free_pwq);
22913  
22914         /*
22915          * If we're the last pwq going away, @wq is already dead and no one
22916          * is gonna access it anymore.  Schedule RCU free.
22917          */
22918         if (is_last)
22919 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22920 +               call_rcu(&wq->rcu, rcu_free_wq);
22921  }
22922  
22923  /**
22924 @@ -4064,7 +4103,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
22925                  * The base ref is never dropped on per-cpu pwqs.  Directly
22926                  * schedule RCU free.
22927                  */
22928 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22929 +               call_rcu(&wq->rcu, rcu_free_wq);
22930         } else {
22931                 /*
22932                  * We're the sole accessor of @wq at this point.  Directly
22933 @@ -4157,7 +4196,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
22934         struct pool_workqueue *pwq;
22935         bool ret;
22936  
22937 -       rcu_read_lock_sched();
22938 +       rcu_read_lock();
22939 +       preempt_disable();
22940  
22941         if (cpu == WORK_CPU_UNBOUND)
22942                 cpu = smp_processor_id();
22943 @@ -4168,7 +4208,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
22944                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
22945  
22946         ret = !list_empty(&pwq->delayed_works);
22947 -       rcu_read_unlock_sched();
22948 +       preempt_enable();
22949 +       rcu_read_unlock();
22950  
22951         return ret;
22952  }
22953 @@ -4194,15 +4235,15 @@ unsigned int work_busy(struct work_struct *work)
22954         if (work_pending(work))
22955                 ret |= WORK_BUSY_PENDING;
22956  
22957 -       local_irq_save(flags);
22958 +       rcu_read_lock();
22959         pool = get_work_pool(work);
22960         if (pool) {
22961 -               spin_lock(&pool->lock);
22962 +               spin_lock_irqsave(&pool->lock, flags);
22963                 if (find_worker_executing_work(pool, work))
22964                         ret |= WORK_BUSY_RUNNING;
22965 -               spin_unlock(&pool->lock);
22966 +               spin_unlock_irqrestore(&pool->lock, flags);
22967         }
22968 -       local_irq_restore(flags);
22969 +       rcu_read_unlock();
22970  
22971         return ret;
22972  }
22973 @@ -4391,7 +4432,7 @@ void show_workqueue_state(void)
22974         unsigned long flags;
22975         int pi;
22976  
22977 -       rcu_read_lock_sched();
22978 +       rcu_read_lock();
22979  
22980         pr_info("Showing busy workqueues and worker pools:\n");
22981  
22982 @@ -4444,7 +4485,7 @@ void show_workqueue_state(void)
22983                 spin_unlock_irqrestore(&pool->lock, flags);
22984         }
22985  
22986 -       rcu_read_unlock_sched();
22987 +       rcu_read_unlock();
22988  }
22989  
22990  /*
22991 @@ -4782,16 +4823,16 @@ bool freeze_workqueues_busy(void)
22992                  * nr_active is monotonically decreasing.  It's safe
22993                  * to peek without lock.
22994                  */
22995 -               rcu_read_lock_sched();
22996 +               rcu_read_lock();
22997                 for_each_pwq(pwq, wq) {
22998                         WARN_ON_ONCE(pwq->nr_active < 0);
22999                         if (pwq->nr_active) {
23000                                 busy = true;
23001 -                               rcu_read_unlock_sched();
23002 +                               rcu_read_unlock();
23003                                 goto out_unlock;
23004                         }
23005                 }
23006 -               rcu_read_unlock_sched();
23007 +               rcu_read_unlock();
23008         }
23009  out_unlock:
23010         mutex_unlock(&wq_pool_mutex);
23011 @@ -4981,7 +5022,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
23012         const char *delim = "";
23013         int node, written = 0;
23014  
23015 -       rcu_read_lock_sched();
23016 +       get_online_cpus();
23017 +       rcu_read_lock();
23018         for_each_node(node) {
23019                 written += scnprintf(buf + written, PAGE_SIZE - written,
23020                                      "%s%d:%d", delim, node,
23021 @@ -4989,7 +5031,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
23022                 delim = " ";
23023         }
23024         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
23025 -       rcu_read_unlock_sched();
23026 +       rcu_read_unlock();
23027 +       put_online_cpus();
23028  
23029         return written;
23030  }
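The remaining workqueue hunks are all one conversion: sched-RCU (rcu_read_lock_sched(), call_rcu_sched(), the *_sched lockdep assertions) becomes plain RCU, because preempt-disabled regions are no longer usable as read-side critical sections once they can be preempted on RT. Lookups of pools and pwqs therefore sit in explicit rcu_read_lock()/rcu_read_unlock() pairs and the frees go through call_rcu(). The reader pattern, condensed from work_busy() above into a standalone sketch:

    static unsigned int example_work_busy(struct work_struct *work)  /* hypothetical */
    {
            struct worker_pool *pool;
            unsigned long flags;
            unsigned int ret = 0;

            rcu_read_lock();                        /* was rcu_read_lock_sched() */
            pool = get_work_pool(work);
            if (pool) {
                    spin_lock_irqsave(&pool->lock, flags);
                    if (find_worker_executing_work(pool, work))
                            ret |= WORK_BUSY_RUNNING;
                    spin_unlock_irqrestore(&pool->lock, flags);
            }
            rcu_read_unlock();
            return ret;
    }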
23031 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
23032 index 8635417c587b..f000c4d6917e 100644
23033 --- a/kernel/workqueue_internal.h
23034 +++ b/kernel/workqueue_internal.h
23035 @@ -43,6 +43,7 @@ struct worker {
23036         unsigned long           last_active;    /* L: last active timestamp */
23037         unsigned int            flags;          /* X: flags */
23038         int                     id;             /* I: worker id */
23039 +       int                     sleeping;       /* None */
23040  
23041         /*
23042          * Opaque string set with work_set_desc().  Printed out with task
23043 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
23044   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
23045   * sched/core.c and workqueue.c.
23046   */
23047 -void wq_worker_waking_up(struct task_struct *task, int cpu);
23048 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
23049 +void wq_worker_running(struct task_struct *task);
23050 +void wq_worker_sleeping(struct task_struct *task);
23051  
23052  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
23053 diff --git a/lib/Kconfig b/lib/Kconfig
23054 index 260a80e313b9..b06becb3f477 100644
23055 --- a/lib/Kconfig
23056 +++ b/lib/Kconfig
23057 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
23058  
23059  config CPUMASK_OFFSTACK
23060         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
23061 +       depends on !PREEMPT_RT_FULL
23062         help
23063           Use dynamic allocation for cpumask_var_t, instead of putting
23064           them on the stack.  This is a bit more expensive, but avoids
23065 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
23066 index 056052dc8e91..d8494e126de8 100644
23067 --- a/lib/debugobjects.c
23068 +++ b/lib/debugobjects.c
23069 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
23070         struct debug_obj *obj;
23071         unsigned long flags;
23072  
23073 -       fill_pool();
23074 +#ifdef CONFIG_PREEMPT_RT_FULL
23075 +       if (preempt_count() == 0 && !irqs_disabled())
23076 +#endif
23077 +               fill_pool();
23078  
23079         db = get_bucket((unsigned long) addr);
23080  
23081 diff --git a/lib/idr.c b/lib/idr.c
23082 index 6098336df267..9decbe914595 100644
23083 --- a/lib/idr.c
23084 +++ b/lib/idr.c
23085 @@ -30,6 +30,7 @@
23086  #include <linux/idr.h>
23087  #include <linux/spinlock.h>
23088  #include <linux/percpu.h>
23089 +#include <linux/locallock.h>
23090  
23091  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
23092  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
23093 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
23094  static DEFINE_PER_CPU(int, idr_preload_cnt);
23095  static DEFINE_SPINLOCK(simple_ida_lock);
23096  
23097 +#ifdef CONFIG_PREEMPT_RT_FULL
23098 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
23099 +
23100 +static inline void idr_preload_lock(void)
23101 +{
23102 +       local_lock(idr_lock);
23103 +}
23104 +
23105 +static inline void idr_preload_unlock(void)
23106 +{
23107 +       local_unlock(idr_lock);
23108 +}
23109 +
23110 +void idr_preload_end(void)
23111 +{
23112 +       idr_preload_unlock();
23113 +}
23114 +EXPORT_SYMBOL(idr_preload_end);
23115 +#else
23116 +static inline void idr_preload_lock(void)
23117 +{
23118 +       preempt_disable();
23119 +}
23120 +
23121 +static inline void idr_preload_unlock(void)
23122 +{
23123 +       preempt_enable();
23124 +}
23125 +#endif
23126 +
23127 +
23128  /* the maximum ID which can be allocated given idr->layers */
23129  static int idr_max(int layers)
23130  {
23131 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
23132          * context.  See idr_preload() for details.
23133          */
23134         if (!in_interrupt()) {
23135 -               preempt_disable();
23136 +               idr_preload_lock();
23137                 new = __this_cpu_read(idr_preload_head);
23138                 if (new) {
23139                         __this_cpu_write(idr_preload_head, new->ary[0]);
23140                         __this_cpu_dec(idr_preload_cnt);
23141                         new->ary[0] = NULL;
23142                 }
23143 -               preempt_enable();
23144 +               idr_preload_unlock();
23145                 if (new)
23146                         return new;
23147         }
23148 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
23149         idr_mark_full(pa, id);
23150  }
23151  
23152 -
23153  /**
23154   * idr_preload - preload for idr_alloc()
23155   * @gfp_mask: allocation mask to use for preloading
23156 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
23157         WARN_ON_ONCE(in_interrupt());
23158         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
23159  
23160 -       preempt_disable();
23161 +       idr_preload_lock();
23162  
23163         /*
23164          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
23165 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
23166         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
23167                 struct idr_layer *new;
23168  
23169 -               preempt_enable();
23170 +               idr_preload_unlock();
23171                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
23172 -               preempt_disable();
23173 +               idr_preload_lock();
23174                 if (!new)
23175                         break;
23176  
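The reason idr_preload() cannot simply keep using preempt_disable() is that the region it opens runs caller code up to idr_preload_end(), and on RT that code takes sleeping locks. With the local lock the region stays preemptible while still being serialized against other users of the per-CPU preload cache, and idr_preload_end() has to become a real exported function on RT instead of the inline preempt_enable() wrapper. The caller pattern itself is unchanged; roughly (names hypothetical):

    static int example_assign_id(struct idr *idr, spinlock_t *lock, void *ptr)
    {
            int id;

            idr_preload(GFP_KERNEL);        /* takes idr_lock on RT */
            spin_lock(lock);
            id = idr_alloc(idr, ptr, 0, 0, GFP_NOWAIT);
            spin_unlock(lock);
            idr_preload_end();              /* drops idr_lock on RT */

            return id;
    }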
23177 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
23178 index 1d6565e81030..b23a79761df7 100644
23179 --- a/lib/irq_poll.c
23180 +++ b/lib/irq_poll.c
23181 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
23182         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
23183         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23184         local_irq_restore(flags);
23185 +       preempt_check_resched_rt();
23186  }
23187  EXPORT_SYMBOL(irq_poll_sched);
23188  
23189 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
23190         local_irq_save(flags);
23191         __irq_poll_complete(iop);
23192         local_irq_restore(flags);
23193 +       preempt_check_resched_rt();
23194  }
23195  EXPORT_SYMBOL(irq_poll_complete);
23196  
23197 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
23198                 }
23199  
23200                 local_irq_enable();
23201 +               preempt_check_resched_rt();
23202  
23203                 /* Even though interrupts have been re-enabled, this
23204                  * access is safe because interrupts can only add new
23205 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
23206                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23207  
23208         local_irq_enable();
23209 +       preempt_check_resched_rt();
23210  }
23211  
23212  /**
23213 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
23214                          this_cpu_ptr(&blk_cpu_iopoll));
23215         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23216         local_irq_enable();
23217 +       preempt_check_resched_rt();
23218  
23219         return 0;
23220  }
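On RT, raising a softirq from an interrupts-off region only marks it pending for thread-context softirq processing; nothing runs it on the way out of the hardirq, so each of these sites follows local_irq_enable()/local_irq_restore() with preempt_check_resched_rt() to give the scheduler a chance to run the softirq thread promptly. The helper is defined elsewhere in this patch, presumably along these lines:

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define preempt_check_resched_rt()     preempt_check_resched()
    #else
    # define preempt_check_resched_rt()     barrier()
    #endif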
23221 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
23222 index f3a217ea0388..4611b156ef79 100644
23223 --- a/lib/locking-selftest.c
23224 +++ b/lib/locking-selftest.c
23225 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
23226  #include "locking-selftest-spin-hardirq.h"
23227  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
23228  
23229 +#ifndef CONFIG_PREEMPT_RT_FULL
23230 +
23231  #include "locking-selftest-rlock-hardirq.h"
23232  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
23233  
23234 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
23235  #include "locking-selftest-wlock-softirq.h"
23236  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
23237  
23238 +#endif
23239 +
23240  #undef E1
23241  #undef E2
23242  
23243 +#ifndef CONFIG_PREEMPT_RT_FULL
23244  /*
23245   * Enabling hardirqs with a softirq-safe lock held:
23246   */
23247 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
23248  #undef E1
23249  #undef E2
23250  
23251 +#endif
23252 +
23253  /*
23254   * Enabling irqs with an irq-safe lock held:
23255   */
23256 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
23257  #include "locking-selftest-spin-hardirq.h"
23258  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
23259  
23260 +#ifndef CONFIG_PREEMPT_RT_FULL
23261 +
23262  #include "locking-selftest-rlock-hardirq.h"
23263  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
23264  
23265 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
23266  #include "locking-selftest-wlock-softirq.h"
23267  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
23268  
23269 +#endif
23270 +
23271  #undef E1
23272  #undef E2
23273  
23274 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
23275  #include "locking-selftest-spin-hardirq.h"
23276  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
23277  
23278 +#ifndef CONFIG_PREEMPT_RT_FULL
23279 +
23280  #include "locking-selftest-rlock-hardirq.h"
23281  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
23282  
23283 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
23284  #include "locking-selftest-wlock-softirq.h"
23285  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
23286  
23287 +#endif
23288 +
23289  #undef E1
23290  #undef E2
23291  #undef E3
23292 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
23293  #include "locking-selftest-spin-hardirq.h"
23294  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
23295  
23296 +#ifndef CONFIG_PREEMPT_RT_FULL
23297 +
23298  #include "locking-selftest-rlock-hardirq.h"
23299  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
23300  
23301 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
23302  #include "locking-selftest-wlock-softirq.h"
23303  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
23304  
23305 +#endif
23306 +
23307  #undef E1
23308  #undef E2
23309  #undef E3
23310  
23311 +#ifndef CONFIG_PREEMPT_RT_FULL
23312 +
23313  /*
23314   * read-lock / write-lock irq inversion.
23315   *
23316 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
23317  #undef E2
23318  #undef E3
23319  
23320 +#endif
23321 +
23322 +#ifndef CONFIG_PREEMPT_RT_FULL
23323 +
23324  /*
23325   * read-lock / write-lock recursion that is actually safe.
23326   */
23327 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
23328  #undef E2
23329  #undef E3
23330  
23331 +#endif
23332 +
23333  /*
23334   * read-lock / write-lock recursion that is unsafe.
23335   */
23336 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
23337  
23338         printk("  --------------------------------------------------------------------------\n");
23339  
23340 +#ifndef CONFIG_PREEMPT_RT_FULL
23341         /*
23342          * irq-context testcases:
23343          */
23344 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
23345  
23346         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
23347  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
23348 +#else
23349 +       /* On -rt, we only do hardirq context test for raw spinlock */
23350 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
23351 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
23352 +
23353 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
23354 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
23355 +
23356 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
23357 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
23358 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
23359 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
23360 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
23361 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
23362 +
23363 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
23364 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
23365 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
23366 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
23367 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
23368 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
23369 +#endif
23370  
23371         ww_tests();
23372  
23373 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
23374 index 6d40944960de..822a2c027e72 100644
23375 --- a/lib/percpu_ida.c
23376 +++ b/lib/percpu_ida.c
23377 @@ -26,6 +26,9 @@
23378  #include <linux/string.h>
23379  #include <linux/spinlock.h>
23380  #include <linux/percpu_ida.h>
23381 +#include <linux/locallock.h>
23382 +
23383 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
23384  
23385  struct percpu_ida_cpu {
23386         /*
23387 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23388         unsigned long flags;
23389         int tag;
23390  
23391 -       local_irq_save(flags);
23392 +       local_lock_irqsave(irq_off_lock, flags);
23393         tags = this_cpu_ptr(pool->tag_cpu);
23394  
23395         /* Fastpath */
23396         tag = alloc_local_tag(tags);
23397         if (likely(tag >= 0)) {
23398 -               local_irq_restore(flags);
23399 +               local_unlock_irqrestore(irq_off_lock, flags);
23400                 return tag;
23401         }
23402  
23403 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23404  
23405                 if (!tags->nr_free)
23406                         alloc_global_tags(pool, tags);
23407 +
23408                 if (!tags->nr_free)
23409                         steal_tags(pool, tags);
23410  
23411 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23412                 }
23413  
23414                 spin_unlock(&pool->lock);
23415 -               local_irq_restore(flags);
23416 +               local_unlock_irqrestore(irq_off_lock, flags);
23417  
23418                 if (tag >= 0 || state == TASK_RUNNING)
23419                         break;
23420 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23421  
23422                 schedule();
23423  
23424 -               local_irq_save(flags);
23425 +               local_lock_irqsave(irq_off_lock, flags);
23426                 tags = this_cpu_ptr(pool->tag_cpu);
23427         }
23428         if (state != TASK_RUNNING)
23429 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
23430  
23431         BUG_ON(tag >= pool->nr_tags);
23432  
23433 -       local_irq_save(flags);
23434 +       local_lock_irqsave(irq_off_lock, flags);
23435         tags = this_cpu_ptr(pool->tag_cpu);
23436  
23437         spin_lock(&tags->lock);
23438 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
23439                 spin_unlock(&pool->lock);
23440         }
23441  
23442 -       local_irq_restore(flags);
23443 +       local_unlock_irqrestore(irq_off_lock, flags);
23444  }
23445  EXPORT_SYMBOL_GPL(percpu_ida_free);
23446  
23447 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
23448         struct percpu_ida_cpu *remote;
23449         unsigned cpu, i, err = 0;
23450  
23451 -       local_irq_save(flags);
23452 +       local_lock_irqsave(irq_off_lock, flags);
23453         for_each_possible_cpu(cpu) {
23454                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
23455                 spin_lock(&remote->lock);
23456 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
23457         }
23458         spin_unlock(&pool->lock);
23459  out:
23460 -       local_irq_restore(flags);
23461 +       local_unlock_irqrestore(irq_off_lock, flags);
23462         return err;
23463  }
23464  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
23465 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
23466 index 8e6d552c40dd..741da5a77fd5 100644
23467 --- a/lib/radix-tree.c
23468 +++ b/lib/radix-tree.c
23469 @@ -36,7 +36,7 @@
23470  #include <linux/bitops.h>
23471  #include <linux/rcupdate.h>
23472  #include <linux/preempt.h>             /* in_interrupt() */
23473 -
23474 +#include <linux/locallock.h>
23475  
23476  /* Number of nodes in fully populated tree of given height */
23477  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
23478 @@ -68,6 +68,7 @@ struct radix_tree_preload {
23479         struct radix_tree_node *nodes;
23480  };
23481  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
23482 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
23483  
23484  static inline void *node_to_entry(void *ptr)
23485  {
23486 @@ -290,13 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
23487                  * succeed in getting a node here (and never reach
23488                  * kmem_cache_alloc)
23489                  */
23490 -               rtp = this_cpu_ptr(&radix_tree_preloads);
23491 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
23492                 if (rtp->nr) {
23493                         ret = rtp->nodes;
23494                         rtp->nodes = ret->private_data;
23495                         ret->private_data = NULL;
23496                         rtp->nr--;
23497                 }
23498 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
23499                 /*
23500                  * Update the allocation stack trace as this is more useful
23501                  * for debugging.
23502 @@ -357,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
23503          */
23504         gfp_mask &= ~__GFP_ACCOUNT;
23505  
23506 -       preempt_disable();
23507 +       local_lock(radix_tree_preloads_lock);
23508         rtp = this_cpu_ptr(&radix_tree_preloads);
23509         while (rtp->nr < nr) {
23510 -               preempt_enable();
23511 +               local_unlock(radix_tree_preloads_lock);
23512                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
23513                 if (node == NULL)
23514                         goto out;
23515 -               preempt_disable();
23516 +               local_lock(radix_tree_preloads_lock);
23517                 rtp = this_cpu_ptr(&radix_tree_preloads);
23518                 if (rtp->nr < nr) {
23519                         node->private_data = rtp->nodes;
23520 @@ -406,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
23521         if (gfpflags_allow_blocking(gfp_mask))
23522                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
23523         /* Preloading doesn't help anything with this gfp mask, skip it */
23524 -       preempt_disable();
23525 +       local_lock(radix_tree_preloads_lock);
23526         return 0;
23527  }
23528  EXPORT_SYMBOL(radix_tree_maybe_preload);
23529 @@ -422,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
23530  
23531         /* Preloading doesn't help anything with this gfp mask, skip it */
23532         if (!gfpflags_allow_blocking(gfp_mask)) {
23533 -               preempt_disable();
23534 +               local_lock(radix_tree_preloads_lock);
23535                 return 0;
23536         }
23537  
23538 @@ -456,6 +458,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
23539         return __radix_tree_preload(gfp_mask, nr_nodes);
23540  }
23541  
23542 +void radix_tree_preload_end(void)
23543 +{
23544 +       local_unlock(radix_tree_preloads_lock);
23545 +}
23546 +EXPORT_SYMBOL(radix_tree_preload_end);
23547 +
23548  /*
23549   * The maximum index which can be stored in a radix tree
23550   */
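This is the same local-lock conversion as in lib/idr.c above, with one extra wrinkle: radix_tree_maybe_preload() and radix_tree_maybe_preload_order() still return success without preloading for non-blocking gfp masks, so they must take radix_tree_preloads_lock anyway, otherwise the radix_tree_preload_end() that every caller pairs with them would release a lock that was never acquired. That pairing is also why radix_tree_preload_end() moves out of the header and is exported above. The caller side, condensed (names hypothetical):

    static int example_tree_insert(struct radix_tree_root *root, unsigned long index,
                                   void *item, spinlock_t *tree_lock, gfp_t gfp)
    {
            int err = radix_tree_maybe_preload(gfp);

            if (err)
                    return err;

            spin_lock_irq(tree_lock);
            err = radix_tree_insert(root, index, item);
            spin_unlock_irq(tree_lock);

            radix_tree_preload_end();       /* balances even the no-preload path */
            return err;
    }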
23551 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
23552 index 004fc70fc56a..ccc46992a517 100644
23553 --- a/lib/scatterlist.c
23554 +++ b/lib/scatterlist.c
23555 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
23556                         flush_kernel_dcache_page(miter->page);
23557  
23558                 if (miter->__flags & SG_MITER_ATOMIC) {
23559 -                       WARN_ON_ONCE(preemptible());
23560 +                       WARN_ON_ONCE(!pagefault_disabled());
23561                         kunmap_atomic(miter->addr);
23562                 } else
23563                         kunmap(miter->page);
23564 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
23565         if (!sg_miter_skip(&miter, skip))
23566                 return false;
23567  
23568 -       local_irq_save(flags);
23569 +       local_irq_save_nort(flags);
23570  
23571         while (sg_miter_next(&miter) && offset < buflen) {
23572                 unsigned int len;
23573 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
23574  
23575         sg_miter_stop(&miter);
23576  
23577 -       local_irq_restore(flags);
23578 +       local_irq_restore_nort(flags);
23579         return offset;
23580  }
23581  EXPORT_SYMBOL(sg_copy_buffer);
23582 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
23583 index 1afec32de6f2..11fa431046a8 100644
23584 --- a/lib/smp_processor_id.c
23585 +++ b/lib/smp_processor_id.c
23586 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
23587         if (!printk_ratelimit())
23588                 goto out_enable;
23589  
23590 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
23591 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
23592 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
23593 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
23594 +               current->comm, current->pid);
23595  
23596         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
23597         dump_stack();
23598 diff --git a/localversion-rt b/localversion-rt
23599 new file mode 100644
23600 index 000000000000..1199ebade17b
23601 --- /dev/null
23602 +++ b/localversion-rt
23603 @@ -0,0 +1 @@
23604 +-rt16
23605 diff --git a/mm/Kconfig b/mm/Kconfig
23606 index 86e3e0e74d20..77e5862a1ed2 100644
23607 --- a/mm/Kconfig
23608 +++ b/mm/Kconfig
23609 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
23610  
23611  config TRANSPARENT_HUGEPAGE
23612         bool "Transparent Hugepage Support"
23613 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
23614 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
23615         select COMPACTION
23616         select RADIX_TREE_MULTIORDER
23617         help
23618 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
23619 index 6ff2d7744223..b5a91dd53b5f 100644
23620 --- a/mm/backing-dev.c
23621 +++ b/mm/backing-dev.c
23622 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
23623  {
23624         unsigned long flags;
23625  
23626 -       local_irq_save(flags);
23627 +       local_irq_save_nort(flags);
23628         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
23629 -               local_irq_restore(flags);
23630 +               local_irq_restore_nort(flags);
23631                 return;
23632         }
23633  
23634 diff --git a/mm/compaction.c b/mm/compaction.c
23635 index 70e6bec46dc2..6678ed58b7c6 100644
23636 --- a/mm/compaction.c
23637 +++ b/mm/compaction.c
23638 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
23639                                 block_start_pfn(cc->migrate_pfn, cc->order);
23640  
23641                         if (cc->last_migrated_pfn < current_block_start) {
23642 -                               cpu = get_cpu();
23643 +                               cpu = get_cpu_light();
23644 +                               local_lock_irq(swapvec_lock);
23645                                 lru_add_drain_cpu(cpu);
23646 +                               local_unlock_irq(swapvec_lock);
23647                                 drain_local_pages(zone);
23648 -                               put_cpu();
23649 +                               put_cpu_light();
23650                                 /* No more flushing until we migrate again */
23651                                 cc->last_migrated_pfn = 0;
23652                         }
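get_cpu()/put_cpu() would pin this section with preemption disabled, which is too restrictive on RT since both lru_add_drain_cpu() and drain_local_pages() can end up in sleeping locks; get_cpu_light() is the patch's migrate-disable variant (the task stays on its CPU but remains preemptible), and the per-CPU LRU pagevecs are covered explicitly by the swapvec_lock local lock instead. The light helpers are presumably defined like this:

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()        migrate_enable()
    #else
    # define get_cpu_light()        get_cpu()
    # define put_cpu_light()        put_cpu()
    #endif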
23653 diff --git a/mm/filemap.c b/mm/filemap.c
23654 index edfb90e3830c..a8d2c7a73d54 100644
23655 --- a/mm/filemap.c
23656 +++ b/mm/filemap.c
23657 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
23658                  * node->private_list is protected by
23659                  * mapping->tree_lock.
23660                  */
23661 -               if (!list_empty(&node->private_list))
23662 -                       list_lru_del(&workingset_shadow_nodes,
23663 +               if (!list_empty(&node->private_list)) {
23664 +                       local_lock(workingset_shadow_lock);
23665 +                       list_lru_del(&__workingset_shadow_nodes,
23666                                      &node->private_list);
23667 +                       local_unlock(workingset_shadow_lock);
23668 +               }
23669         }
23670         return 0;
23671  }
23672 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
23673                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
23674                                 list_empty(&node->private_list)) {
23675                         node->private_data = mapping;
23676 -                       list_lru_add(&workingset_shadow_nodes,
23677 -                                       &node->private_list);
23678 +                       local_lock(workingset_shadow_lock);
23679 +                       list_lru_add(&__workingset_shadow_nodes,
23680 +                                    &node->private_list);
23681 +                       local_unlock(workingset_shadow_lock);
23682                 }
23683         }
23684  
23685 diff --git a/mm/highmem.c b/mm/highmem.c
23686 index 50b4ca6787f0..77518a3b35a1 100644
23687 --- a/mm/highmem.c
23688 +++ b/mm/highmem.c
23689 @@ -29,10 +29,11 @@
23690  #include <linux/kgdb.h>
23691  #include <asm/tlbflush.h>
23692  
23693 -
23694 +#ifndef CONFIG_PREEMPT_RT_FULL
23695  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
23696  DEFINE_PER_CPU(int, __kmap_atomic_idx);
23697  #endif
23698 +#endif
23699  
23700  /*
23701   * Virtual_count is not a pure "count".
23702 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
23703  unsigned long totalhigh_pages __read_mostly;
23704  EXPORT_SYMBOL(totalhigh_pages);
23705  
23706 -
23707 +#ifndef CONFIG_PREEMPT_RT_FULL
23708  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
23709 +#endif
23710  
23711  unsigned int nr_free_highpages (void)
23712  {
23713 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
23714 index 0de26691f0f5..db6fe1ba7b34 100644
23715 --- a/mm/memcontrol.c
23716 +++ b/mm/memcontrol.c
23717 @@ -67,6 +67,7 @@
23718  #include <net/sock.h>
23719  #include <net/ip.h>
23720  #include "slab.h"
23721 +#include <linux/locallock.h>
23722  
23723  #include <asm/uaccess.h>
23724  
23725 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
23726  #define do_swap_account                0
23727  #endif
23728  
23729 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
23730 +
23731  /* Whether legacy memory+swap accounting is active */
23732  static bool do_memsw_account(void)
23733  {
23734 @@ -1692,6 +1695,7 @@ struct memcg_stock_pcp {
23735  #define FLUSHING_CACHED_CHARGE 0
23736  };
23737  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
23738 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
23739  static DEFINE_MUTEX(percpu_charge_mutex);
23740  
23741  /**
23742 @@ -1714,7 +1718,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23743         if (nr_pages > CHARGE_BATCH)
23744                 return ret;
23745  
23746 -       local_irq_save(flags);
23747 +       local_lock_irqsave(memcg_stock_ll, flags);
23748  
23749         stock = this_cpu_ptr(&memcg_stock);
23750         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
23751 @@ -1722,7 +1726,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23752                 ret = true;
23753         }
23754  
23755 -       local_irq_restore(flags);
23756 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23757  
23758         return ret;
23759  }
23760 @@ -1749,13 +1753,13 @@ static void drain_local_stock(struct work_struct *dummy)
23761         struct memcg_stock_pcp *stock;
23762         unsigned long flags;
23763  
23764 -       local_irq_save(flags);
23765 +       local_lock_irqsave(memcg_stock_ll, flags);
23766  
23767         stock = this_cpu_ptr(&memcg_stock);
23768         drain_stock(stock);
23769         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
23770  
23771 -       local_irq_restore(flags);
23772 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23773  }
23774  
23775  /*
23776 @@ -1767,7 +1771,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23777         struct memcg_stock_pcp *stock;
23778         unsigned long flags;
23779  
23780 -       local_irq_save(flags);
23781 +       local_lock_irqsave(memcg_stock_ll, flags);
23782  
23783         stock = this_cpu_ptr(&memcg_stock);
23784         if (stock->cached != memcg) { /* reset if necessary */
23785 @@ -1776,7 +1780,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23786         }
23787         stock->nr_pages += nr_pages;
23788  
23789 -       local_irq_restore(flags);
23790 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23791  }
23792  
23793  /*
23794 @@ -1792,7 +1796,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
23795                 return;
23796         /* Notify other cpus that system-wide "drain" is running */
23797         get_online_cpus();
23798 -       curcpu = get_cpu();
23799 +       curcpu = get_cpu_light();
23800         for_each_online_cpu(cpu) {
23801                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
23802                 struct mem_cgroup *memcg;
23803 @@ -1809,7 +1813,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
23804                                 schedule_work_on(cpu, &stock->work);
23805                 }
23806         }
23807 -       put_cpu();
23808 +       put_cpu_light();
23809         put_online_cpus();
23810         mutex_unlock(&percpu_charge_mutex);
23811  }
23812 @@ -4553,12 +4557,12 @@ static int mem_cgroup_move_account(struct page *page,
23813  
23814         ret = 0;
23815  
23816 -       local_irq_disable();
23817 +       local_lock_irq(event_lock);
23818         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
23819         memcg_check_events(to, page);
23820         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
23821         memcg_check_events(from, page);
23822 -       local_irq_enable();
23823 +       local_unlock_irq(event_lock);
23824  out_unlock:
23825         unlock_page(page);
23826  out:
23827 @@ -5433,10 +5437,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
23828  
23829         commit_charge(page, memcg, lrucare);
23830  
23831 -       local_irq_disable();
23832 +       local_lock_irq(event_lock);
23833         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
23834         memcg_check_events(memcg, page);
23835 -       local_irq_enable();
23836 +       local_unlock_irq(event_lock);
23837  
23838         if (do_memsw_account() && PageSwapCache(page)) {
23839                 swp_entry_t entry = { .val = page_private(page) };
23840 @@ -5492,14 +5496,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
23841                 memcg_oom_recover(memcg);
23842         }
23843  
23844 -       local_irq_save(flags);
23845 +       local_lock_irqsave(event_lock, flags);
23846         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
23847         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
23848         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
23849         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
23850         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
23851         memcg_check_events(memcg, dummy_page);
23852 -       local_irq_restore(flags);
23853 +       local_unlock_irqrestore(event_lock, flags);
23854  
23855         if (!mem_cgroup_is_root(memcg))
23856                 css_put_many(&memcg->css, nr_pages);
23857 @@ -5654,10 +5658,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
23858  
23859         commit_charge(newpage, memcg, false);
23860  
23861 -       local_irq_save(flags);
23862 +       local_lock_irqsave(event_lock, flags);
23863         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
23864         memcg_check_events(memcg, newpage);
23865 -       local_irq_restore(flags);
23866 +       local_unlock_irqrestore(event_lock, flags);
23867  }
23868  
23869  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
23870 @@ -5837,6 +5841,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
23871  {
23872         struct mem_cgroup *memcg, *swap_memcg;
23873         unsigned short oldid;
23874 +       unsigned long flags;
23875  
23876         VM_BUG_ON_PAGE(PageLRU(page), page);
23877         VM_BUG_ON_PAGE(page_count(page), page);
23878 @@ -5877,12 +5882,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
23879          * important here to have the interrupts disabled because it is the
23880          * only synchronisation we have for udpating the per-CPU variables.
23881          * only synchronisation we have for updating the per-CPU variables.
23882 +       local_lock_irqsave(event_lock, flags);
23883 +#ifndef CONFIG_PREEMPT_RT_BASE
23884         VM_BUG_ON(!irqs_disabled());
23885 +#endif
23886         mem_cgroup_charge_statistics(memcg, page, false, -1);
23887         memcg_check_events(memcg, page);
23888  
23889         if (!mem_cgroup_is_root(memcg))
23890                 css_put(&memcg->css);
23891 +       local_unlock_irqrestore(event_lock, flags);
23892  }
23893  
23894  /*
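The memcontrol hunks all apply one conversion: a local_irq_save()/local_irq_restore() pair that only guards per-CPU data is replaced by a named local lock (memcg_stock_ll for the charge stock, event_lock for the statistics paths). On non-RT kernels the local lock compiles down to plain interrupt disabling; on RT it becomes a per-CPU sleeping lock, so the section stays preemptible. A minimal sketch of the pattern, assuming a hypothetical lock my_stat_lock and counter my_stat:

        /* Hypothetical example of the local-lock conversion used above. */
        static DEFINE_LOCAL_IRQ_LOCK(my_stat_lock);
        static DEFINE_PER_CPU(unsigned long, my_stat);

        static void my_stat_add(unsigned long n)
        {
                unsigned long flags;

                /* was: local_irq_save(flags); */
                local_lock_irqsave(my_stat_lock, flags);
                __this_cpu_add(my_stat, n);
                /* was: local_irq_restore(flags); */
                local_unlock_irqrestore(my_stat_lock, flags);
        }
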
23895 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
23896 index 6f4d27c5bb32..5cd25c745a8f 100644
23897 --- a/mm/mmu_context.c
23898 +++ b/mm/mmu_context.c
23899 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
23900         struct task_struct *tsk = current;
23901  
23902         task_lock(tsk);
23903 +       preempt_disable_rt();
23904         active_mm = tsk->active_mm;
23905         if (active_mm != mm) {
23906                 atomic_inc(&mm->mm_count);
23907 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
23908         }
23909         tsk->mm = mm;
23910         switch_mm(active_mm, mm, tsk);
23911 +       preempt_enable_rt();
23912         task_unlock(tsk);
23913  #ifdef finish_arch_post_lock_switch
23914         finish_arch_post_lock_switch();
23915 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
23916 index 1460e6ad5e14..dc4eac895340 100644
23917 --- a/mm/page_alloc.c
23918 +++ b/mm/page_alloc.c
23919 @@ -61,6 +61,7 @@
23920  #include <linux/page_ext.h>
23921  #include <linux/hugetlb.h>
23922  #include <linux/sched/rt.h>
23923 +#include <linux/locallock.h>
23924  #include <linux/page_owner.h>
23925  #include <linux/kthread.h>
23926  #include <linux/memcontrol.h>
23927 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
23928  EXPORT_SYMBOL(nr_online_nodes);
23929  #endif
23930  
23931 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
23932 +
23933 +#ifdef CONFIG_PREEMPT_RT_BASE
23934 +# define cpu_lock_irqsave(cpu, flags)          \
23935 +       local_lock_irqsave_on(pa_lock, flags, cpu)
23936 +# define cpu_unlock_irqrestore(cpu, flags)     \
23937 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
23938 +#else
23939 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
23940 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
23941 +#endif
23942 +
23943  int page_group_by_mobility_disabled __read_mostly;
23944  
23945  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
23946 @@ -1072,7 +1085,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
23947  #endif /* CONFIG_DEBUG_VM */
23948  
23949  /*
23950 - * Frees a number of pages from the PCP lists
23951 + * Frees a number of pages which have been collected from the pcp lists.
23952   * Assumes all pages on list are in same zone, and of same order.
23953   * count is the number of pages to free.
23954   *
23955 @@ -1083,19 +1096,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
23956   * pinned" detection logic.
23957   */
23958  static void free_pcppages_bulk(struct zone *zone, int count,
23959 -                                       struct per_cpu_pages *pcp)
23960 +                              struct list_head *list)
23961  {
23962 -       int migratetype = 0;
23963 -       int batch_free = 0;
23964         unsigned long nr_scanned;
23965         bool isolated_pageblocks;
23966 +       unsigned long flags;
23967 +
23968 +       spin_lock_irqsave(&zone->lock, flags);
23969  
23970 -       spin_lock(&zone->lock);
23971         isolated_pageblocks = has_isolate_pageblock(zone);
23972         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
23973         if (nr_scanned)
23974                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
23975  
23976 +       while (!list_empty(list)) {
23977 +               struct page *page;
23978 +               int mt; /* migratetype of the to-be-freed page */
23979 +
23980 +               page = list_first_entry(list, struct page, lru);
23981 +               /* must delete as __free_one_page list manipulates */
23982 +               list_del(&page->lru);
23983 +
23984 +               mt = get_pcppage_migratetype(page);
23985 +               /* MIGRATE_ISOLATE page should not go to pcplists */
23986 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
23987 +               /* Pageblock could have been isolated meanwhile */
23988 +               if (unlikely(isolated_pageblocks))
23989 +                       mt = get_pageblock_migratetype(page);
23990 +
23991 +               if (bulkfree_pcp_prepare(page))
23992 +                       continue;
23993 +
23994 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
23995 +               trace_mm_page_pcpu_drain(page, 0, mt);
23996 +               count--;
23997 +       }
23998 +       WARN_ON(count != 0);
23999 +       spin_unlock_irqrestore(&zone->lock, flags);
24000 +}
24001 +
24002 +/*
24003 + * Moves a number of pages from the PCP lists to free list which
24004 + * is freed outside of the locked region.
24005 + *
24006 + * Assumes all pages on list are in same zone, and of same order.
24007 + * count is the number of pages to free.
24008 + */
24009 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
24010 +                             struct list_head *dst)
24011 +{
24012 +       int migratetype = 0;
24013 +       int batch_free = 0;
24014 +
24015         while (count) {
24016                 struct page *page;
24017                 struct list_head *list;
24018 @@ -1111,7 +1163,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
24019                         batch_free++;
24020                         if (++migratetype == MIGRATE_PCPTYPES)
24021                                 migratetype = 0;
24022 -                       list = &pcp->lists[migratetype];
24023 +                       list = &src->lists[migratetype];
24024                 } while (list_empty(list));
24025  
24026                 /* This is the only non-empty list. Free them all. */
24027 @@ -1119,27 +1171,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
24028                         batch_free = count;
24029  
24030                 do {
24031 -                       int mt; /* migratetype of the to-be-freed page */
24032 -
24033                         page = list_last_entry(list, struct page, lru);
24034 -                       /* must delete as __free_one_page list manipulates */
24035                         list_del(&page->lru);
24036  
24037 -                       mt = get_pcppage_migratetype(page);
24038 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
24039 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
24040 -                       /* Pageblock could have been isolated meanwhile */
24041 -                       if (unlikely(isolated_pageblocks))
24042 -                               mt = get_pageblock_migratetype(page);
24043 -
24044 -                       if (bulkfree_pcp_prepare(page))
24045 -                               continue;
24046 -
24047 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
24048 -                       trace_mm_page_pcpu_drain(page, 0, mt);
24049 +                       list_add(&page->lru, dst);
24050                 } while (--count && --batch_free && !list_empty(list));
24051         }
24052 -       spin_unlock(&zone->lock);
24053  }
24054  
24055  static void free_one_page(struct zone *zone,
24056 @@ -1148,7 +1185,9 @@ static void free_one_page(struct zone *zone,
24057                                 int migratetype)
24058  {
24059         unsigned long nr_scanned;
24060 -       spin_lock(&zone->lock);
24061 +       unsigned long flags;
24062 +
24063 +       spin_lock_irqsave(&zone->lock, flags);
24064         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
24065         if (nr_scanned)
24066                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
24067 @@ -1158,7 +1197,7 @@ static void free_one_page(struct zone *zone,
24068                 migratetype = get_pfnblock_migratetype(page, pfn);
24069         }
24070         __free_one_page(page, pfn, zone, order, migratetype);
24071 -       spin_unlock(&zone->lock);
24072 +       spin_unlock_irqrestore(&zone->lock, flags);
24073  }
24074  
24075  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
24076 @@ -1244,10 +1283,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
24077                 return;
24078  
24079         migratetype = get_pfnblock_migratetype(page, pfn);
24080 -       local_irq_save(flags);
24081 +       local_lock_irqsave(pa_lock, flags);
24082         __count_vm_events(PGFREE, 1 << order);
24083         free_one_page(page_zone(page), page, pfn, order, migratetype);
24084 -       local_irq_restore(flags);
24085 +       local_unlock_irqrestore(pa_lock, flags);
24086  }
24087  
24088  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
24089 @@ -2246,16 +2285,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
24090  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
24091  {
24092         unsigned long flags;
24093 +       LIST_HEAD(dst);
24094         int to_drain, batch;
24095  
24096 -       local_irq_save(flags);
24097 +       local_lock_irqsave(pa_lock, flags);
24098         batch = READ_ONCE(pcp->batch);
24099         to_drain = min(pcp->count, batch);
24100         if (to_drain > 0) {
24101 -               free_pcppages_bulk(zone, to_drain, pcp);
24102 +               isolate_pcp_pages(to_drain, pcp, &dst);
24103                 pcp->count -= to_drain;
24104         }
24105 -       local_irq_restore(flags);
24106 +       local_unlock_irqrestore(pa_lock, flags);
24107 +       free_pcppages_bulk(zone, to_drain, &dst);
24108  }
24109  #endif
24110  
24111 @@ -2271,16 +2312,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
24112         unsigned long flags;
24113         struct per_cpu_pageset *pset;
24114         struct per_cpu_pages *pcp;
24115 +       LIST_HEAD(dst);
24116 +       int count;
24117  
24118 -       local_irq_save(flags);
24119 +       cpu_lock_irqsave(cpu, flags);
24120         pset = per_cpu_ptr(zone->pageset, cpu);
24121  
24122         pcp = &pset->pcp;
24123 -       if (pcp->count) {
24124 -               free_pcppages_bulk(zone, pcp->count, pcp);
24125 +       count = pcp->count;
24126 +       if (count) {
24127 +               isolate_pcp_pages(count, pcp, &dst);
24128                 pcp->count = 0;
24129         }
24130 -       local_irq_restore(flags);
24131 +       cpu_unlock_irqrestore(cpu, flags);
24132 +       if (count)
24133 +               free_pcppages_bulk(zone, count, &dst);
24134  }
24135  
24136  /*
24137 @@ -2366,8 +2412,17 @@ void drain_all_pages(struct zone *zone)
24138                 else
24139                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
24140         }
24141 +#ifndef CONFIG_PREEMPT_RT_BASE
24142         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
24143                                                                 zone, 1);
24144 +#else
24145 +       for_each_cpu(cpu, &cpus_with_pcps) {
24146 +               if (zone)
24147 +                       drain_pages_zone(cpu, zone);
24148 +               else
24149 +                       drain_pages(cpu);
24150 +       }
24151 +#endif
24152  }
24153  
24154  #ifdef CONFIG_HIBERNATION
24155 @@ -2427,7 +2482,7 @@ void free_hot_cold_page(struct page *page, bool cold)
24156  
24157         migratetype = get_pfnblock_migratetype(page, pfn);
24158         set_pcppage_migratetype(page, migratetype);
24159 -       local_irq_save(flags);
24160 +       local_lock_irqsave(pa_lock, flags);
24161         __count_vm_event(PGFREE);
24162  
24163         /*
24164 @@ -2453,12 +2508,17 @@ void free_hot_cold_page(struct page *page, bool cold)
24165         pcp->count++;
24166         if (pcp->count >= pcp->high) {
24167                 unsigned long batch = READ_ONCE(pcp->batch);
24168 -               free_pcppages_bulk(zone, batch, pcp);
24169 +               LIST_HEAD(dst);
24170 +
24171 +               isolate_pcp_pages(batch, pcp, &dst);
24172                 pcp->count -= batch;
24173 +               local_unlock_irqrestore(pa_lock, flags);
24174 +               free_pcppages_bulk(zone, batch, &dst);
24175 +               return;
24176         }
24177  
24178  out:
24179 -       local_irq_restore(flags);
24180 +       local_unlock_irqrestore(pa_lock, flags);
24181  }
24182  
24183  /*
24184 @@ -2600,7 +2660,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24185                 struct per_cpu_pages *pcp;
24186                 struct list_head *list;
24187  
24188 -               local_irq_save(flags);
24189 +               local_lock_irqsave(pa_lock, flags);
24190                 do {
24191                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
24192                         list = &pcp->lists[migratetype];
24193 @@ -2627,7 +2687,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24194                  * allocate greater than order-1 page units with __GFP_NOFAIL.
24195                  */
24196                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
24197 -               spin_lock_irqsave(&zone->lock, flags);
24198 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
24199  
24200                 do {
24201                         page = NULL;
24202 @@ -2639,22 +2699,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24203                         if (!page)
24204                                 page = __rmqueue(zone, order, migratetype);
24205                 } while (page && check_new_pages(page, order));
24206 -               spin_unlock(&zone->lock);
24207 -               if (!page)
24208 +               if (!page) {
24209 +                       spin_unlock(&zone->lock);
24210                         goto failed;
24211 +               }
24212                 __mod_zone_freepage_state(zone, -(1 << order),
24213                                           get_pcppage_migratetype(page));
24214 +               spin_unlock(&zone->lock);
24215         }
24216  
24217         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
24218         zone_statistics(preferred_zone, zone, gfp_flags);
24219 -       local_irq_restore(flags);
24220 +       local_unlock_irqrestore(pa_lock, flags);
24221  
24222         VM_BUG_ON_PAGE(bad_range(zone, page), page);
24223         return page;
24224  
24225  failed:
24226 -       local_irq_restore(flags);
24227 +       local_unlock_irqrestore(pa_lock, flags);
24228         return NULL;
24229  }
24230  
24231 @@ -6531,7 +6593,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
24232         int cpu = (unsigned long)hcpu;
24233  
24234         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
24235 +               local_lock_irq_on(swapvec_lock, cpu);
24236                 lru_add_drain_cpu(cpu);
24237 +               local_unlock_irq_on(swapvec_lock, cpu);
24238                 drain_pages(cpu);
24239  
24240                 /*
24241 @@ -6557,6 +6621,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
24242  void __init page_alloc_init(void)
24243  {
24244         hotcpu_notifier(page_alloc_cpu_notify, 0);
24245 +       local_irq_lock_init(pa_lock);
24246  }
24247  
24248  /*
24249 @@ -7385,7 +7450,7 @@ void zone_pcp_reset(struct zone *zone)
24250         struct per_cpu_pageset *pset;
24251  
24252         /* avoid races with drain_pages()  */
24253 -       local_irq_save(flags);
24254 +       local_lock_irqsave(pa_lock, flags);
24255         if (zone->pageset != &boot_pageset) {
24256                 for_each_online_cpu(cpu) {
24257                         pset = per_cpu_ptr(zone->pageset, cpu);
24258 @@ -7394,7 +7459,7 @@ void zone_pcp_reset(struct zone *zone)
24259                 free_percpu(zone->pageset);
24260                 zone->pageset = &boot_pageset;
24261         }
24262 -       local_irq_restore(flags);
24263 +       local_unlock_irqrestore(pa_lock, flags);
24264  }
24265  
24266  #ifdef CONFIG_MEMORY_HOTREMOVE
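The page allocator hunks introduce the pa_lock local lock and split the bulk free into two stages so that the locked region stays short: isolate_pcp_pages() only unlinks pages from the per-CPU lists onto a private list, and free_pcppages_bulk() then returns them to the buddy allocator under zone->lock with its own irqsave. The drain_zone_pages() hunk above already shows the complete shape; condensed restatement for reference:

        /* Condensed restatement of the two-stage drain added above. */
        void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        {
                unsigned long flags;
                LIST_HEAD(dst);
                int to_drain, batch;

                local_lock_irqsave(pa_lock, flags);
                batch = READ_ONCE(pcp->batch);
                to_drain = min(pcp->count, batch);
                if (to_drain > 0) {
                        /* stage 1: unlink pages while pa_lock is held */
                        isolate_pcp_pages(to_drain, pcp, &dst);
                        pcp->count -= to_drain;
                }
                local_unlock_irqrestore(pa_lock, flags);
                /* stage 2: hand them to the buddy allocator, preemptible on RT */
                free_pcppages_bulk(zone, to_drain, &dst);
        }
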
24267 diff --git a/mm/percpu.c b/mm/percpu.c
24268 index f014cebbf405..4e739fcf91bf 100644
24269 --- a/mm/percpu.c
24270 +++ b/mm/percpu.c
24271 @@ -1283,6 +1283,31 @@ void free_percpu(void __percpu *ptr)
24272  }
24273  EXPORT_SYMBOL_GPL(free_percpu);
24274  
24275 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
24276 +{
24277 +#ifdef CONFIG_SMP
24278 +       const size_t static_size = __per_cpu_end - __per_cpu_start;
24279 +       void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
24280 +       unsigned int cpu;
24281 +
24282 +       for_each_possible_cpu(cpu) {
24283 +               void *start = per_cpu_ptr(base, cpu);
24284 +               void *va = (void *)addr;
24285 +
24286 +               if (va >= start && va < start + static_size) {
24287 +                       if (can_addr) {
24288 +                               *can_addr = (unsigned long) (va - start);
24289 +                               *can_addr += (unsigned long)
24290 +                                       per_cpu_ptr(base, get_boot_cpu_id());
24291 +                       }
24292 +                       return true;
24293 +               }
24294 +       }
24295 +#endif
24296 +       /* on UP, can't distinguish from other static vars, always false */
24297 +       return false;
24298 +}
24299 +
24300  /**
24301   * is_kernel_percpu_address - test whether address is from static percpu area
24302   * @addr: address to test
24303 @@ -1296,20 +1321,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
24304   */
24305  bool is_kernel_percpu_address(unsigned long addr)
24306  {
24307 -#ifdef CONFIG_SMP
24308 -       const size_t static_size = __per_cpu_end - __per_cpu_start;
24309 -       void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
24310 -       unsigned int cpu;
24311 -
24312 -       for_each_possible_cpu(cpu) {
24313 -               void *start = per_cpu_ptr(base, cpu);
24314 -
24315 -               if ((void *)addr >= start && (void *)addr < start + static_size)
24316 -                       return true;
24317 -        }
24318 -#endif
24319 -       /* on UP, can't distinguish from other static vars, always false */
24320 -       return false;
24321 +       return __is_kernel_percpu_address(addr, NULL);
24322  }
24323  
24324  /**
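is_kernel_percpu_address() is split so that a caller can also learn where inside the static per-CPU area an address falls: __is_kernel_percpu_address() optionally writes back the address translated to the boot CPU's copy through *can_addr. A hedged usage sketch; the caller canonical_lock_addr() is hypothetical and not part of the patch:

        /* Hypothetical caller: map a possibly per-CPU address to a stable key. */
        static void *canonical_lock_addr(void *addr)
        {
                unsigned long can_addr;

                if (__is_kernel_percpu_address((unsigned long)addr, &can_addr))
                        return (void *)can_addr;    /* boot-CPU copy of the variable */
                return addr;                        /* not static per-CPU data */
        }
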
24325 diff --git a/mm/slab.h b/mm/slab.h
24326 index ceb7d70cdb76..dfd281e43fbe 100644
24327 --- a/mm/slab.h
24328 +++ b/mm/slab.h
24329 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
24330   * The slab lists for all objects.
24331   */
24332  struct kmem_cache_node {
24333 +#ifdef CONFIG_SLUB
24334 +       raw_spinlock_t list_lock;
24335 +#else
24336         spinlock_t list_lock;
24337 +#endif
24338  
24339  #ifdef CONFIG_SLAB
24340         struct list_head slabs_partial; /* partial list first, better asm code */
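In mm/slab.h the SLUB node list_lock becomes a raw_spinlock_t, because it is taken from contexts that must not sleep even on RT; every spin_lock*() on it in the mm/slub.c hunks below is switched to the matching raw_ variant. A minimal sketch of the idiom, using a hypothetical my_node structure:

        /* Hypothetical node whose lock must stay a real spinning lock on RT. */
        struct my_node {
                raw_spinlock_t   lock;     /* raw_: never turned into a sleeping lock */
                struct list_head partial;
        };

        static void my_node_add(struct my_node *n, struct list_head *entry)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&n->lock, flags);
                list_add(entry, &n->partial);
                raw_spin_unlock_irqrestore(&n->lock, flags);
        }
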
24341 diff --git a/mm/slub.c b/mm/slub.c
24342 index 58c7526f8de2..6d72b7f87129 100644
24343 --- a/mm/slub.c
24344 +++ b/mm/slub.c
24345 @@ -1141,7 +1141,7 @@ static noinline int free_debug_processing(
24346         unsigned long uninitialized_var(flags);
24347         int ret = 0;
24348  
24349 -       spin_lock_irqsave(&n->list_lock, flags);
24350 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24351         slab_lock(page);
24352  
24353         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
24354 @@ -1176,7 +1176,7 @@ static noinline int free_debug_processing(
24355                          bulk_cnt, cnt);
24356  
24357         slab_unlock(page);
24358 -       spin_unlock_irqrestore(&n->list_lock, flags);
24359 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24360         if (!ret)
24361                 slab_fix(s, "Object at 0x%p not freed", object);
24362         return ret;
24363 @@ -1304,6 +1304,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
24364  
24365  #endif /* CONFIG_SLUB_DEBUG */
24366  
24367 +struct slub_free_list {
24368 +       raw_spinlock_t          lock;
24369 +       struct list_head        list;
24370 +};
24371 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
24372 +
24373  /*
24374   * Hooks for other subsystems that check memory allocations. In a typical
24375   * production configuration these hooks all should produce no code at all.
24376 @@ -1527,10 +1533,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
24377         void *start, *p;
24378         int idx, order;
24379         bool shuffle;
24380 +       bool enableirqs = false;
24381  
24382         flags &= gfp_allowed_mask;
24383  
24384         if (gfpflags_allow_blocking(flags))
24385 +               enableirqs = true;
24386 +#ifdef CONFIG_PREEMPT_RT_FULL
24387 +       if (system_state == SYSTEM_RUNNING)
24388 +               enableirqs = true;
24389 +#endif
24390 +       if (enableirqs)
24391                 local_irq_enable();
24392  
24393         flags |= s->allocflags;
24394 @@ -1605,7 +1618,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
24395         page->frozen = 1;
24396  
24397  out:
24398 -       if (gfpflags_allow_blocking(flags))
24399 +       if (enableirqs)
24400                 local_irq_disable();
24401         if (!page)
24402                 return NULL;
24403 @@ -1664,6 +1677,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
24404         __free_pages(page, order);
24405  }
24406  
24407 +static void free_delayed(struct list_head *h)
24408 +{
24409 +       while(!list_empty(h)) {
24410 +               struct page *page = list_first_entry(h, struct page, lru);
24411 +
24412 +               list_del(&page->lru);
24413 +               __free_slab(page->slab_cache, page);
24414 +       }
24415 +}
24416 +
24417  #define need_reserve_slab_rcu                                          \
24418         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
24419  
24420 @@ -1695,6 +1718,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
24421                 }
24422  
24423                 call_rcu(head, rcu_free_slab);
24424 +       } else if (irqs_disabled()) {
24425 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
24426 +
24427 +               raw_spin_lock(&f->lock);
24428 +               list_add(&page->lru, &f->list);
24429 +               raw_spin_unlock(&f->lock);
24430         } else
24431                 __free_slab(s, page);
24432  }
24433 @@ -1802,7 +1831,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
24434         if (!n || !n->nr_partial)
24435                 return NULL;
24436  
24437 -       spin_lock(&n->list_lock);
24438 +       raw_spin_lock(&n->list_lock);
24439         list_for_each_entry_safe(page, page2, &n->partial, lru) {
24440                 void *t;
24441  
24442 @@ -1827,7 +1856,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
24443                         break;
24444  
24445         }
24446 -       spin_unlock(&n->list_lock);
24447 +       raw_spin_unlock(&n->list_lock);
24448         return object;
24449  }
24450  
24451 @@ -2073,7 +2102,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
24452                          * that acquire_slab() will see a slab page that
24453                          * is frozen
24454                          */
24455 -                       spin_lock(&n->list_lock);
24456 +                       raw_spin_lock(&n->list_lock);
24457                 }
24458         } else {
24459                 m = M_FULL;
24460 @@ -2084,7 +2113,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
24461                          * slabs from diagnostic functions will not see
24462                          * any frozen slabs.
24463                          */
24464 -                       spin_lock(&n->list_lock);
24465 +                       raw_spin_lock(&n->list_lock);
24466                 }
24467         }
24468  
24469 @@ -2119,7 +2148,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
24470                 goto redo;
24471  
24472         if (lock)
24473 -               spin_unlock(&n->list_lock);
24474 +               raw_spin_unlock(&n->list_lock);
24475  
24476         if (m == M_FREE) {
24477                 stat(s, DEACTIVATE_EMPTY);
24478 @@ -2151,10 +2180,10 @@ static void unfreeze_partials(struct kmem_cache *s,
24479                 n2 = get_node(s, page_to_nid(page));
24480                 if (n != n2) {
24481                         if (n)
24482 -                               spin_unlock(&n->list_lock);
24483 +                               raw_spin_unlock(&n->list_lock);
24484  
24485                         n = n2;
24486 -                       spin_lock(&n->list_lock);
24487 +                       raw_spin_lock(&n->list_lock);
24488                 }
24489  
24490                 do {
24491 @@ -2183,7 +2212,7 @@ static void unfreeze_partials(struct kmem_cache *s,
24492         }
24493  
24494         if (n)
24495 -               spin_unlock(&n->list_lock);
24496 +               raw_spin_unlock(&n->list_lock);
24497  
24498         while (discard_page) {
24499                 page = discard_page;
24500 @@ -2222,14 +2251,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
24501                         pobjects = oldpage->pobjects;
24502                         pages = oldpage->pages;
24503                         if (drain && pobjects > s->cpu_partial) {
24504 +                               struct slub_free_list *f;
24505                                 unsigned long flags;
24506 +                               LIST_HEAD(tofree);
24507                                 /*
24508                                  * partial array is full. Move the existing
24509                                  * set to the per node partial list.
24510                                  */
24511                                 local_irq_save(flags);
24512                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
24513 +                               f = this_cpu_ptr(&slub_free_list);
24514 +                               raw_spin_lock(&f->lock);
24515 +                               list_splice_init(&f->list, &tofree);
24516 +                               raw_spin_unlock(&f->lock);
24517                                 local_irq_restore(flags);
24518 +                               free_delayed(&tofree);
24519                                 oldpage = NULL;
24520                                 pobjects = 0;
24521                                 pages = 0;
24522 @@ -2301,7 +2337,22 @@ static bool has_cpu_slab(int cpu, void *info)
24523  
24524  static void flush_all(struct kmem_cache *s)
24525  {
24526 +       LIST_HEAD(tofree);
24527 +       int cpu;
24528 +
24529         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
24530 +       for_each_online_cpu(cpu) {
24531 +               struct slub_free_list *f;
24532 +
24533 +               if (!has_cpu_slab(cpu, s))
24534 +                       continue;
24535 +
24536 +               f = &per_cpu(slub_free_list, cpu);
24537 +               raw_spin_lock_irq(&f->lock);
24538 +               list_splice_init(&f->list, &tofree);
24539 +               raw_spin_unlock_irq(&f->lock);
24540 +               free_delayed(&tofree);
24541 +       }
24542  }
24543  
24544  /*
24545 @@ -2356,10 +2407,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
24546         unsigned long x = 0;
24547         struct page *page;
24548  
24549 -       spin_lock_irqsave(&n->list_lock, flags);
24550 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24551         list_for_each_entry(page, &n->partial, lru)
24552                 x += get_count(page);
24553 -       spin_unlock_irqrestore(&n->list_lock, flags);
24554 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24555         return x;
24556  }
24557  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
24558 @@ -2497,8 +2548,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
24559   * already disabled (which is the case for bulk allocation).
24560   */
24561  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24562 -                         unsigned long addr, struct kmem_cache_cpu *c)
24563 +                         unsigned long addr, struct kmem_cache_cpu *c,
24564 +                         struct list_head *to_free)
24565  {
24566 +       struct slub_free_list *f;
24567         void *freelist;
24568         struct page *page;
24569  
24570 @@ -2558,6 +2611,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24571         VM_BUG_ON(!c->page->frozen);
24572         c->freelist = get_freepointer(s, freelist);
24573         c->tid = next_tid(c->tid);
24574 +
24575 +out:
24576 +       f = this_cpu_ptr(&slub_free_list);
24577 +       raw_spin_lock(&f->lock);
24578 +       list_splice_init(&f->list, to_free);
24579 +       raw_spin_unlock(&f->lock);
24580 +
24581         return freelist;
24582  
24583  new_slab:
24584 @@ -2589,7 +2649,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24585         deactivate_slab(s, page, get_freepointer(s, freelist));
24586         c->page = NULL;
24587         c->freelist = NULL;
24588 -       return freelist;
24589 +       goto out;
24590  }
24591  
24592  /*
24593 @@ -2601,6 +2661,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24594  {
24595         void *p;
24596         unsigned long flags;
24597 +       LIST_HEAD(tofree);
24598  
24599         local_irq_save(flags);
24600  #ifdef CONFIG_PREEMPT
24601 @@ -2612,8 +2673,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24602         c = this_cpu_ptr(s->cpu_slab);
24603  #endif
24604  
24605 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
24606 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
24607         local_irq_restore(flags);
24608 +       free_delayed(&tofree);
24609         return p;
24610  }
24611  
24612 @@ -2799,7 +2861,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24613  
24614         do {
24615                 if (unlikely(n)) {
24616 -                       spin_unlock_irqrestore(&n->list_lock, flags);
24617 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24618                         n = NULL;
24619                 }
24620                 prior = page->freelist;
24621 @@ -2831,7 +2893,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24622                                  * Otherwise the list_lock will synchronize with
24623                                  * other processors updating the list of slabs.
24624                                  */
24625 -                               spin_lock_irqsave(&n->list_lock, flags);
24626 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
24627  
24628                         }
24629                 }
24630 @@ -2873,7 +2935,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24631                 add_partial(n, page, DEACTIVATE_TO_TAIL);
24632                 stat(s, FREE_ADD_PARTIAL);
24633         }
24634 -       spin_unlock_irqrestore(&n->list_lock, flags);
24635 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24636         return;
24637  
24638  slab_empty:
24639 @@ -2888,7 +2950,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24640                 remove_full(s, n, page);
24641         }
24642  
24643 -       spin_unlock_irqrestore(&n->list_lock, flags);
24644 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24645         stat(s, FREE_SLAB);
24646         discard_slab(s, page);
24647  }
24648 @@ -3093,6 +3155,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24649                           void **p)
24650  {
24651         struct kmem_cache_cpu *c;
24652 +       LIST_HEAD(to_free);
24653         int i;
24654  
24655         /* memcg and kmem_cache debug support */
24656 @@ -3116,7 +3179,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24657                          * of re-populating per CPU c->freelist
24658                          */
24659                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
24660 -                                           _RET_IP_, c);
24661 +                                           _RET_IP_, c, &to_free);
24662                         if (unlikely(!p[i]))
24663                                 goto error;
24664  
24665 @@ -3128,6 +3191,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24666         }
24667         c->tid = next_tid(c->tid);
24668         local_irq_enable();
24669 +       free_delayed(&to_free);
24670  
24671         /* Clear memory outside IRQ disabled fastpath loop */
24672         if (unlikely(flags & __GFP_ZERO)) {
24673 @@ -3275,7 +3339,7 @@ static void
24674  init_kmem_cache_node(struct kmem_cache_node *n)
24675  {
24676         n->nr_partial = 0;
24677 -       spin_lock_init(&n->list_lock);
24678 +       raw_spin_lock_init(&n->list_lock);
24679         INIT_LIST_HEAD(&n->partial);
24680  #ifdef CONFIG_SLUB_DEBUG
24681         atomic_long_set(&n->nr_slabs, 0);
24682 @@ -3619,6 +3683,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
24683                                                         const char *text)
24684  {
24685  #ifdef CONFIG_SLUB_DEBUG
24686 +#ifdef CONFIG_PREEMPT_RT_BASE
24687 +       /* XXX move out of irq-off section */
24688 +       slab_err(s, page, text, s->name);
24689 +#else
24690         void *addr = page_address(page);
24691         void *p;
24692         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
24693 @@ -3639,6 +3707,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
24694         slab_unlock(page);
24695         kfree(map);
24696  #endif
24697 +#endif
24698  }
24699  
24700  /*
24701 @@ -3652,7 +3721,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
24702         struct page *page, *h;
24703  
24704         BUG_ON(irqs_disabled());
24705 -       spin_lock_irq(&n->list_lock);
24706 +       raw_spin_lock_irq(&n->list_lock);
24707         list_for_each_entry_safe(page, h, &n->partial, lru) {
24708                 if (!page->inuse) {
24709                         remove_partial(n, page);
24710 @@ -3662,7 +3731,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
24711                         "Objects remaining in %s on __kmem_cache_shutdown()");
24712                 }
24713         }
24714 -       spin_unlock_irq(&n->list_lock);
24715 +       raw_spin_unlock_irq(&n->list_lock);
24716  
24717         list_for_each_entry_safe(page, h, &discard, lru)
24718                 discard_slab(s, page);
24719 @@ -3905,7 +3974,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
24720                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
24721                         INIT_LIST_HEAD(promote + i);
24722  
24723 -               spin_lock_irqsave(&n->list_lock, flags);
24724 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24725  
24726                 /*
24727                  * Build lists of slabs to discard or promote.
24728 @@ -3936,7 +4005,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
24729                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
24730                         list_splice(promote + i, &n->partial);
24731  
24732 -               spin_unlock_irqrestore(&n->list_lock, flags);
24733 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24734  
24735                 /* Release empty slabs */
24736                 list_for_each_entry_safe(page, t, &discard, lru)
24737 @@ -4112,6 +4181,12 @@ void __init kmem_cache_init(void)
24738  {
24739         static __initdata struct kmem_cache boot_kmem_cache,
24740                 boot_kmem_cache_node;
24741 +       int cpu;
24742 +
24743 +       for_each_possible_cpu(cpu) {
24744 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
24745 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
24746 +       }
24747  
24748         if (debug_guardpage_minorder())
24749                 slub_max_order = 0;
24750 @@ -4320,7 +4395,7 @@ static int validate_slab_node(struct kmem_cache *s,
24751         struct page *page;
24752         unsigned long flags;
24753  
24754 -       spin_lock_irqsave(&n->list_lock, flags);
24755 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24756  
24757         list_for_each_entry(page, &n->partial, lru) {
24758                 validate_slab_slab(s, page, map);
24759 @@ -4342,7 +4417,7 @@ static int validate_slab_node(struct kmem_cache *s,
24760                        s->name, count, atomic_long_read(&n->nr_slabs));
24761  
24762  out:
24763 -       spin_unlock_irqrestore(&n->list_lock, flags);
24764 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24765         return count;
24766  }
24767  
24768 @@ -4530,12 +4605,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
24769                 if (!atomic_long_read(&n->nr_slabs))
24770                         continue;
24771  
24772 -               spin_lock_irqsave(&n->list_lock, flags);
24773 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24774                 list_for_each_entry(page, &n->partial, lru)
24775                         process_slab(&t, s, page, alloc, map);
24776                 list_for_each_entry(page, &n->full, lru)
24777                         process_slab(&t, s, page, alloc, map);
24778 -               spin_unlock_irqrestore(&n->list_lock, flags);
24779 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24780         }
24781  
24782         for (i = 0; i < t.count; i++) {
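Because __free_slab() may sleep on RT, the slub hunks never free slab pages directly while interrupts are disabled: free_slab() parks the page on the per-CPU slub_free_list, and free_delayed() releases the parked pages once interrupts are enabled again (see the splice-and-drain added to flush_all(), put_cpu_partial() and __slab_alloc() above). A condensed sketch of the defer-and-drain idea; defer_or_free() and drain_deferred() are hypothetical names for the logic shown above:

        /* Park the page if we cannot sleep, otherwise free it right away. */
        static void defer_or_free(struct kmem_cache *s, struct page *page)
        {
                if (irqs_disabled()) {
                        struct slub_free_list *f = this_cpu_ptr(&slub_free_list);

                        raw_spin_lock(&f->lock);
                        list_add(&page->lru, &f->list);
                        raw_spin_unlock(&f->lock);
                } else {
                        __free_slab(s, page);      /* may sleep on RT */
                }
        }

        /* Later, with interrupts enabled, drain what was parked on a CPU. */
        static void drain_deferred(int cpu)
        {
                struct slub_free_list *f = &per_cpu(slub_free_list, cpu);
                LIST_HEAD(tofree);

                raw_spin_lock_irq(&f->lock);
                list_splice_init(&f->list, &tofree);
                raw_spin_unlock_irq(&f->lock);
                free_delayed(&tofree);
        }
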
24783 diff --git a/mm/swap.c b/mm/swap.c
24784 index 4dcf852e1e6d..69c3a5b24060 100644
24785 --- a/mm/swap.c
24786 +++ b/mm/swap.c
24787 @@ -32,6 +32,7 @@
24788  #include <linux/memcontrol.h>
24789  #include <linux/gfp.h>
24790  #include <linux/uio.h>
24791 +#include <linux/locallock.h>
24792  #include <linux/hugetlb.h>
24793  #include <linux/page_idle.h>
24794  
24795 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
24796  #ifdef CONFIG_SMP
24797  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
24798  #endif
24799 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
24800 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
24801  
24802  /*
24803   * This path almost never happens for VM activity - pages are normally
24804 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
24805                 unsigned long flags;
24806  
24807                 get_page(page);
24808 -               local_irq_save(flags);
24809 +               local_lock_irqsave(rotate_lock, flags);
24810                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
24811                 if (!pagevec_add(pvec, page) || PageCompound(page))
24812                         pagevec_move_tail(pvec);
24813 -               local_irq_restore(flags);
24814 +               local_unlock_irqrestore(rotate_lock, flags);
24815         }
24816  }
24817  
24818 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
24819  {
24820         page = compound_head(page);
24821         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
24822 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
24823 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24824 +                                                      activate_page_pvecs);
24825  
24826                 get_page(page);
24827                 if (!pagevec_add(pvec, page) || PageCompound(page))
24828                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
24829 -               put_cpu_var(activate_page_pvecs);
24830 +               put_locked_var(swapvec_lock, activate_page_pvecs);
24831         }
24832  }
24833  
24834 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
24835  
24836  static void __lru_cache_activate_page(struct page *page)
24837  {
24838 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24839 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24840         int i;
24841  
24842         /*
24843 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
24844                 }
24845         }
24846  
24847 -       put_cpu_var(lru_add_pvec);
24848 +       put_locked_var(swapvec_lock, lru_add_pvec);
24849  }
24850  
24851  /*
24852 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
24853  
24854  static void __lru_cache_add(struct page *page)
24855  {
24856 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24857 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24858  
24859         get_page(page);
24860         if (!pagevec_add(pvec, page) || PageCompound(page))
24861                 __pagevec_lru_add(pvec);
24862 -       put_cpu_var(lru_add_pvec);
24863 +       put_locked_var(swapvec_lock, lru_add_pvec);
24864  }
24865  
24866  /**
24867 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
24868                 unsigned long flags;
24869  
24870                 /* No harm done if a racing interrupt already did this */
24871 -               local_irq_save(flags);
24872 +#ifdef CONFIG_PREEMPT_RT_BASE
24873 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
24874                 pagevec_move_tail(pvec);
24875 -               local_irq_restore(flags);
24876 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
24877 +#else
24878 +               local_lock_irqsave(rotate_lock, flags);
24879 +               pagevec_move_tail(pvec);
24880 +               local_unlock_irqrestore(rotate_lock, flags);
24881 +#endif
24882         }
24883  
24884         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
24885 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
24886                 return;
24887  
24888         if (likely(get_page_unless_zero(page))) {
24889 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
24890 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24891 +                                                      lru_deactivate_file_pvecs);
24892  
24893                 if (!pagevec_add(pvec, page) || PageCompound(page))
24894                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
24895 -               put_cpu_var(lru_deactivate_file_pvecs);
24896 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
24897         }
24898  }
24899  
24900 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
24901  void deactivate_page(struct page *page)
24902  {
24903         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
24904 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
24905 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24906 +                                                      lru_deactivate_pvecs);
24907  
24908                 get_page(page);
24909                 if (!pagevec_add(pvec, page) || PageCompound(page))
24910                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
24911 -               put_cpu_var(lru_deactivate_pvecs);
24912 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
24913         }
24914  }
24915  
24916  void lru_add_drain(void)
24917  {
24918 -       lru_add_drain_cpu(get_cpu());
24919 -       put_cpu();
24920 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
24921 +       local_unlock_cpu(swapvec_lock);
24922  }
24923  
24924 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
24925 +#ifdef CONFIG_PREEMPT_RT_BASE
24926 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24927  {
24928 -       lru_add_drain();
24929 +       local_lock_on(swapvec_lock, cpu);
24930 +       lru_add_drain_cpu(cpu);
24931 +       local_unlock_on(swapvec_lock, cpu);
24932  }
24933  
24934 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24935 +#else
24936  
24937  /*
24938   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
24939 @@ -686,6 +701,22 @@ static int __init lru_init(void)
24940  }
24941  early_initcall(lru_init);
24942  
24943 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
24944 +{
24945 +       lru_add_drain();
24946 +}
24947 +
24948 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24949 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24950 +{
24951 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24952 +
24953 +       INIT_WORK(work, lru_add_drain_per_cpu);
24954 +       queue_work_on(cpu, lru_add_drain_wq, work);
24955 +       cpumask_set_cpu(cpu, has_work);
24956 +}
24957 +#endif
24958 +
24959  void lru_add_drain_all(void)
24960  {
24961         static DEFINE_MUTEX(lock);
24962 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
24963         cpumask_clear(&has_work);
24964  
24965         for_each_online_cpu(cpu) {
24966 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24967 -
24968                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
24969                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
24970                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
24971                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
24972 -                   need_activate_page_drain(cpu)) {
24973 -                       INIT_WORK(work, lru_add_drain_per_cpu);
24974 -                       queue_work_on(cpu, lru_add_drain_wq, work);
24975 -                       cpumask_set_cpu(cpu, &has_work);
24976 -               }
24977 +                   need_activate_page_drain(cpu))
24978 +                       remote_lru_add_drain(cpu, &has_work);
24979         }
24980  
24981 +#ifndef CONFIG_PREEMPT_RT_BASE
24982         for_each_cpu(cpu, &has_work)
24983                 flush_work(&per_cpu(lru_add_drain_work, cpu));
24984 +#endif
24985  
24986         put_online_cpus();
24987         mutex_unlock(&lock);
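The pagevec paths replace get_cpu_var()/put_cpu_var() with get_locked_var()/put_locked_var() on swapvec_lock, so the per-CPU pagevec is protected by a local lock instead of by disabling preemption, and on RT remote CPUs are drained in place with local_lock_on() rather than through per-CPU work items. A minimal sketch mirroring __lru_cache_add() above, with a hypothetical per-CPU pagevec my_pvec:

        /* Hypothetical pagevec user following the get_locked_var() pattern. */
        static DEFINE_PER_CPU(struct pagevec, my_pvec);

        static void my_cache_add(struct page *page)
        {
                struct pagevec *pvec = &get_locked_var(swapvec_lock, my_pvec);

                get_page(page);
                if (!pagevec_add(pvec, page) || PageCompound(page))
                        __pagevec_lru_add(pvec);
                put_locked_var(swapvec_lock, my_pvec);
        }
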
24988 diff --git a/mm/truncate.c b/mm/truncate.c
24989 index 8d8c62d89e6d..5bf1bd25d077 100644
24990 --- a/mm/truncate.c
24991 +++ b/mm/truncate.c
24992 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
24993          * protected by mapping->tree_lock.
24994          */
24995         if (!workingset_node_shadows(node) &&
24996 -           !list_empty(&node->private_list))
24997 -               list_lru_del(&workingset_shadow_nodes,
24998 +           !list_empty(&node->private_list)) {
24999 +               local_lock(workingset_shadow_lock);
25000 +               list_lru_del(&__workingset_shadow_nodes,
25001                                 &node->private_list);
25002 +               local_unlock(workingset_shadow_lock);
25003 +       }
25004         __radix_tree_delete_node(&mapping->page_tree, node);
25005  unlock:
25006         spin_unlock_irq(&mapping->tree_lock);
25007 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
25008 index f2481cb4e6b2..db4de08fa97c 100644
25009 --- a/mm/vmalloc.c
25010 +++ b/mm/vmalloc.c
25011 @@ -845,7 +845,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
25012         struct vmap_block *vb;
25013         struct vmap_area *va;
25014         unsigned long vb_idx;
25015 -       int node, err;
25016 +       int node, err, cpu;
25017         void *vaddr;
25018  
25019         node = numa_node_id();
25020 @@ -888,11 +888,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
25021         BUG_ON(err);
25022         radix_tree_preload_end();
25023  
25024 -       vbq = &get_cpu_var(vmap_block_queue);
25025 +       cpu = get_cpu_light();
25026 +       vbq = this_cpu_ptr(&vmap_block_queue);
25027         spin_lock(&vbq->lock);
25028         list_add_tail_rcu(&vb->free_list, &vbq->free);
25029         spin_unlock(&vbq->lock);
25030 -       put_cpu_var(vmap_block_queue);
25031 +       put_cpu_light();
25032  
25033         return vaddr;
25034  }
25035 @@ -961,6 +962,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25036         struct vmap_block *vb;
25037         void *vaddr = NULL;
25038         unsigned int order;
25039 +       int cpu;
25040  
25041         BUG_ON(offset_in_page(size));
25042         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
25043 @@ -975,7 +977,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25044         order = get_order(size);
25045  
25046         rcu_read_lock();
25047 -       vbq = &get_cpu_var(vmap_block_queue);
25048 +       cpu = get_cpu_light();
25049 +       vbq = this_cpu_ptr(&vmap_block_queue);
25050         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
25051                 unsigned long pages_off;
25052  
25053 @@ -998,7 +1001,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25054                 break;
25055         }
25056  
25057 -       put_cpu_var(vmap_block_queue);
25058 +       put_cpu_light();
25059         rcu_read_unlock();
25060  
25061         /* Allocate new block if nothing was found */
25062 diff --git a/mm/vmstat.c b/mm/vmstat.c
25063 index 604f26a4f696..312006d2db50 100644
25064 --- a/mm/vmstat.c
25065 +++ b/mm/vmstat.c
25066 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
25067         long x;
25068         long t;
25069  
25070 +       preempt_disable_rt();
25071         x = delta + __this_cpu_read(*p);
25072  
25073         t = __this_cpu_read(pcp->stat_threshold);
25074 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
25075                 x = 0;
25076         }
25077         __this_cpu_write(*p, x);
25078 +       preempt_enable_rt();
25079  }
25080  EXPORT_SYMBOL(__mod_zone_page_state);
25081  
25082 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
25083         long x;
25084         long t;
25085  
25086 +       preempt_disable_rt();
25087         x = delta + __this_cpu_read(*p);
25088  
25089         t = __this_cpu_read(pcp->stat_threshold);
25090 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
25091                 x = 0;
25092         }
25093         __this_cpu_write(*p, x);
25094 +       preempt_enable_rt();
25095  }
25096  EXPORT_SYMBOL(__mod_node_page_state);
25097  
25098 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
25099         s8 __percpu *p = pcp->vm_stat_diff + item;
25100         s8 v, t;
25101  
25102 +       preempt_disable_rt();
25103         v = __this_cpu_inc_return(*p);
25104         t = __this_cpu_read(pcp->stat_threshold);
25105         if (unlikely(v > t)) {
25106 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
25107                 zone_page_state_add(v + overstep, zone, item);
25108                 __this_cpu_write(*p, -overstep);
25109         }
25110 +       preempt_enable_rt();
25111  }
25112  
25113  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25114 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25115         s8 __percpu *p = pcp->vm_node_stat_diff + item;
25116         s8 v, t;
25117  
25118 +       preempt_disable_rt();
25119         v = __this_cpu_inc_return(*p);
25120         t = __this_cpu_read(pcp->stat_threshold);
25121         if (unlikely(v > t)) {
25122 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25123                 node_page_state_add(v + overstep, pgdat, item);
25124                 __this_cpu_write(*p, -overstep);
25125         }
25126 +       preempt_enable_rt();
25127  }
25128  
25129  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
25130 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
25131         s8 __percpu *p = pcp->vm_stat_diff + item;
25132         s8 v, t;
25133  
25134 +       preempt_disable_rt();
25135         v = __this_cpu_dec_return(*p);
25136         t = __this_cpu_read(pcp->stat_threshold);
25137         if (unlikely(v < - t)) {
25138 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
25139                 zone_page_state_add(v - overstep, zone, item);
25140                 __this_cpu_write(*p, overstep);
25141         }
25142 +       preempt_enable_rt();
25143  }
25144  
25145  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25146 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25147         s8 __percpu *p = pcp->vm_node_stat_diff + item;
25148         s8 v, t;
25149  
25150 +       preempt_disable_rt();
25151         v = __this_cpu_dec_return(*p);
25152         t = __this_cpu_read(pcp->stat_threshold);
25153         if (unlikely(v < - t)) {
25154 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25155                 node_page_state_add(v - overstep, pgdat, item);
25156                 __this_cpu_write(*p, overstep);
25157         }
25158 +       preempt_enable_rt();
25159  }
25160  
25161  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
25162 diff --git a/mm/workingset.c b/mm/workingset.c
25163 index 33f6f4db32fd..f4ff55f4b60e 100644
25164 --- a/mm/workingset.c
25165 +++ b/mm/workingset.c
25166 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
25167   * point where they would still be useful.
25168   */
25169  
25170 -struct list_lru workingset_shadow_nodes;
25171 +struct list_lru __workingset_shadow_nodes;
25172 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
25173  
25174  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
25175                                         struct shrink_control *sc)
25176 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
25177         unsigned long pages;
25178  
25179         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
25180 -       local_irq_disable();
25181 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
25182 -       local_irq_enable();
25183 +       local_lock_irq(workingset_shadow_lock);
25184 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
25185 +       local_unlock_irq(workingset_shadow_lock);
25186  
25187         if (sc->memcg) {
25188                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
25189 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
25190         spin_unlock(&mapping->tree_lock);
25191         ret = LRU_REMOVED_RETRY;
25192  out:
25193 -       local_irq_enable();
25194 +       local_unlock_irq(workingset_shadow_lock);
25195         cond_resched();
25196 -       local_irq_disable();
25197 +       local_lock_irq(workingset_shadow_lock);
25198         spin_lock(lru_lock);
25199         return ret;
25200  }
25201 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
25202         unsigned long ret;
25203  
25204         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
25205 -       local_irq_disable();
25206 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
25207 +       local_lock_irq(workingset_shadow_lock);
25208 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
25209                                     shadow_lru_isolate, NULL);
25210 -       local_irq_enable();
25211 +       local_unlock_irq(workingset_shadow_lock);
25212         return ret;
25213  }
25214  
25215 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
25216         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
25217                timestamp_bits, max_order, bucket_order);
25218  
25219 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
25220 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
25221         if (ret)
25222                 goto err;
25223         ret = register_shrinker(&workingset_shadow_shrinker);
25224 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
25225                 goto err_list_lru;
25226         return 0;
25227  err_list_lru:
25228 -       list_lru_destroy(&workingset_shadow_nodes);
25229 +       list_lru_destroy(&__workingset_shadow_nodes);
25230  err:
25231         return ret;
25232  }
25233 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
25234 index b0bc023d25c5..5af6426fbcbe 100644
25235 --- a/mm/zsmalloc.c
25236 +++ b/mm/zsmalloc.c
25237 @@ -53,6 +53,7 @@
25238  #include <linux/mount.h>
25239  #include <linux/migrate.h>
25240  #include <linux/pagemap.h>
25241 +#include <linux/locallock.h>
25242  
25243  #define ZSPAGE_MAGIC   0x58
25244  
25245 @@ -70,9 +71,22 @@
25246   */
25247  #define ZS_MAX_ZSPAGE_ORDER 2
25248  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
25249 -
25250  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
25251  
25252 +#ifdef CONFIG_PREEMPT_RT_FULL
25253 +
25254 +struct zsmalloc_handle {
25255 +       unsigned long addr;
25256 +       struct mutex lock;
25257 +};
25258 +
25259 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
25260 +
25261 +#else
25262 +
25263 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
25264 +#endif
25265 +
25266  /*
25267   * Object location (<PFN>, <obj_idx>) is encoded as
25268   * as single (unsigned long) handle value.
25269 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
25270  
25271  static int create_cache(struct zs_pool *pool)
25272  {
25273 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
25274 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
25275                                         0, 0, NULL);
25276         if (!pool->handle_cachep)
25277                 return 1;
25278 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
25279  
25280  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
25281  {
25282 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
25283 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
25284 +       void *p;
25285 +
25286 +       p = kmem_cache_alloc(pool->handle_cachep,
25287 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
25288 +#ifdef CONFIG_PREEMPT_RT_FULL
25289 +       if (p) {
25290 +               struct zsmalloc_handle *zh = p;
25291 +
25292 +               mutex_init(&zh->lock);
25293 +       }
25294 +#endif
25295 +       return (unsigned long)p;
25296  }
25297  
25298 +#ifdef CONFIG_PREEMPT_RT_FULL
25299 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
25300 +{
25301 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
25302 +}
25303 +#endif
25304 +
25305  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
25306  {
25307         kmem_cache_free(pool->handle_cachep, (void *)handle);
25308 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
25309  
25310  static void record_obj(unsigned long handle, unsigned long obj)
25311  {
25312 +#ifdef CONFIG_PREEMPT_RT_FULL
25313 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25314 +
25315 +       WRITE_ONCE(zh->addr, obj);
25316 +#else
25317         /*
25318          * lsb of @obj represents handle lock while other bits
25319          * represent object value the handle is pointing so
25320          * updating shouldn't do store tearing.
25321          */
25322         WRITE_ONCE(*(unsigned long *)handle, obj);
25323 +#endif
25324  }
25325  
25326  /* zpool driver */
25327 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
25328  
25329  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
25330  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
25331 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
25332  
25333  static bool is_zspage_isolated(struct zspage *zspage)
25334  {
25335 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
25336  
25337  static unsigned long handle_to_obj(unsigned long handle)
25338  {
25339 +#ifdef CONFIG_PREEMPT_RT_FULL
25340 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25341 +
25342 +       return zh->addr;
25343 +#else
25344         return *(unsigned long *)handle;
25345 +#endif
25346  }
25347  
25348  static unsigned long obj_to_head(struct page *page, void *obj)
25349 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
25350  
25351  static inline int testpin_tag(unsigned long handle)
25352  {
25353 +#ifdef CONFIG_PREEMPT_RT_FULL
25354 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25355 +
25356 +       return mutex_is_locked(&zh->lock);
25357 +#else
25358         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
25359 +#endif
25360  }
25361  
25362  static inline int trypin_tag(unsigned long handle)
25363  {
25364 +#ifdef CONFIG_PREEMPT_RT_FULL
25365 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25366 +
25367 +       return mutex_trylock(&zh->lock);
25368 +#else
25369         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
25370 +#endif
25371  }
25372  
25373  static void pin_tag(unsigned long handle)
25374  {
25375 +#ifdef CONFIG_PREEMPT_RT_FULL
25376 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25377 +
25378 +       return mutex_lock(&zh->lock);
25379 +#else
25380         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
25381 +#endif
25382  }
25383  
25384  static void unpin_tag(unsigned long handle)
25385  {
25386 +#ifdef CONFIG_PREEMPT_RT_FULL
25387 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25388 +
25389 +       return mutex_unlock(&zh->lock);
25390 +#else
25391         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
25392 +#endif
25393  }
25394  
25395  static void reset_page(struct page *page)
25396 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
25397         class = pool->size_class[class_idx];
25398         off = (class->size * obj_idx) & ~PAGE_MASK;
25399  
25400 -       area = &get_cpu_var(zs_map_area);
25401 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
25402         area->vm_mm = mm;
25403         if (off + class->size <= PAGE_SIZE) {
25404                 /* this object is contained entirely within a page */
25405 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
25406  
25407                 __zs_unmap_object(area, pages, off, class->size);
25408         }
25409 -       put_cpu_var(zs_map_area);
25410 +       put_locked_var(zs_map_area_lock, zs_map_area);
25411  
25412         migrate_read_unlock(zspage);
25413         unpin_tag(handle);
25414 diff --git a/net/core/dev.c b/net/core/dev.c
25415 index 2e04fd188081..3ba60ef8c79e 100644
25416 --- a/net/core/dev.c
25417 +++ b/net/core/dev.c
25418 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
25419  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
25420  
25421  static seqcount_t devnet_rename_seq;
25422 +static DEFINE_MUTEX(devnet_rename_mutex);
25423  
25424  static inline void dev_base_seq_inc(struct net *net)
25425  {
25426 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
25427  static inline void rps_lock(struct softnet_data *sd)
25428  {
25429  #ifdef CONFIG_RPS
25430 -       spin_lock(&sd->input_pkt_queue.lock);
25431 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
25432  #endif
25433  }
25434  
25435  static inline void rps_unlock(struct softnet_data *sd)
25436  {
25437  #ifdef CONFIG_RPS
25438 -       spin_unlock(&sd->input_pkt_queue.lock);
25439 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
25440  #endif
25441  }
25442  
25443 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
25444         strcpy(name, dev->name);
25445         rcu_read_unlock();
25446         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
25447 -               cond_resched();
25448 +               mutex_lock(&devnet_rename_mutex);
25449 +               mutex_unlock(&devnet_rename_mutex);
25450                 goto retry;
25451         }
25452  
25453 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
25454         if (dev->flags & IFF_UP)
25455                 return -EBUSY;
25456  
25457 -       write_seqcount_begin(&devnet_rename_seq);
25458 +       mutex_lock(&devnet_rename_mutex);
25459 +       __raw_write_seqcount_begin(&devnet_rename_seq);
25460  
25461 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
25462 -               write_seqcount_end(&devnet_rename_seq);
25463 -               return 0;
25464 -       }
25465 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
25466 +               goto outunlock;
25467  
25468         memcpy(oldname, dev->name, IFNAMSIZ);
25469  
25470         err = dev_get_valid_name(net, dev, newname);
25471 -       if (err < 0) {
25472 -               write_seqcount_end(&devnet_rename_seq);
25473 -               return err;
25474 -       }
25475 +       if (err < 0)
25476 +               goto outunlock;
25477  
25478         if (oldname[0] && !strchr(oldname, '%'))
25479                 netdev_info(dev, "renamed from %s\n", oldname);
25480 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
25481         if (ret) {
25482                 memcpy(dev->name, oldname, IFNAMSIZ);
25483                 dev->name_assign_type = old_assign_type;
25484 -               write_seqcount_end(&devnet_rename_seq);
25485 -               return ret;
25486 +               err = ret;
25487 +               goto outunlock;
25488         }
25489  
25490 -       write_seqcount_end(&devnet_rename_seq);
25491 +       __raw_write_seqcount_end(&devnet_rename_seq);
25492 +       mutex_unlock(&devnet_rename_mutex);
25493  
25494         netdev_adjacent_rename_links(dev, oldname);
25495  
25496 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
25497                 /* err >= 0 after dev_alloc_name() or stores the first errno */
25498                 if (err >= 0) {
25499                         err = ret;
25500 -                       write_seqcount_begin(&devnet_rename_seq);
25501 +                       mutex_lock(&devnet_rename_mutex);
25502 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
25503                         memcpy(dev->name, oldname, IFNAMSIZ);
25504                         memcpy(oldname, newname, IFNAMSIZ);
25505                         dev->name_assign_type = old_assign_type;
25506 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
25507         }
25508  
25509         return err;
25510 +
25511 +outunlock:
25512 +       __raw_write_seqcount_end(&devnet_rename_seq);
25513 +       mutex_unlock(&devnet_rename_mutex);
25514 +       return err;
25515  }
25516  
25517  /**
25518 @@ -2285,6 +2291,7 @@ static void __netif_reschedule(struct Qdisc *q)
25519         sd->output_queue_tailp = &q->next_sched;
25520         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25521         local_irq_restore(flags);
25522 +       preempt_check_resched_rt();
25523  }
25524  
25525  void __netif_schedule(struct Qdisc *q)
25526 @@ -2366,6 +2373,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
25527         __this_cpu_write(softnet_data.completion_queue, skb);
25528         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25529         local_irq_restore(flags);
25530 +       preempt_check_resched_rt();
25531  }
25532  EXPORT_SYMBOL(__dev_kfree_skb_irq);
25533  
25534 @@ -3100,7 +3108,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
25535          * This permits qdisc->running owner to get the lock more
25536          * often and dequeue packets faster.
25537          */
25538 +#ifdef CONFIG_PREEMPT_RT_FULL
25539 +       contended = true;
25540 +#else
25541         contended = qdisc_is_running(q);
25542 +#endif
25543         if (unlikely(contended))
25544                 spin_lock(&q->busylock);
25545  
25546 @@ -3163,8 +3175,10 @@ static void skb_update_prio(struct sk_buff *skb)
25547  #define skb_update_prio(skb)
25548  #endif
25549  
25550 +#ifndef CONFIG_PREEMPT_RT_FULL
25551  DEFINE_PER_CPU(int, xmit_recursion);
25552  EXPORT_SYMBOL(xmit_recursion);
25553 +#endif
25554  
25555  /**
25556   *     dev_loopback_xmit - loop back @skb
25557 @@ -3398,8 +3412,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
25558                 int cpu = smp_processor_id(); /* ok because BHs are off */
25559  
25560                 if (txq->xmit_lock_owner != cpu) {
25561 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
25562 -                                    XMIT_RECURSION_LIMIT))
25563 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
25564                                 goto recursion_alert;
25565  
25566                         skb = validate_xmit_skb(skb, dev);
25567 @@ -3409,9 +3422,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
25568                         HARD_TX_LOCK(dev, txq, cpu);
25569  
25570                         if (!netif_xmit_stopped(txq)) {
25571 -                               __this_cpu_inc(xmit_recursion);
25572 +                               xmit_rec_inc();
25573                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
25574 -                               __this_cpu_dec(xmit_recursion);
25575 +                               xmit_rec_dec();
25576                                 if (dev_xmit_complete(rc)) {
25577                                         HARD_TX_UNLOCK(dev, txq);
25578                                         goto out;
25579 @@ -3785,6 +3798,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
25580         rps_unlock(sd);
25581  
25582         local_irq_restore(flags);
25583 +       preempt_check_resched_rt();
25584  
25585         atomic_long_inc(&skb->dev->rx_dropped);
25586         kfree_skb(skb);
25587 @@ -3803,7 +3817,7 @@ static int netif_rx_internal(struct sk_buff *skb)
25588                 struct rps_dev_flow voidflow, *rflow = &voidflow;
25589                 int cpu;
25590  
25591 -               preempt_disable();
25592 +               migrate_disable();
25593                 rcu_read_lock();
25594  
25595                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
25596 @@ -3813,13 +3827,13 @@ static int netif_rx_internal(struct sk_buff *skb)
25597                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
25598  
25599                 rcu_read_unlock();
25600 -               preempt_enable();
25601 +               migrate_enable();
25602         } else
25603  #endif
25604         {
25605                 unsigned int qtail;
25606 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
25607 -               put_cpu();
25608 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
25609 +               put_cpu_light();
25610         }
25611         return ret;
25612  }
25613 @@ -3853,11 +3867,9 @@ int netif_rx_ni(struct sk_buff *skb)
25614  
25615         trace_netif_rx_ni_entry(skb);
25616  
25617 -       preempt_disable();
25618 +       local_bh_disable();
25619         err = netif_rx_internal(skb);
25620 -       if (local_softirq_pending())
25621 -               do_softirq();
25622 -       preempt_enable();
25623 +       local_bh_enable();
25624  
25625         return err;
25626  }
25627 @@ -4336,7 +4348,7 @@ static void flush_backlog(struct work_struct *work)
25628         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
25629                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
25630                         __skb_unlink(skb, &sd->input_pkt_queue);
25631 -                       kfree_skb(skb);
25632 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25633                         input_queue_head_incr(sd);
25634                 }
25635         }
25636 @@ -4346,11 +4358,14 @@ static void flush_backlog(struct work_struct *work)
25637         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
25638                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
25639                         __skb_unlink(skb, &sd->process_queue);
25640 -                       kfree_skb(skb);
25641 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25642                         input_queue_head_incr(sd);
25643                 }
25644         }
25645 +       if (!skb_queue_empty(&sd->tofree_queue))
25646 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
25647         local_bh_enable();
25648 +
25649  }
25650  
25651  static void flush_all_backlogs(void)
25652 @@ -4831,6 +4846,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
25653                 sd->rps_ipi_list = NULL;
25654  
25655                 local_irq_enable();
25656 +               preempt_check_resched_rt();
25657  
25658                 /* Send pending IPI's to kick RPS processing on remote cpus. */
25659                 while (remsd) {
25660 @@ -4844,6 +4860,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
25661         } else
25662  #endif
25663                 local_irq_enable();
25664 +       preempt_check_resched_rt();
25665  }
25666  
25667  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
25668 @@ -4873,7 +4890,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
25669         while (again) {
25670                 struct sk_buff *skb;
25671  
25672 +               local_irq_disable();
25673                 while ((skb = __skb_dequeue(&sd->process_queue))) {
25674 +                       local_irq_enable();
25675                         rcu_read_lock();
25676                         __netif_receive_skb(skb);
25677                         rcu_read_unlock();
25678 @@ -4881,9 +4900,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
25679                         if (++work >= quota)
25680                                 return work;
25681  
25682 +                       local_irq_disable();
25683                 }
25684  
25685 -               local_irq_disable();
25686                 rps_lock(sd);
25687                 if (skb_queue_empty(&sd->input_pkt_queue)) {
25688                         /*
25689 @@ -4921,9 +4940,11 @@ void __napi_schedule(struct napi_struct *n)
25690         local_irq_save(flags);
25691         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
25692         local_irq_restore(flags);
25693 +       preempt_check_resched_rt();
25694  }
25695  EXPORT_SYMBOL(__napi_schedule);
25696  
25697 +#ifndef CONFIG_PREEMPT_RT_FULL
25698  /**
25699   * __napi_schedule_irqoff - schedule for receive
25700   * @n: entry to schedule
25701 @@ -4935,6 +4956,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
25702         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
25703  }
25704  EXPORT_SYMBOL(__napi_schedule_irqoff);
25705 +#endif
25706  
25707  void __napi_complete(struct napi_struct *n)
25708  {
25709 @@ -5224,13 +5246,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
25710         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
25711         unsigned long time_limit = jiffies + 2;
25712         int budget = netdev_budget;
25713 +       struct sk_buff_head tofree_q;
25714 +       struct sk_buff *skb;
25715         LIST_HEAD(list);
25716         LIST_HEAD(repoll);
25717  
25718 +       __skb_queue_head_init(&tofree_q);
25719 +
25720         local_irq_disable();
25721 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
25722         list_splice_init(&sd->poll_list, &list);
25723         local_irq_enable();
25724  
25725 +       while ((skb = __skb_dequeue(&tofree_q)))
25726 +               kfree_skb(skb);
25727 +
25728         for (;;) {
25729                 struct napi_struct *n;
25730  
25731 @@ -5261,7 +5291,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
25732         list_splice_tail(&repoll, &list);
25733         list_splice(&list, &sd->poll_list);
25734         if (!list_empty(&sd->poll_list))
25735 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
25736 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
25737  
25738         net_rps_action_and_irq_enable(sd);
25739  }
25740 @@ -8022,16 +8052,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
25741  
25742         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25743         local_irq_enable();
25744 +       preempt_check_resched_rt();
25745  
25746         /* Process offline CPU's input_pkt_queue */
25747         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
25748                 netif_rx_ni(skb);
25749                 input_queue_head_incr(oldsd);
25750         }
25751 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
25752 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
25753                 netif_rx_ni(skb);
25754                 input_queue_head_incr(oldsd);
25755         }
25756 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
25757 +               kfree_skb(skb);
25758 +       }
25759  
25760         return NOTIFY_OK;
25761  }
25762 @@ -8336,8 +8370,9 @@ static int __init net_dev_init(void)
25763  
25764                 INIT_WORK(flush, flush_backlog);
25765  
25766 -               skb_queue_head_init(&sd->input_pkt_queue);
25767 -               skb_queue_head_init(&sd->process_queue);
25768 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
25769 +               skb_queue_head_init_raw(&sd->process_queue);
25770 +               skb_queue_head_init_raw(&sd->tofree_queue);
25771                 INIT_LIST_HEAD(&sd->poll_list);
25772                 sd->output_queue_tailp = &sd->output_queue;
25773  #ifdef CONFIG_RPS
25774 diff --git a/net/core/filter.c b/net/core/filter.c
25775 index b391209838ef..b86e9681a88e 100644
25776 --- a/net/core/filter.c
25777 +++ b/net/core/filter.c
25778 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
25779  {
25780         int ret;
25781  
25782 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
25783 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
25784                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
25785                 kfree_skb(skb);
25786                 return -ENETDOWN;
25787 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
25788  
25789         skb->dev = dev;
25790  
25791 -       __this_cpu_inc(xmit_recursion);
25792 +       xmit_rec_inc();
25793         ret = dev_queue_xmit(skb);
25794 -       __this_cpu_dec(xmit_recursion);
25795 +       xmit_rec_dec();
25796  
25797         return ret;
25798  }
25799 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
25800 index cad8e791f28e..2a9364fe62a5 100644
25801 --- a/net/core/gen_estimator.c
25802 +++ b/net/core/gen_estimator.c
25803 @@ -84,7 +84,7 @@ struct gen_estimator
25804         struct gnet_stats_basic_packed  *bstats;
25805         struct gnet_stats_rate_est64    *rate_est;
25806         spinlock_t              *stats_lock;
25807 -       seqcount_t              *running;
25808 +       net_seqlock_t           *running;
25809         int                     ewma_log;
25810         u32                     last_packets;
25811         unsigned long           avpps;
25812 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
25813                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25814                       struct gnet_stats_rate_est64 *rate_est,
25815                       spinlock_t *stats_lock,
25816 -                     seqcount_t *running,
25817 +                     net_seqlock_t *running,
25818                       struct nlattr *opt)
25819  {
25820         struct gen_estimator *est;
25821 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
25822                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25823                           struct gnet_stats_rate_est64 *rate_est,
25824                           spinlock_t *stats_lock,
25825 -                         seqcount_t *running, struct nlattr *opt)
25826 +                         net_seqlock_t *running, struct nlattr *opt)
25827  {
25828         gen_kill_estimator(bstats, rate_est);
25829         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
25830 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
25831 index 508e051304fb..bc3b17b78c94 100644
25832 --- a/net/core/gen_stats.c
25833 +++ b/net/core/gen_stats.c
25834 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
25835  }
25836  
25837  void
25838 -__gnet_stats_copy_basic(const seqcount_t *running,
25839 +__gnet_stats_copy_basic(net_seqlock_t *running,
25840                         struct gnet_stats_basic_packed *bstats,
25841                         struct gnet_stats_basic_cpu __percpu *cpu,
25842                         struct gnet_stats_basic_packed *b)
25843 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
25844         }
25845         do {
25846                 if (running)
25847 -                       seq = read_seqcount_begin(running);
25848 +                       seq = net_seq_begin(running);
25849                 bstats->bytes = b->bytes;
25850                 bstats->packets = b->packets;
25851 -       } while (running && read_seqcount_retry(running, seq));
25852 +       } while (running && net_seq_retry(running, seq));
25853  }
25854  EXPORT_SYMBOL(__gnet_stats_copy_basic);
25855  
25856 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
25857   * if the room in the socket buffer was not sufficient.
25858   */
25859  int
25860 -gnet_stats_copy_basic(const seqcount_t *running,
25861 +gnet_stats_copy_basic(net_seqlock_t *running,
25862                       struct gnet_dump *d,
25863                       struct gnet_stats_basic_cpu __percpu *cpu,
25864                       struct gnet_stats_basic_packed *b)
25865 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
25866 index f0f462c0573d..3cf5ed766dcd 100644
25867 --- a/net/core/skbuff.c
25868 +++ b/net/core/skbuff.c
25869 @@ -64,6 +64,7 @@
25870  #include <linux/errqueue.h>
25871  #include <linux/prefetch.h>
25872  #include <linux/if_vlan.h>
25873 +#include <linux/locallock.h>
25874  
25875  #include <net/protocol.h>
25876  #include <net/dst.h>
25877 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
25878  
25879  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
25880  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
25881 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
25882 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
25883  
25884  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25885  {
25886 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25887         unsigned long flags;
25888         void *data;
25889  
25890 -       local_irq_save(flags);
25891 +       local_lock_irqsave(netdev_alloc_lock, flags);
25892         nc = this_cpu_ptr(&netdev_alloc_cache);
25893         data = __alloc_page_frag(nc, fragsz, gfp_mask);
25894 -       local_irq_restore(flags);
25895 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25896         return data;
25897  }
25898  
25899 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
25900  
25901  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25902  {
25903 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25904 +       struct napi_alloc_cache *nc;
25905 +       void *data;
25906  
25907 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25908 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25909 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25910 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25911 +       return data;
25912  }
25913  
25914  void *napi_alloc_frag(unsigned int fragsz)
25915 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
25916         if (sk_memalloc_socks())
25917                 gfp_mask |= __GFP_MEMALLOC;
25918  
25919 -       local_irq_save(flags);
25920 +       local_lock_irqsave(netdev_alloc_lock, flags);
25921  
25922         nc = this_cpu_ptr(&netdev_alloc_cache);
25923         data = __alloc_page_frag(nc, len, gfp_mask);
25924         pfmemalloc = nc->pfmemalloc;
25925  
25926 -       local_irq_restore(flags);
25927 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25928  
25929         if (unlikely(!data))
25930                 return NULL;
25931 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
25932  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25933                                  gfp_t gfp_mask)
25934  {
25935 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25936 +       struct napi_alloc_cache *nc;
25937         struct sk_buff *skb;
25938         void *data;
25939 +       bool pfmemalloc;
25940  
25941         len += NET_SKB_PAD + NET_IP_ALIGN;
25942  
25943 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25944         if (sk_memalloc_socks())
25945                 gfp_mask |= __GFP_MEMALLOC;
25946  
25947 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25948         data = __alloc_page_frag(&nc->page, len, gfp_mask);
25949 +       pfmemalloc = nc->page.pfmemalloc;
25950 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25951         if (unlikely(!data))
25952                 return NULL;
25953  
25954 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25955         }
25956  
25957         /* use OR instead of assignment to avoid clearing of bits in mask */
25958 -       if (nc->page.pfmemalloc)
25959 +       if (pfmemalloc)
25960                 skb->pfmemalloc = 1;
25961         skb->head_frag = 1;
25962  
25963 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
25964  
25965  void __kfree_skb_flush(void)
25966  {
25967 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25968 +       struct napi_alloc_cache *nc;
25969  
25970 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25971         /* flush skb_cache if containing objects */
25972         if (nc->skb_count) {
25973                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
25974                                      nc->skb_cache);
25975                 nc->skb_count = 0;
25976         }
25977 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25978  }
25979  
25980  static inline void _kfree_skb_defer(struct sk_buff *skb)
25981  {
25982 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25983 +       struct napi_alloc_cache *nc;
25984  
25985         /* drop skb->head and call any destructors for packet */
25986         skb_release_all(skb);
25987  
25988 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25989         /* record skb to CPU local list */
25990         nc->skb_cache[nc->skb_count++] = skb;
25991  
25992 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
25993                                      nc->skb_cache);
25994                 nc->skb_count = 0;
25995         }
25996 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25997  }
25998  void __kfree_skb_defer(struct sk_buff *skb)
25999  {
26000 diff --git a/net/core/sock.c b/net/core/sock.c
26001 index 470a2043b846..2b09a5a33d8d 100644
26002 --- a/net/core/sock.c
26003 +++ b/net/core/sock.c
26004 @@ -2499,12 +2499,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
26005         if (sk->sk_lock.owned)
26006                 __lock_sock(sk);
26007         sk->sk_lock.owned = 1;
26008 -       spin_unlock(&sk->sk_lock.slock);
26009 +       spin_unlock_bh(&sk->sk_lock.slock);
26010         /*
26011          * The sk_lock has mutex_lock() semantics here:
26012          */
26013         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
26014 -       local_bh_enable();
26015  }
26016  EXPORT_SYMBOL(lock_sock_nested);
26017  
26018 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
26019 index 48734ee6293f..e6864ff11352 100644
26020 --- a/net/ipv4/icmp.c
26021 +++ b/net/ipv4/icmp.c
26022 @@ -69,6 +69,7 @@
26023  #include <linux/jiffies.h>
26024  #include <linux/kernel.h>
26025  #include <linux/fcntl.h>
26026 +#include <linux/sysrq.h>
26027  #include <linux/socket.h>
26028  #include <linux/in.h>
26029  #include <linux/inet.h>
26030 @@ -77,6 +78,7 @@
26031  #include <linux/string.h>
26032  #include <linux/netfilter_ipv4.h>
26033  #include <linux/slab.h>
26034 +#include <linux/locallock.h>
26035  #include <net/snmp.h>
26036  #include <net/ip.h>
26037  #include <net/route.h>
26038 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
26039   *
26040   *     On SMP we have one ICMP socket per-cpu.
26041   */
26042 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
26043 +
26044  static struct sock *icmp_sk(struct net *net)
26045  {
26046         return *this_cpu_ptr(net->ipv4.icmp_sk);
26047 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
26048  
26049         local_bh_disable();
26050  
26051 +       local_lock(icmp_sk_lock);
26052         sk = icmp_sk(net);
26053  
26054         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
26055                 /* This can happen if the output path signals a
26056                  * dst_link_failure() for an outgoing ICMP packet.
26057                  */
26058 +               local_unlock(icmp_sk_lock);
26059                 local_bh_enable();
26060                 return NULL;
26061         }
26062 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
26063  static inline void icmp_xmit_unlock(struct sock *sk)
26064  {
26065         spin_unlock_bh(&sk->sk_lock.slock);
26066 +       local_unlock(icmp_sk_lock);
26067  }
26068  
26069  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
26070 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
26071         struct sock *sk;
26072         struct sk_buff *skb;
26073  
26074 +       local_lock(icmp_sk_lock);
26075         sk = icmp_sk(dev_net((*rt)->dst.dev));
26076         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
26077                            icmp_param->data_len+icmp_param->head_len,
26078 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
26079                 skb->ip_summed = CHECKSUM_NONE;
26080                 ip_push_pending_frames(sk, fl4);
26081         }
26082 +       local_unlock(icmp_sk_lock);
26083  }
26084  
26085  /*
26086 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
26087  }
26088  
26089  /*
26090 + * 32bit and 64bit have different timestamp length, so we check for
26091 + * the cookie at offset 20 and verify it is repeated at offset 50
26092 + */
26093 +#define CO_POS0                20
26094 +#define CO_POS1                50
26095 +#define CO_SIZE                sizeof(int)
26096 +#define ICMP_SYSRQ_SIZE        57
26097 +
26098 +/*
26099 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
26100 + * pattern and if it matches send the next byte as a trigger to sysrq.
26101 + */
26102 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
26103 +{
26104 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
26105 +       char *p = skb->data;
26106 +
26107 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
26108 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
26109 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
26110 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
26111 +}
26112 +
26113 +/*
26114   *     Handle ICMP_ECHO ("ping") requests.
26115   *
26116   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
26117 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
26118                 icmp_param.data_len        = skb->len;
26119                 icmp_param.head_len        = sizeof(struct icmphdr);
26120                 icmp_reply(&icmp_param, skb);
26121 +
26122 +               if (skb->len == ICMP_SYSRQ_SIZE &&
26123 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
26124 +                       icmp_check_sysrq(net, skb);
26125 +               }
26126         }
26127         /* should there be an ICMP stat for ignored echos? */
26128         return true;
26129 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
26130 index 80bc36b25de2..215b90adfb05 100644
26131 --- a/net/ipv4/sysctl_net_ipv4.c
26132 +++ b/net/ipv4/sysctl_net_ipv4.c
26133 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
26134                 .proc_handler   = proc_dointvec
26135         },
26136         {
26137 +               .procname       = "icmp_echo_sysrq",
26138 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
26139 +               .maxlen         = sizeof(int),
26140 +               .mode           = 0644,
26141 +               .proc_handler   = proc_dointvec
26142 +       },
26143 +       {
26144                 .procname       = "icmp_ignore_bogus_error_responses",
26145                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
26146                 .maxlen         = sizeof(int),
26147 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
26148 index 6988566dc72f..672fffcde28c 100644
26149 --- a/net/ipv4/tcp_ipv4.c
26150 +++ b/net/ipv4/tcp_ipv4.c
26151 @@ -62,6 +62,7 @@
26152  #include <linux/init.h>
26153  #include <linux/times.h>
26154  #include <linux/slab.h>
26155 +#include <linux/locallock.h>
26156  
26157  #include <net/net_namespace.h>
26158  #include <net/icmp.h>
26159 @@ -568,6 +569,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
26160  }
26161  EXPORT_SYMBOL(tcp_v4_send_check);
26162  
26163 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
26164  /*
26165   *     This routine will send an RST to the other tcp.
26166   *
26167 @@ -695,6 +697,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
26168                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
26169  
26170         arg.tos = ip_hdr(skb)->tos;
26171 +
26172 +       local_lock(tcp_sk_lock);
26173         local_bh_disable();
26174         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
26175                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
26176 @@ -704,6 +708,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
26177         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
26178         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
26179         local_bh_enable();
26180 +       local_unlock(tcp_sk_lock);
26181  
26182  #ifdef CONFIG_TCP_MD5SIG
26183  out:
26184 @@ -779,6 +784,7 @@ static void tcp_v4_send_ack(struct net *net,
26185         if (oif)
26186                 arg.bound_dev_if = oif;
26187         arg.tos = tos;
26188 +       local_lock(tcp_sk_lock);
26189         local_bh_disable();
26190         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
26191                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
26192 @@ -787,6 +793,7 @@ static void tcp_v4_send_ack(struct net *net,
26193  
26194         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
26195         local_bh_enable();
26196 +       local_unlock(tcp_sk_lock);
26197  }
26198  
26199  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
26200 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
26201 index a697ddf56334..f1867acd0e81 100644
26202 --- a/net/mac80211/rx.c
26203 +++ b/net/mac80211/rx.c
26204 @@ -4180,7 +4180,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
26205         struct ieee80211_supported_band *sband;
26206         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
26207  
26208 -       WARN_ON_ONCE(softirq_count() == 0);
26209 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
26210  
26211         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
26212                 goto drop;
26213 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
26214 index 004af030ef1a..b64f751bda45 100644
26215 --- a/net/netfilter/core.c
26216 +++ b/net/netfilter/core.c
26217 @@ -22,12 +22,18 @@
26218  #include <linux/proc_fs.h>
26219  #include <linux/mutex.h>
26220  #include <linux/slab.h>
26221 +#include <linux/locallock.h>
26222  #include <linux/rcupdate.h>
26223  #include <net/net_namespace.h>
26224  #include <net/sock.h>
26225  
26226  #include "nf_internals.h"
26227  
26228 +#ifdef CONFIG_PREEMPT_RT_BASE
26229 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
26230 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
26231 +#endif
26232 +
26233  static DEFINE_MUTEX(afinfo_mutex);
26234  
26235  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
26236 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
26237 index f2b04a77258d..7d841bcae677 100644
26238 --- a/net/packet/af_packet.c
26239 +++ b/net/packet/af_packet.c
26240 @@ -63,6 +63,7 @@
26241  #include <linux/if_packet.h>
26242  #include <linux/wireless.h>
26243  #include <linux/kernel.h>
26244 +#include <linux/delay.h>
26245  #include <linux/kmod.h>
26246  #include <linux/slab.h>
26247  #include <linux/vmalloc.h>
26248 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
26249         if (BLOCK_NUM_PKTS(pbd)) {
26250                 while (atomic_read(&pkc->blk_fill_in_prog)) {
26251                         /* Waiting for skb_copy_bits to finish... */
26252 -                       cpu_relax();
26253 +                       cpu_chill();
26254                 }
26255         }
26256  
26257 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
26258                 if (!(status & TP_STATUS_BLK_TMO)) {
26259                         while (atomic_read(&pkc->blk_fill_in_prog)) {
26260                                 /* Waiting for skb_copy_bits to finish... */
26261 -                               cpu_relax();
26262 +                               cpu_chill();
26263                         }
26264                 }
26265                 prb_close_block(pkc, pbd, po, status);
26266 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
26267 index 977f69886c00..f3e7a36b0396 100644
26268 --- a/net/rds/ib_rdma.c
26269 +++ b/net/rds/ib_rdma.c
26270 @@ -34,6 +34,7 @@
26271  #include <linux/slab.h>
26272  #include <linux/rculist.h>
26273  #include <linux/llist.h>
26274 +#include <linux/delay.h>
26275  
26276  #include "rds_single_path.h"
26277  #include "ib_mr.h"
26278 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
26279         for_each_online_cpu(cpu) {
26280                 flag = &per_cpu(clean_list_grace, cpu);
26281                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
26282 -                       cpu_relax();
26283 +                       cpu_chill();
26284         }
26285  }
26286  
26287 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
26288 index 7d921e56e715..13df56a738e5 100644
26289 --- a/net/rxrpc/security.c
26290 +++ b/net/rxrpc/security.c
26291 @@ -19,9 +19,6 @@
26292  #include <keys/rxrpc-type.h>
26293  #include "ar-internal.h"
26294  
26295 -static LIST_HEAD(rxrpc_security_methods);
26296 -static DECLARE_RWSEM(rxrpc_security_sem);
26297 -
26298  static const struct rxrpc_security *rxrpc_security_types[] = {
26299         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
26300  #ifdef CONFIG_RXKAD
26301 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
26302 index 206dc24add3a..00ea9bde5bb3 100644
26303 --- a/net/sched/sch_api.c
26304 +++ b/net/sched/sch_api.c
26305 @@ -981,7 +981,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
26306                         rcu_assign_pointer(sch->stab, stab);
26307                 }
26308                 if (tca[TCA_RATE]) {
26309 -                       seqcount_t *running;
26310 +                       net_seqlock_t *running;
26311  
26312                         err = -EOPNOTSUPP;
26313                         if (sch->flags & TCQ_F_MQROOT)
26314 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
26315 index 6cfb6e9038c2..20727e1347de 100644
26316 --- a/net/sched/sch_generic.c
26317 +++ b/net/sched/sch_generic.c
26318 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
26319         .ops            =       &noop_qdisc_ops,
26320         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
26321         .dev_queue      =       &noop_netdev_queue,
26322 +#ifdef CONFIG_PREEMPT_RT_BASE
26323 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
26324 +#else
26325         .running        =       SEQCNT_ZERO(noop_qdisc.running),
26326 +#endif
26327         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
26328  };
26329  EXPORT_SYMBOL(noop_qdisc);
26330 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
26331         lockdep_set_class(&sch->busylock,
26332                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
26333  
26334 +#ifdef CONFIG_PREEMPT_RT_BASE
26335 +       seqlock_init(&sch->running);
26336 +       lockdep_set_class(&sch->running.seqcount,
26337 +                         dev->qdisc_running_key ?: &qdisc_running_key);
26338 +       lockdep_set_class(&sch->running.lock,
26339 +                         dev->qdisc_running_key ?: &qdisc_running_key);
26340 +#else
26341         seqcount_init(&sch->running);
26342         lockdep_set_class(&sch->running,
26343                           dev->qdisc_running_key ?: &qdisc_running_key);
26344 +#endif
26345  
26346         sch->ops = ops;
26347         sch->enqueue = ops->enqueue;
26348 @@ -925,7 +937,7 @@ void dev_deactivate_many(struct list_head *head)
26349         /* Wait for outstanding qdisc_run calls. */
26350         list_for_each_entry(dev, head, close_list)
26351                 while (some_qdisc_is_busy(dev))
26352 -                       yield();
26353 +                       msleep(1);
26354  }
26355  
26356  void dev_deactivate(struct net_device *dev)
26357 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
26358 index 9c9db55a0c1e..e6583b018a72 100644
26359 --- a/net/sunrpc/svc_xprt.c
26360 +++ b/net/sunrpc/svc_xprt.c
26361 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
26362                 goto out;
26363         }
26364  
26365 -       cpu = get_cpu();
26366 +       cpu = get_cpu_light();
26367         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
26368  
26369         atomic_long_inc(&pool->sp_stats.packets);
26370 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
26371  
26372                 atomic_long_inc(&pool->sp_stats.threads_woken);
26373                 wake_up_process(rqstp->rq_task);
26374 -               put_cpu();
26375 +               put_cpu_light();
26376                 goto out;
26377         }
26378         rcu_read_unlock();
26379 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
26380                 goto redo_search;
26381         }
26382         rqstp = NULL;
26383 -       put_cpu();
26384 +       put_cpu_light();
26385  out:
26386         trace_svc_xprt_do_enqueue(xprt, rqstp);
26387  }
26388 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
26389 index 6fdc97ef6023..523e0420d7f0 100755
26390 --- a/scripts/mkcompile_h
26391 +++ b/scripts/mkcompile_h
26392 @@ -4,7 +4,8 @@ TARGET=$1
26393  ARCH=$2
26394  SMP=$3
26395  PREEMPT=$4
26396 -CC=$5
26397 +RT=$5
26398 +CC=$6
26399  
26400  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
26401  
26402 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
26403  CONFIG_FLAGS=""
26404  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
26405  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
26406 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
26407  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
26408  
26409  # Truncate to maximum length
26410 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
26411 index 9d33c1e85c79..3d307bda86f9 100644
26412 --- a/sound/core/pcm_native.c
26413 +++ b/sound/core/pcm_native.c
26414 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
26415  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
26416  {
26417         if (!substream->pcm->nonatomic)
26418 -               local_irq_disable();
26419 +               local_irq_disable_nort();
26420         snd_pcm_stream_lock(substream);
26421  }
26422  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
26423 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
26424  {
26425         snd_pcm_stream_unlock(substream);
26426         if (!substream->pcm->nonatomic)
26427 -               local_irq_enable();
26428 +               local_irq_enable_nort();
26429  }
26430  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
26431  
26432 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
26433  {
26434         unsigned long flags = 0;
26435         if (!substream->pcm->nonatomic)
26436 -               local_irq_save(flags);
26437 +               local_irq_save_nort(flags);
26438         snd_pcm_stream_lock(substream);
26439         return flags;
26440  }
26441 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
26442  {
26443         snd_pcm_stream_unlock(substream);
26444         if (!substream->pcm->nonatomic)
26445 -               local_irq_restore(flags);
26446 +               local_irq_restore_nort(flags);
26447  }
26448  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
26449  