kernel-rt.patch (packages/kernel.git): rt patch updated
1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 -
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
13  
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
20 +
21 +
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
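
A short sketch of the full sequence, following the pattern documented in the
hunk above ('t' is ASCII 0x74; 192.0.2.1 is only a placeholder address):

        # on the target machine: enable the network SysRq cookie
        echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq

        # from another host: append the command key to the cookie pattern,
        # here 0x74 for SysRq-T (show tasks)
        ping -c1 -s57 -p0102030474 192.0.2.1
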
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
32 +
33 +
34 +This document gives a short explanation of how to enable, configure and use
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
39 +
40 +
41 +* Purpose of latency histograms
42 +
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
47 +
48 +
49 +* Potential sources of latencies
50 +
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_LATENCY
60 +  CONFIG_PREEMPT_OFF_LATENCY
61 +
62 +
63 +* Effective latencies
64 +
65 +Effective latencies are those that actually occur during wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases when a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY
82 +  CONFIG_MISSED_TIMER_OFFSETS
83 +
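The Kconfig symbols that actually provide these items in this patch carry a
_HIST suffix (see the arch/Kconfig hunk further below). A minimal sketch for
checking a running kernel, assuming the common /boot/config-$(uname -r)
location of the build configuration:

        grep -E 'INTERRUPT_OFF_HIST|PREEMPT_OFF_HIST|WAKEUP_LATENCY_HIST|MISSED_TIMER_OFFSETS_HIST' \
                /boot/config-$(uname -r)
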
84 +
85 +* Usage
86 +
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
89 +
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
92 +
93 +from shell command line level, or add
94 +
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
97 +
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies etc. The
104 +histogram data - one per CPU - are available in the files
105 +
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
113 +
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
116 +
117 +#!/bin/sh
118 +
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
121 +
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
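
A concrete usage sketch based on the paths above (the histogram type and CPU
number are arbitrary examples, and the reset file location follows the
"reset"-file-per-directory convention described earlier):

        # enable wakeup latency histograms
        echo 1 >/sys/kernel/debug/tracing/latency_hist/enable/wakeup

        # ... run the workload of interest ...

        # inspect the per-CPU data, e.g. for CPU0
        cat /sys/kernel/debug/tracing/latency_hist/wakeup/CPU0

        # clear the wakeup histograms again
        echo 1 >/sys/kernel/debug/tracing/latency_hist/wakeup/reset
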
130 +
131 +
132 +* Data format
133 +
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive latency count, use, for
139 +example,
140 +
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
142 +
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
174 +
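A small sketch for cross-checking such an output: the per-line sample counts
should add up to the "#Total samples" header value, as long as no samples fell
beyond the 10240 microsecond range (the file name is again just an example):

        grep -v '^#' /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0 | \
                awk '{ sum += $2 } END { print sum }'
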
175 +
176 +* Wakeup latency of a selected process
177 +
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
180 +
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
182 +
183 +PIDs are not considered if this variable is set to 0.
184 +
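A small usage sketch (1234 stands in for the PID of the task of interest):

        # restrict wakeup latency recording to a single task
        echo 1234 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid

        # record all tasks again
        echo 0 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid
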
185 +
186 +* Details of the process with the highest wakeup latency so far
187 +
188 +Selected data of the process that suffered from the highest wakeup
189 +latency that occurred in a particular CPU are available in the file
190 +
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
192 +
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
195 +
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
199 +
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0; in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
204 +
205 +When retrospectively searching for the origin of a latency while
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) switched to the
208 +late real-time task. In addition to the victim's data, the data
209 +of the possible culprit are therefore also displayed after the
210 +"<-" symbol.
211 +
212 +Finally, the timestamp of the time when the latency occurred
213 +in <seconds>.<microseconds> after the most recent system boot
214 +is provided.
215 +
216 +These data are also reset when the wakeup histogram is reset.
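
A one-line sketch for inspecting these per-CPU worst-case records (grep is
only used here so that each line is prefixed with its file name):

        grep . /sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPU*
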
217 diff --git a/arch/Kconfig b/arch/Kconfig
218 index 659bdd079277..099fc0f5155e 100644
219 --- a/arch/Kconfig
220 +++ b/arch/Kconfig
221 @@ -9,6 +9,7 @@ config OPROFILE
222         tristate "OProfile system profiling"
223         depends on PROFILING
224         depends on HAVE_OPROFILE
225 +       depends on !PREEMPT_RT_FULL
226         select RING_BUFFER
227         select RING_BUFFER_ALLOW_SWAP
228         help
229 @@ -52,6 +53,7 @@ config KPROBES
230  config JUMP_LABEL
231         bool "Optimize very unlikely/likely branches"
232         depends on HAVE_ARCH_JUMP_LABEL
233 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
234         help
235           This option enables a transparent branch optimization that
236          makes certain almost-always-true or almost-always-false branch
237 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
238 index b5d529fdffab..5715844e83e3 100644
239 --- a/arch/arm/Kconfig
240 +++ b/arch/arm/Kconfig
241 @@ -36,7 +36,7 @@ config ARM
242         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
243         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
244         select HAVE_ARCH_HARDENED_USERCOPY
245 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
246 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
247         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
248         select HAVE_ARCH_MMAP_RND_BITS if MMU
249         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
250 @@ -75,6 +75,7 @@ config ARM
251         select HAVE_PERF_EVENTS
252         select HAVE_PERF_REGS
253         select HAVE_PERF_USER_STACK_DUMP
254 +       select HAVE_PREEMPT_LAZY
255         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
256         select HAVE_REGS_AND_STACK_ACCESS_API
257         select HAVE_SYSCALL_TRACEPOINTS
258 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
259 index e53638c8ed8a..6095a1649865 100644
260 --- a/arch/arm/include/asm/irq.h
261 +++ b/arch/arm/include/asm/irq.h
262 @@ -22,6 +22,8 @@
263  #endif
264  
265  #ifndef __ASSEMBLY__
266 +#include <linux/cpumask.h>
267 +
268  struct irqaction;
269  struct pt_regs;
270  extern void migrate_irqs(void);
271 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
272 index 12ebfcc1d539..c962084605bc 100644
273 --- a/arch/arm/include/asm/switch_to.h
274 +++ b/arch/arm/include/asm/switch_to.h
275 @@ -3,6 +3,13 @@
276  
277  #include <linux/thread_info.h>
278  
279 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
280 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
281 +#else
282 +static inline void
283 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
284 +#endif
285 +
286  /*
287   * For v7 SMP cores running a preemptible kernel we may be pre-empted
288   * during a TLB maintenance operation, so execute an inner-shareable dsb
289 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
290  #define switch_to(prev,next,last)                                      \
291  do {                                                                   \
292         __complete_pending_tlbi();                                      \
293 +       switch_kmaps(prev, next);                                       \
294         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
295  } while (0)
296  
297 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
298 index 776757d1604a..1f36a4eccc72 100644
299 --- a/arch/arm/include/asm/thread_info.h
300 +++ b/arch/arm/include/asm/thread_info.h
301 @@ -49,6 +49,7 @@ struct cpu_context_save {
302  struct thread_info {
303         unsigned long           flags;          /* low level flags */
304         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
305 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
306         mm_segment_t            addr_limit;     /* address limit */
307         struct task_struct      *task;          /* main task structure */
308         __u32                   cpu;            /* cpu */
309 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
310  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
311  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
312  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
313 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
314 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
315 +#define TIF_NEED_RESCHED_LAZY  7
316  
317  #define TIF_NOHZ               12      /* in adaptive nohz mode */
318  #define TIF_USING_IWMMXT       17
319 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
320  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
321  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
322  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
323 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
324  #define _TIF_UPROBE            (1 << TIF_UPROBE)
325  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
326  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
327 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
328   * Change these and you break ASM code in entry-common.S
329   */
330  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
331 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
332 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
333 +                                _TIF_NEED_RESCHED_LAZY)
334  
335  #endif /* __KERNEL__ */
336  #endif /* __ASM_ARM_THREAD_INFO_H */
337 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
338 index 608008229c7d..3866da3f7bb7 100644
339 --- a/arch/arm/kernel/asm-offsets.c
340 +++ b/arch/arm/kernel/asm-offsets.c
341 @@ -65,6 +65,7 @@ int main(void)
342    BLANK();
343    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
344    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
345 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
346    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
347    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
348    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
349 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
350 index 9f157e7c51e7..468e224d76aa 100644
351 --- a/arch/arm/kernel/entry-armv.S
352 +++ b/arch/arm/kernel/entry-armv.S
353 @@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
354  
355  #ifdef CONFIG_PREEMPT
356         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
357 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
358         teq     r8, #0                          @ if preempt count != 0
359 +       bne     1f                              @ return from exception
360 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
361 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
362 +       blne    svc_preempt                     @ preempt!
363 +
364 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
365 +       teq     r8, #0                          @ if preempt lazy count != 0
366         movne   r0, #0                          @ force flags to 0
367 -       tst     r0, #_TIF_NEED_RESCHED
368 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
369         blne    svc_preempt
370 +1:
371  #endif
372  
373         svc_exit r5, irq = 1                    @ return from exception
374 @@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
375  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
376         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
377         tst     r0, #_TIF_NEED_RESCHED
378 +       bne     1b
379 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
380         reteq   r8                              @ go again
381 -       b       1b
382 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
383 +       teq     r0, #0                          @ if preempt lazy count != 0
384 +       beq     1b
385 +       ret     r8                              @ go again
386 +
387  #endif
388  
389  __und_fault:
390 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
391 index 10c3283d6c19..8872937862cc 100644
392 --- a/arch/arm/kernel/entry-common.S
393 +++ b/arch/arm/kernel/entry-common.S
394 @@ -36,7 +36,9 @@
395   UNWIND(.cantunwind    )
396         disable_irq_notrace                     @ disable interrupts
397         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
398 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
399 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
400 +       bne     fast_work_pending
401 +       tst     r1, #_TIF_SECCOMP
402         bne     fast_work_pending
403  
404         /* perform architecture specific actions before user return */
405 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
406         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
407         disable_irq_notrace                     @ disable interrupts
408         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
409 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
410 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
411 +       bne     do_slower_path
412 +       tst     r1, #_TIF_SECCOMP
413         beq     no_work_pending
414 +do_slower_path:
415   UNWIND(.fnend         )
416  ENDPROC(ret_fast_syscall)
417  
418 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
419 index 69bda1a5707e..1f665acaa6a9 100644
420 --- a/arch/arm/kernel/patch.c
421 +++ b/arch/arm/kernel/patch.c
422 @@ -15,7 +15,7 @@ struct patch {
423         unsigned int insn;
424  };
425  
426 -static DEFINE_SPINLOCK(patch_lock);
427 +static DEFINE_RAW_SPINLOCK(patch_lock);
428  
429  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
430         __acquires(&patch_lock)
431 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
432                 return addr;
433  
434         if (flags)
435 -               spin_lock_irqsave(&patch_lock, *flags);
436 +               raw_spin_lock_irqsave(&patch_lock, *flags);
437         else
438                 __acquire(&patch_lock);
439  
440 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
441         clear_fixmap(fixmap);
442  
443         if (flags)
444 -               spin_unlock_irqrestore(&patch_lock, *flags);
445 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
446         else
447                 __release(&patch_lock);
448  }
449 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
450 index 91d2d5b01414..750550098b59 100644
451 --- a/arch/arm/kernel/process.c
452 +++ b/arch/arm/kernel/process.c
453 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
454  }
455  
456  #ifdef CONFIG_MMU
457 +/*
458 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
459 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
460 + * fail.
461 + */
462 +static int __init vectors_user_mapping_init_page(void)
463 +{
464 +       struct page *page;
465 +       unsigned long addr = 0xffff0000;
466 +       pgd_t *pgd;
467 +       pud_t *pud;
468 +       pmd_t *pmd;
469 +
470 +       pgd = pgd_offset_k(addr);
471 +       pud = pud_offset(pgd, addr);
472 +       pmd = pmd_offset(pud, addr);
473 +       page = pmd_page(*(pmd));
474 +
475 +       pgtable_page_ctor(page);
476 +
477 +       return 0;
478 +}
479 +late_initcall(vectors_user_mapping_init_page);
480 +
481  #ifdef CONFIG_KUSER_HELPERS
482  /*
483   * The vectors page is always readable from user space for the
484 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
485 index 7b8f2141427b..96541e00b74a 100644
486 --- a/arch/arm/kernel/signal.c
487 +++ b/arch/arm/kernel/signal.c
488 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
489          */
490         trace_hardirqs_off();
491         do {
492 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
493 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
494 +                                          _TIF_NEED_RESCHED_LAZY))) {
495                         schedule();
496                 } else {
497                         if (unlikely(!user_mode(regs)))
498 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
499 index 7dd14e8395e6..4cd7e3d98035 100644
500 --- a/arch/arm/kernel/smp.c
501 +++ b/arch/arm/kernel/smp.c
502 @@ -234,8 +234,6 @@ int __cpu_disable(void)
503         flush_cache_louis();
504         local_flush_tlb_all();
505  
506 -       clear_tasks_mm_cpumask(cpu);
507 -
508         return 0;
509  }
510  
511 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
512                 pr_err("CPU%u: cpu didn't die\n", cpu);
513                 return;
514         }
515 +
516 +       clear_tasks_mm_cpumask(cpu);
517 +
518         pr_notice("CPU%u: shutdown\n", cpu);
519  
520         /*
521 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
522 index 0bee233fef9a..314cfb232a63 100644
523 --- a/arch/arm/kernel/unwind.c
524 +++ b/arch/arm/kernel/unwind.c
525 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
526  static const struct unwind_idx *__origin_unwind_idx;
527  extern const struct unwind_idx __stop_unwind_idx[];
528  
529 -static DEFINE_SPINLOCK(unwind_lock);
530 +static DEFINE_RAW_SPINLOCK(unwind_lock);
531  static LIST_HEAD(unwind_tables);
532  
533  /* Convert a prel31 symbol to an absolute address */
534 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
535                 /* module unwind tables */
536                 struct unwind_table *table;
537  
538 -               spin_lock_irqsave(&unwind_lock, flags);
539 +               raw_spin_lock_irqsave(&unwind_lock, flags);
540                 list_for_each_entry(table, &unwind_tables, list) {
541                         if (addr >= table->begin_addr &&
542                             addr < table->end_addr) {
543 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
544                                 break;
545                         }
546                 }
547 -               spin_unlock_irqrestore(&unwind_lock, flags);
548 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
549         }
550  
551         pr_debug("%s: idx = %p\n", __func__, idx);
552 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
553         tab->begin_addr = text_addr;
554         tab->end_addr = text_addr + text_size;
555  
556 -       spin_lock_irqsave(&unwind_lock, flags);
557 +       raw_spin_lock_irqsave(&unwind_lock, flags);
558         list_add_tail(&tab->list, &unwind_tables);
559 -       spin_unlock_irqrestore(&unwind_lock, flags);
560 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
561  
562         return tab;
563  }
564 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
565         if (!tab)
566                 return;
567  
568 -       spin_lock_irqsave(&unwind_lock, flags);
569 +       raw_spin_lock_irqsave(&unwind_lock, flags);
570         list_del(&tab->list);
571 -       spin_unlock_irqrestore(&unwind_lock, flags);
572 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
573  
574         kfree(tab);
575  }
576 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
577 index 19b5f5c1c0ff..82aa639e6737 100644
578 --- a/arch/arm/kvm/arm.c
579 +++ b/arch/arm/kvm/arm.c
580 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
581                  * involves poking the GIC, which must be done in a
582                  * non-preemptible context.
583                  */
584 -               preempt_disable();
585 +               migrate_disable();
586                 kvm_pmu_flush_hwstate(vcpu);
587                 kvm_timer_flush_hwstate(vcpu);
588                 kvm_vgic_flush_hwstate(vcpu);
589 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
590                         kvm_pmu_sync_hwstate(vcpu);
591                         kvm_timer_sync_hwstate(vcpu);
592                         kvm_vgic_sync_hwstate(vcpu);
593 -                       preempt_enable();
594 +                       migrate_enable();
595                         continue;
596                 }
597  
598 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
599  
600                 kvm_vgic_sync_hwstate(vcpu);
601  
602 -               preempt_enable();
603 +               migrate_enable();
604  
605                 ret = handle_exit(vcpu, run, ret);
606         }
607 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
608 index 98ffe1e62ad5..df9769ddece5 100644
609 --- a/arch/arm/mach-exynos/platsmp.c
610 +++ b/arch/arm/mach-exynos/platsmp.c
611 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
612         return (void __iomem *)(S5P_VA_SCU);
613  }
614  
615 -static DEFINE_SPINLOCK(boot_lock);
616 +static DEFINE_RAW_SPINLOCK(boot_lock);
617  
618  static void exynos_secondary_init(unsigned int cpu)
619  {
620 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
621         /*
622          * Synchronise with the boot thread.
623          */
624 -       spin_lock(&boot_lock);
625 -       spin_unlock(&boot_lock);
626 +       raw_spin_lock(&boot_lock);
627 +       raw_spin_unlock(&boot_lock);
628  }
629  
630  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
631 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
632          * Set synchronisation state between this boot processor
633          * and the secondary one
634          */
635 -       spin_lock(&boot_lock);
636 +       raw_spin_lock(&boot_lock);
637  
638         /*
639          * The secondary processor is waiting to be released from
640 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
641  
642                 if (timeout == 0) {
643                         printk(KERN_ERR "cpu1 power enable failed");
644 -                       spin_unlock(&boot_lock);
645 +                       raw_spin_unlock(&boot_lock);
646                         return -ETIMEDOUT;
647                 }
648         }
649 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
650          * calibrations, then wait for it to finish
651          */
652  fail:
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_unlock(&boot_lock);
655  
656         return pen_release != -1 ? ret : 0;
657  }
658 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
659 index 4b653a8cb75c..b03d5a922cb1 100644
660 --- a/arch/arm/mach-hisi/platmcpm.c
661 +++ b/arch/arm/mach-hisi/platmcpm.c
662 @@ -61,7 +61,7 @@
663  
664  static void __iomem *sysctrl, *fabric;
665  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
666 -static DEFINE_SPINLOCK(boot_lock);
667 +static DEFINE_RAW_SPINLOCK(boot_lock);
668  static u32 fabric_phys_addr;
669  /*
670   * [0]: bootwrapper physical address
671 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
672         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
673                 return -EINVAL;
674  
675 -       spin_lock_irq(&boot_lock);
676 +       raw_spin_lock_irq(&boot_lock);
677  
678         if (hip04_cpu_table[cluster][cpu])
679                 goto out;
680 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
681  
682  out:
683         hip04_cpu_table[cluster][cpu]++;
684 -       spin_unlock_irq(&boot_lock);
685 +       raw_spin_unlock_irq(&boot_lock);
686  
687         return 0;
688  }
689 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
690         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
691         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
692  
693 -       spin_lock(&boot_lock);
694 +       raw_spin_lock(&boot_lock);
695         hip04_cpu_table[cluster][cpu]--;
696         if (hip04_cpu_table[cluster][cpu] == 1) {
697                 /* A power_up request went ahead of us. */
698 -               spin_unlock(&boot_lock);
699 +               raw_spin_unlock(&boot_lock);
700                 return;
701         } else if (hip04_cpu_table[cluster][cpu] > 1) {
702                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
703 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
704         }
705  
706         last_man = hip04_cluster_is_down(cluster);
707 -       spin_unlock(&boot_lock);
708 +       raw_spin_unlock(&boot_lock);
709         if (last_man) {
710                 /* Since it's Cortex A15, disable L2 prefetching. */
711                 asm volatile(
712 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
713                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
714  
715         count = TIMEOUT_MSEC / POLL_MSEC;
716 -       spin_lock_irq(&boot_lock);
717 +       raw_spin_lock_irq(&boot_lock);
718         for (tries = 0; tries < count; tries++) {
719                 if (hip04_cpu_table[cluster][cpu])
720                         goto err;
721 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
722                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
723                 if (data & CORE_WFI_STATUS(cpu))
724                         break;
725 -               spin_unlock_irq(&boot_lock);
726 +               raw_spin_unlock_irq(&boot_lock);
727                 /* Wait for clean L2 when the whole cluster is down. */
728                 msleep(POLL_MSEC);
729 -               spin_lock_irq(&boot_lock);
730 +               raw_spin_lock_irq(&boot_lock);
731         }
732         if (tries >= count)
733                 goto err;
734 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
735                 goto err;
736         if (hip04_cluster_is_down(cluster))
737                 hip04_set_snoop_filter(cluster, 0);
738 -       spin_unlock_irq(&boot_lock);
739 +       raw_spin_unlock_irq(&boot_lock);
740         return 1;
741  err:
742 -       spin_unlock_irq(&boot_lock);
743 +       raw_spin_unlock_irq(&boot_lock);
744         return 0;
745  }
746  #endif
747 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
748 index b4de3da6dffa..b52893319d75 100644
749 --- a/arch/arm/mach-omap2/omap-smp.c
750 +++ b/arch/arm/mach-omap2/omap-smp.c
751 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
752         .startup_addr = omap5_secondary_startup,
753  };
754  
755 -static DEFINE_SPINLOCK(boot_lock);
756 +static DEFINE_RAW_SPINLOCK(boot_lock);
757  
758  void __iomem *omap4_get_scu_base(void)
759  {
760 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
761         /*
762          * Synchronise with the boot thread.
763          */
764 -       spin_lock(&boot_lock);
765 -       spin_unlock(&boot_lock);
766 +       raw_spin_lock(&boot_lock);
767 +       raw_spin_unlock(&boot_lock);
768  }
769  
770  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
771 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
772          * Set synchronisation state between this boot processor
773          * and the secondary one
774          */
775 -       spin_lock(&boot_lock);
776 +       raw_spin_lock(&boot_lock);
777  
778         /*
779          * Update the AuxCoreBoot0 with boot state for secondary core.
780 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
781          * Now the secondary core is starting up let it run its
782          * calibrations, then wait for it to finish
783          */
784 -       spin_unlock(&boot_lock);
785 +       raw_spin_unlock(&boot_lock);
786  
787         return 0;
788  }
789 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
790 index 0875b99add18..18b6d98d2581 100644
791 --- a/arch/arm/mach-prima2/platsmp.c
792 +++ b/arch/arm/mach-prima2/platsmp.c
793 @@ -22,7 +22,7 @@
794  
795  static void __iomem *clk_base;
796  
797 -static DEFINE_SPINLOCK(boot_lock);
798 +static DEFINE_RAW_SPINLOCK(boot_lock);
799  
800  static void sirfsoc_secondary_init(unsigned int cpu)
801  {
802 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
803         /*
804          * Synchronise with the boot thread.
805          */
806 -       spin_lock(&boot_lock);
807 -       spin_unlock(&boot_lock);
808 +       raw_spin_lock(&boot_lock);
809 +       raw_spin_unlock(&boot_lock);
810  }
811  
812  static const struct of_device_id clk_ids[]  = {
813 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
814         /* make sure write buffer is drained */
815         mb();
816  
817 -       spin_lock(&boot_lock);
818 +       raw_spin_lock(&boot_lock);
819  
820         /*
821          * The secondary processor is waiting to be released from
822 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
823          * now the secondary core is starting up let it run its
824          * calibrations, then wait for it to finish
825          */
826 -       spin_unlock(&boot_lock);
827 +       raw_spin_unlock(&boot_lock);
828  
829         return pen_release != -1 ? -ENOSYS : 0;
830  }
831 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
832 index 5494c9e0c909..e8ce157d3548 100644
833 --- a/arch/arm/mach-qcom/platsmp.c
834 +++ b/arch/arm/mach-qcom/platsmp.c
835 @@ -46,7 +46,7 @@
836  
837  extern void secondary_startup_arm(void);
838  
839 -static DEFINE_SPINLOCK(boot_lock);
840 +static DEFINE_RAW_SPINLOCK(boot_lock);
841  
842  #ifdef CONFIG_HOTPLUG_CPU
843  static void qcom_cpu_die(unsigned int cpu)
844 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
845         /*
846          * Synchronise with the boot thread.
847          */
848 -       spin_lock(&boot_lock);
849 -       spin_unlock(&boot_lock);
850 +       raw_spin_lock(&boot_lock);
851 +       raw_spin_unlock(&boot_lock);
852  }
853  
854  static int scss_release_secondary(unsigned int cpu)
855 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
856          * set synchronisation state between this boot processor
857          * and the secondary one
858          */
859 -       spin_lock(&boot_lock);
860 +       raw_spin_lock(&boot_lock);
861  
862         /*
863          * Send the secondary CPU a soft interrupt, thereby causing
864 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
865          * now the secondary core is starting up let it run its
866          * calibrations, then wait for it to finish
867          */
868 -       spin_unlock(&boot_lock);
869 +       raw_spin_unlock(&boot_lock);
870  
871         return ret;
872  }
873 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
874 index 8d1e2d551786..7fa56cc78118 100644
875 --- a/arch/arm/mach-spear/platsmp.c
876 +++ b/arch/arm/mach-spear/platsmp.c
877 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
878         sync_cache_w(&pen_release);
879  }
880  
881 -static DEFINE_SPINLOCK(boot_lock);
882 +static DEFINE_RAW_SPINLOCK(boot_lock);
883  
884  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
885  
886 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
887         /*
888          * Synchronise with the boot thread.
889          */
890 -       spin_lock(&boot_lock);
891 -       spin_unlock(&boot_lock);
892 +       raw_spin_lock(&boot_lock);
893 +       raw_spin_unlock(&boot_lock);
894  }
895  
896  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
897 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
898          * set synchronisation state between this boot processor
899          * and the secondary one
900          */
901 -       spin_lock(&boot_lock);
902 +       raw_spin_lock(&boot_lock);
903  
904         /*
905          * The secondary processor is waiting to be released from
906 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
907          * now the secondary core is starting up let it run its
908          * calibrations, then wait for it to finish
909          */
910 -       spin_unlock(&boot_lock);
911 +       raw_spin_unlock(&boot_lock);
912  
913         return pen_release != -1 ? -ENOSYS : 0;
914  }
915 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
916 index ea5a2277ee46..b988e081ac79 100644
917 --- a/arch/arm/mach-sti/platsmp.c
918 +++ b/arch/arm/mach-sti/platsmp.c
919 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
920         sync_cache_w(&pen_release);
921  }
922  
923 -static DEFINE_SPINLOCK(boot_lock);
924 +static DEFINE_RAW_SPINLOCK(boot_lock);
925  
926  static void sti_secondary_init(unsigned int cpu)
927  {
928 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
929         /*
930          * Synchronise with the boot thread.
931          */
932 -       spin_lock(&boot_lock);
933 -       spin_unlock(&boot_lock);
934 +       raw_spin_lock(&boot_lock);
935 +       raw_spin_unlock(&boot_lock);
936  }
937  
938  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
939 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
940          * set synchronisation state between this boot processor
941          * and the secondary one
942          */
943 -       spin_lock(&boot_lock);
944 +       raw_spin_lock(&boot_lock);
945  
946         /*
947          * The secondary processor is waiting to be released from
948 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
949          * now the secondary core is starting up let it run its
950          * calibrations, then wait for it to finish
951          */
952 -       spin_unlock(&boot_lock);
953 +       raw_spin_unlock(&boot_lock);
954  
955         return pen_release != -1 ? -ENOSYS : 0;
956  }
957 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
958 index 3a2e678b8d30..3ed1e9ba6a01 100644
959 --- a/arch/arm/mm/fault.c
960 +++ b/arch/arm/mm/fault.c
961 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
962         if (addr < TASK_SIZE)
963                 return do_page_fault(addr, fsr, regs);
964  
965 +       if (interrupts_enabled(regs))
966 +               local_irq_enable();
967 +
968         if (user_mode(regs))
969                 goto bad_area;
970  
971 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
972  static int
973  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
974  {
975 +       if (interrupts_enabled(regs))
976 +               local_irq_enable();
977 +
978         do_bad_area(addr, fsr, regs);
979         return 0;
980  }
981 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
982 index d02f8187b1cc..542692dbd40a 100644
983 --- a/arch/arm/mm/highmem.c
984 +++ b/arch/arm/mm/highmem.c
985 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
986         return *ptep;
987  }
988  
989 +static unsigned int fixmap_idx(int type)
990 +{
991 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
992 +}
993 +
994  void *kmap(struct page *page)
995  {
996         might_sleep();
997 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
998  
999  void *kmap_atomic(struct page *page)
1000  {
1001 +       pte_t pte = mk_pte(page, kmap_prot);
1002         unsigned int idx;
1003         unsigned long vaddr;
1004         void *kmap;
1005         int type;
1006  
1007 -       preempt_disable();
1008 +       preempt_disable_nort();
1009         pagefault_disable();
1010         if (!PageHighMem(page))
1011                 return page_address(page);
1012 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1013  
1014         type = kmap_atomic_idx_push();
1015  
1016 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1017 +       idx = fixmap_idx(type);
1018         vaddr = __fix_to_virt(idx);
1019  #ifdef CONFIG_DEBUG_HIGHMEM
1020         /*
1021 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1022          * in place, so the contained TLB flush ensures the TLB is updated
1023          * with the new mapping.
1024          */
1025 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1026 +#ifdef CONFIG_PREEMPT_RT_FULL
1027 +       current->kmap_pte[type] = pte;
1028 +#endif
1029 +       set_fixmap_pte(idx, pte);
1030  
1031         return (void *)vaddr;
1032  }
1033 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1034  
1035         if (kvaddr >= (void *)FIXADDR_START) {
1036                 type = kmap_atomic_idx();
1037 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1038 +               idx = fixmap_idx(type);
1039  
1040                 if (cache_is_vivt())
1041                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1042 +#ifdef CONFIG_PREEMPT_RT_FULL
1043 +               current->kmap_pte[type] = __pte(0);
1044 +#endif
1045  #ifdef CONFIG_DEBUG_HIGHMEM
1046                 BUG_ON(vaddr != __fix_to_virt(idx));
1047 -               set_fixmap_pte(idx, __pte(0));
1048  #else
1049                 (void) idx;  /* to kill a warning */
1050  #endif
1051 +               set_fixmap_pte(idx, __pte(0));
1052                 kmap_atomic_idx_pop();
1053         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1054                 /* this address was obtained through kmap_high_get() */
1055                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1056         }
1057         pagefault_enable();
1058 -       preempt_enable();
1059 +       preempt_enable_nort();
1060  }
1061  EXPORT_SYMBOL(__kunmap_atomic);
1062  
1063  void *kmap_atomic_pfn(unsigned long pfn)
1064  {
1065 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1066         unsigned long vaddr;
1067         int idx, type;
1068         struct page *page = pfn_to_page(pfn);
1069  
1070 -       preempt_disable();
1071 +       preempt_disable_nort();
1072         pagefault_disable();
1073         if (!PageHighMem(page))
1074                 return page_address(page);
1075  
1076         type = kmap_atomic_idx_push();
1077 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1078 +       idx = fixmap_idx(type);
1079         vaddr = __fix_to_virt(idx);
1080  #ifdef CONFIG_DEBUG_HIGHMEM
1081         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1082  #endif
1083 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1084 +#ifdef CONFIG_PREEMPT_RT_FULL
1085 +       current->kmap_pte[type] = pte;
1086 +#endif
1087 +       set_fixmap_pte(idx, pte);
1088  
1089         return (void *)vaddr;
1090  }
1091 +#if defined CONFIG_PREEMPT_RT_FULL
1092 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1093 +{
1094 +       int i;
1095 +
1096 +       /*
1097 +        * Clear @prev's kmap_atomic mappings
1098 +        */
1099 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1100 +               int idx = fixmap_idx(i);
1101 +
1102 +               set_fixmap_pte(idx, __pte(0));
1103 +       }
1104 +       /*
1105 +        * Restore @next_p's kmap_atomic mappings
1106 +        */
1107 +       for (i = 0; i < next_p->kmap_idx; i++) {
1108 +               int idx = fixmap_idx(i);
1109 +
1110 +               if (!pte_none(next_p->kmap_pte[i]))
1111 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1112 +       }
1113 +}
1114 +#endif
1115 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1116 index c2366510187a..6b60f582b738 100644
1117 --- a/arch/arm/plat-versatile/platsmp.c
1118 +++ b/arch/arm/plat-versatile/platsmp.c
1119 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1120         sync_cache_w(&pen_release);
1121  }
1122  
1123 -static DEFINE_SPINLOCK(boot_lock);
1124 +static DEFINE_RAW_SPINLOCK(boot_lock);
1125  
1126  void versatile_secondary_init(unsigned int cpu)
1127  {
1128 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1129         /*
1130          * Synchronise with the boot thread.
1131          */
1132 -       spin_lock(&boot_lock);
1133 -       spin_unlock(&boot_lock);
1134 +       raw_spin_lock(&boot_lock);
1135 +       raw_spin_unlock(&boot_lock);
1136  }
1137  
1138  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1139 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1140          * Set synchronisation state between this boot processor
1141          * and the secondary one
1142          */
1143 -       spin_lock(&boot_lock);
1144 +       raw_spin_lock(&boot_lock);
1145  
1146         /*
1147          * This is really belt and braces; we hold unintended secondary
1148 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1149          * now the secondary core is starting up let it run its
1150          * calibrations, then wait for it to finish
1151          */
1152 -       spin_unlock(&boot_lock);
1153 +       raw_spin_unlock(&boot_lock);
1154  
1155         return pen_release != -1 ? -ENOSYS : 0;
1156  }
1157 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1158 index 969ef880d234..1182fe883771 100644
1159 --- a/arch/arm64/Kconfig
1160 +++ b/arch/arm64/Kconfig
1161 @@ -91,6 +91,7 @@ config ARM64
1162         select HAVE_PERF_EVENTS
1163         select HAVE_PERF_REGS
1164         select HAVE_PERF_USER_STACK_DUMP
1165 +       select HAVE_PREEMPT_LAZY
1166         select HAVE_REGS_AND_STACK_ACCESS_API
1167         select HAVE_RCU_TABLE_FREE
1168         select HAVE_SYSCALL_TRACEPOINTS
1169 @@ -694,7 +695,7 @@ config XEN_DOM0
1170  
1171  config XEN
1172         bool "Xen guest support on ARM64"
1173 -       depends on ARM64 && OF
1174 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1175         select SWIOTLB_XEN
1176         select PARAVIRT
1177         help
1178 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1179 index e9ea5a6bd449..6c500ad63c6a 100644
1180 --- a/arch/arm64/include/asm/thread_info.h
1181 +++ b/arch/arm64/include/asm/thread_info.h
1182 @@ -49,6 +49,7 @@ struct thread_info {
1183         mm_segment_t            addr_limit;     /* address limit */
1184         struct task_struct      *task;          /* main task structure */
1185         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1186 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1187         int                     cpu;            /* cpu */
1188  };
1189  
1190 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1191  #define TIF_NEED_RESCHED       1
1192  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1193  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1194 +#define TIF_NEED_RESCHED_LAZY  4
1195  #define TIF_NOHZ               7
1196  #define TIF_SYSCALL_TRACE      8
1197  #define TIF_SYSCALL_AUDIT      9
1198 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1199  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1200  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1201  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1202 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1203  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1204  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1206 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1207  #define _TIF_32BIT             (1 << TIF_32BIT)
1208  
1209  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1210 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1211 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1212 +                                _TIF_NEED_RESCHED_LAZY)
1213 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1214  
1215  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1216                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1217 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1218 index 4a2f0f0fef32..6bf2bc17c400 100644
1219 --- a/arch/arm64/kernel/asm-offsets.c
1220 +++ b/arch/arm64/kernel/asm-offsets.c
1221 @@ -38,6 +38,7 @@ int main(void)
1222    BLANK();
1223    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1224    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1225 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1226    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1227    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1228    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1229 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1230 index 223d54a4d66b..266c0e2141ca 100644
1231 --- a/arch/arm64/kernel/entry.S
1232 +++ b/arch/arm64/kernel/entry.S
1233 @@ -428,11 +428,16 @@ ENDPROC(el1_sync)
1234  
1235  #ifdef CONFIG_PREEMPT
1236         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1237 -       cbnz    w24, 1f                         // preempt count != 0
1238 +       cbnz    w24, 2f                         // preempt count != 0
1239         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1240 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1241 -       bl      el1_preempt
1242 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1243 +
1244 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1245 +       cbnz    w24, 2f                         // preempt lazy count != 0
1246 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1247  1:
1248 +       bl      el1_preempt
1249 +2:
1250  #endif
1251  #ifdef CONFIG_TRACE_IRQFLAGS
1252         bl      trace_hardirqs_on
1253 @@ -446,6 +451,7 @@ ENDPROC(el1_irq)
1254  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1255         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1256         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1257 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1258         ret     x24
1259  #endif
1260  
1261 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1262 index 404dd67080b9..639dc6d12e72 100644
1263 --- a/arch/arm64/kernel/signal.c
1264 +++ b/arch/arm64/kernel/signal.c
1265 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1266          */
1267         trace_hardirqs_off();
1268         do {
1269 -               if (thread_flags & _TIF_NEED_RESCHED) {
1270 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1271                         schedule();
1272                 } else {
1273                         local_irq_enable();
1274 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1275 index b3c5bde43d34..8122bf058de0 100644
1276 --- a/arch/mips/Kconfig
1277 +++ b/arch/mips/Kconfig
1278 @@ -2514,7 +2514,7 @@ config MIPS_ASID_BITS_VARIABLE
1279  #
1280  config HIGHMEM
1281         bool "High Memory Support"
1282 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1283 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1284  
1285  config CPU_SUPPORTS_HIGHMEM
1286         bool
1287 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1288 index 65fba4c34cd7..4b5ba68910e0 100644
1289 --- a/arch/powerpc/Kconfig
1290 +++ b/arch/powerpc/Kconfig
1291 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1292  
1293  config RWSEM_GENERIC_SPINLOCK
1294         bool
1295 +       default y if PREEMPT_RT_FULL
1296  
1297  config RWSEM_XCHGADD_ALGORITHM
1298         bool
1299 -       default y
1300 +       default y if !PREEMPT_RT_FULL
1301  
1302  config GENERIC_LOCKBREAK
1303         bool
1304 @@ -134,6 +135,7 @@ config PPC
1305         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1306         select GENERIC_STRNCPY_FROM_USER
1307         select GENERIC_STRNLEN_USER
1308 +       select HAVE_PREEMPT_LAZY
1309         select HAVE_MOD_ARCH_SPECIFIC
1310         select MODULES_USE_ELF_RELA
1311         select CLONE_BACKWARDS
1312 @@ -321,7 +323,7 @@ menu "Kernel options"
1313  
1314  config HIGHMEM
1315         bool "High memory support"
1316 -       depends on PPC32
1317 +       depends on PPC32 && !PREEMPT_RT_FULL
1318  
1319  source kernel/Kconfig.hz
1320  source kernel/Kconfig.preempt
1321 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1322 index 87e4b2d8dcd4..981e501a4359 100644
1323 --- a/arch/powerpc/include/asm/thread_info.h
1324 +++ b/arch/powerpc/include/asm/thread_info.h
1325 @@ -43,6 +43,8 @@ struct thread_info {
1326         int             cpu;                    /* cpu we're on */
1327         int             preempt_count;          /* 0 => preemptable,
1328                                                    <0 => BUG */
1329 +       int             preempt_lazy_count;     /* 0 => preemptable,
1330 +                                                  <0 => BUG */
1331         unsigned long   local_flags;            /* private flags for thread */
1332  #ifdef CONFIG_LIVEPATCH
1333         unsigned long *livepatch_sp;
1334 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1335  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1336  #define TIF_SIGPENDING         1       /* signal pending */
1337  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1338 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1339 -                                          TIF_NEED_RESCHED */
1340 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1341  #define TIF_32BIT              4       /* 32 bit binary */
1342  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1343  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1344 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1345  #if defined(CONFIG_PPC64)
1346  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1347  #endif
1348 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1349 +                                          TIF_NEED_RESCHED */
1350  
1351  /* as above, but as bit values */
1352  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1353 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1354  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1355  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1356  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1357 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1358  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1359                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1360                                  _TIF_NOHZ)
1361  
1362  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1363                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1364 -                                _TIF_RESTORE_TM)
1365 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1366  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1367 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1368  
1369  /* Bits in local_flags */
1370  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1371 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1372 index c833d88c423d..96e9fbc3f684 100644
1373 --- a/arch/powerpc/kernel/asm-offsets.c
1374 +++ b/arch/powerpc/kernel/asm-offsets.c
1375 @@ -156,6 +156,7 @@ int main(void)
1376         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1377         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1378         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1379 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1380         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1381         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1382  
1383 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1384 index 3841d749a430..6dbaeff192b9 100644
1385 --- a/arch/powerpc/kernel/entry_32.S
1386 +++ b/arch/powerpc/kernel/entry_32.S
1387 @@ -835,7 +835,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1388         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1389         bne     restore
1390         andi.   r8,r8,_TIF_NEED_RESCHED
1391 +       bne+    1f
1392 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1393 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1394 +       bne     restore
1395 +       lwz     r0,TI_FLAGS(r9)
1396 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1397         beq+    restore
1398 +1:
1399         lwz     r3,_MSR(r1)
1400         andi.   r0,r3,MSR_EE    /* interrupts off? */
1401         beq     restore         /* don't schedule if so */
1402 @@ -846,11 +853,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1403          */
1404         bl      trace_hardirqs_off
1405  #endif
1406 -1:     bl      preempt_schedule_irq
1407 +2:     bl      preempt_schedule_irq
1408         CURRENT_THREAD_INFO(r9, r1)
1409         lwz     r3,TI_FLAGS(r9)
1410 -       andi.   r0,r3,_TIF_NEED_RESCHED
1411 -       bne-    1b
1412 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1413 +       bne-    2b
1414  #ifdef CONFIG_TRACE_IRQFLAGS
1415         /* And now, to properly rebalance the above, we tell lockdep they
1416          * are being turned back on, which will happen when we return
1417 @@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1418  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1419  
1420  do_work:                       /* r10 contains MSR_KERNEL here */
1421 -       andi.   r0,r9,_TIF_NEED_RESCHED
1422 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1423         beq     do_user_signal
1424  
1425  do_resched:                    /* r10 contains MSR_KERNEL here */
1426 @@ -1192,7 +1199,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1427         MTMSRD(r10)             /* disable interrupts */
1428         CURRENT_THREAD_INFO(r9, r1)
1429         lwz     r9,TI_FLAGS(r9)
1430 -       andi.   r0,r9,_TIF_NEED_RESCHED
1431 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1432         bne-    do_resched
1433         andi.   r0,r9,_TIF_USER_WORK_MASK
1434         beq     restore_user
1435 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1436 index 6432d4bf08c8..5509a26f1070 100644
1437 --- a/arch/powerpc/kernel/entry_64.S
1438 +++ b/arch/powerpc/kernel/entry_64.S
1439 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1440         bl      restore_math
1441         b       restore
1442  #endif
1443 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1444 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1445         beq     2f
1446         bl      restore_interrupts
1447         SCHEDULE_USER
1448 @@ -718,10 +718,18 @@ _GLOBAL(ret_from_except_lite)
1449  
1450  #ifdef CONFIG_PREEMPT
1451         /* Check if we need to preempt */
1452 -       andi.   r0,r4,_TIF_NEED_RESCHED
1453 -       beq+    restore
1454 -       /* Check that preempt_count() == 0 and interrupts are enabled */
1455         lwz     r8,TI_PREEMPT(r9)
1456 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1457 +       bne     restore
1458 +       andi.   r0,r4,_TIF_NEED_RESCHED
1459 +       bne+    check_count
1460 +
1461 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1462 +       beq+    restore
1463 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1464 +
1465 +       /* Check that preempt_count() == 0 and interrupts are enabled */
1466 +check_count:
1467         cmpwi   cr1,r8,0
1468         ld      r0,SOFTE(r1)
1469         cmpdi   r0,0
1470 @@ -738,7 +746,7 @@ _GLOBAL(ret_from_except_lite)
1471         /* Re-test flags and eventually loop */
1472         CURRENT_THREAD_INFO(r9, r1)
1473         ld      r4,TI_FLAGS(r9)
1474 -       andi.   r0,r4,_TIF_NEED_RESCHED
1475 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1476         bne     1b
1477  
1478         /*
1479 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1480 index 3c05c311e35e..f83f6ac1274d 100644
1481 --- a/arch/powerpc/kernel/irq.c
1482 +++ b/arch/powerpc/kernel/irq.c
1483 @@ -638,6 +638,7 @@ void irq_ctx_init(void)
1484         }
1485  }
1486  
1487 +#ifndef CONFIG_PREEMPT_RT_FULL
1488  void do_softirq_own_stack(void)
1489  {
1490         struct thread_info *curtp, *irqtp;
1491 @@ -655,6 +656,7 @@ void do_softirq_own_stack(void)
1492         if (irqtp->flags)
1493                 set_bits(irqtp->flags, &curtp->flags);
1494  }
1495 +#endif
1496  
1497  irq_hw_number_t virq_to_hw(unsigned int virq)
1498  {
1499 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1500 index 030d72df5dd5..b471a709e100 100644
1501 --- a/arch/powerpc/kernel/misc_32.S
1502 +++ b/arch/powerpc/kernel/misc_32.S
1503 @@ -41,6 +41,7 @@
1504   * We store the saved ksp_limit in the unused part
1505   * of the STACK_FRAME_OVERHEAD
1506   */
1507 +#ifndef CONFIG_PREEMPT_RT_FULL
1508  _GLOBAL(call_do_softirq)
1509         mflr    r0
1510         stw     r0,4(r1)
1511 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1512         stw     r10,THREAD+KSP_LIMIT(r2)
1513         mtlr    r0
1514         blr
1515 +#endif
1516  
1517  /*
1518   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1519 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1520 index 4f178671f230..39e7d84a3492 100644
1521 --- a/arch/powerpc/kernel/misc_64.S
1522 +++ b/arch/powerpc/kernel/misc_64.S
1523 @@ -31,6 +31,7 @@
1524  
1525         .text
1526  
1527 +#ifndef CONFIG_PREEMPT_RT_FULL
1528  _GLOBAL(call_do_softirq)
1529         mflr    r0
1530         std     r0,16(r1)
1531 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1532         ld      r0,16(r1)
1533         mtlr    r0
1534         blr
1535 +#endif
1536  
1537  _GLOBAL(call_do_irq)
1538         mflr    r0
1539 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1540 index 029be26b5a17..9528089ea142 100644
1541 --- a/arch/powerpc/kvm/Kconfig
1542 +++ b/arch/powerpc/kvm/Kconfig
1543 @@ -175,6 +175,7 @@ config KVM_E500MC
1544  config KVM_MPIC
1545         bool "KVM in-kernel MPIC emulation"
1546         depends on KVM && E500
1547 +       depends on !PREEMPT_RT_FULL
1548         select HAVE_KVM_IRQCHIP
1549         select HAVE_KVM_IRQFD
1550         select HAVE_KVM_IRQ_ROUTING
1551 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1552 index e48462447ff0..2670cee66064 100644
1553 --- a/arch/powerpc/platforms/ps3/device-init.c
1554 +++ b/arch/powerpc/platforms/ps3/device-init.c
1555 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1556         }
1557         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1558  
1559 -       res = wait_event_interruptible(dev->done.wait,
1560 +       res = swait_event_interruptible(dev->done.wait,
1561                                        dev->done.done || kthread_should_stop());
1562         if (kthread_should_stop())
1563                 res = -EINTR;
1564 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1565 index 6c0378c0b8b5..abd58b4dff97 100644
1566 --- a/arch/sh/kernel/irq.c
1567 +++ b/arch/sh/kernel/irq.c
1568 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1569         hardirq_ctx[cpu] = NULL;
1570  }
1571  
1572 +#ifndef CONFIG_PREEMPT_RT_FULL
1573  void do_softirq_own_stack(void)
1574  {
1575         struct thread_info *curctx;
1576 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1577                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1578         );
1579  }
1580 +#endif
1581  #else
1582  static inline void handle_one_irq(unsigned int irq)
1583  {
1584 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1585 index 165ecdd24d22..b68a464a22be 100644
1586 --- a/arch/sparc/Kconfig
1587 +++ b/arch/sparc/Kconfig
1588 @@ -194,12 +194,10 @@ config NR_CPUS
1589  source kernel/Kconfig.hz
1590  
1591  config RWSEM_GENERIC_SPINLOCK
1592 -       bool
1593 -       default y if SPARC32
1594 +       def_bool PREEMPT_RT_FULL
1595  
1596  config RWSEM_XCHGADD_ALGORITHM
1597 -       bool
1598 -       default y if SPARC64
1599 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1600  
1601  config GENERIC_HWEIGHT
1602         bool
1603 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1604 index 34a7930b76ef..773740521008 100644
1605 --- a/arch/sparc/kernel/irq_64.c
1606 +++ b/arch/sparc/kernel/irq_64.c
1607 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1608         set_irq_regs(old_regs);
1609  }
1610  
1611 +#ifndef CONFIG_PREEMPT_RT_FULL
1612  void do_softirq_own_stack(void)
1613  {
1614         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1615 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1616         __asm__ __volatile__("mov %0, %%sp"
1617                              : : "r" (orig_sp));
1618  }
1619 +#endif
1620  
1621  #ifdef CONFIG_HOTPLUG_CPU
1622  void fixup_irqs(void)
1623 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1624 index bada636d1065..f8a995c90c01 100644
1625 --- a/arch/x86/Kconfig
1626 +++ b/arch/x86/Kconfig
1627 @@ -17,6 +17,7 @@ config X86_64
1628  ### Arch settings
1629  config X86
1630         def_bool y
1631 +       select HAVE_PREEMPT_LAZY
1632         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1633         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1634         select ANON_INODES
1635 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1636         def_bool y
1637         depends on ISA_DMA_API
1638  
1639 +config RWSEM_GENERIC_SPINLOCK
1640 +       def_bool PREEMPT_RT_FULL
1641 +
1642  config RWSEM_XCHGADD_ALGORITHM
1643 -       def_bool y
1644 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1645  
1646  config GENERIC_CALIBRATE_DELAY
1647         def_bool y
1648 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1649  config MAXSMP
1650         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1651         depends on X86_64 && SMP && DEBUG_KERNEL
1652 -       select CPUMASK_OFFSTACK
1653 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1654         ---help---
1655           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1656           If unsure, say N.
1657 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1658 index aa8b0672f87a..2429414bfc71 100644
1659 --- a/arch/x86/crypto/aesni-intel_glue.c
1660 +++ b/arch/x86/crypto/aesni-intel_glue.c
1661 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1662         err = blkcipher_walk_virt(desc, &walk);
1663         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1664  
1665 -       kernel_fpu_begin();
1666         while ((nbytes = walk.nbytes)) {
1667 +               kernel_fpu_begin();
1668                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1669 -                             nbytes & AES_BLOCK_MASK);
1670 +                               nbytes & AES_BLOCK_MASK);
1671 +               kernel_fpu_end();
1672                 nbytes &= AES_BLOCK_SIZE - 1;
1673                 err = blkcipher_walk_done(desc, &walk, nbytes);
1674         }
1675 -       kernel_fpu_end();
1676  
1677         return err;
1678  }
1679 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1680         err = blkcipher_walk_virt(desc, &walk);
1681         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1682  
1683 -       kernel_fpu_begin();
1684         while ((nbytes = walk.nbytes)) {
1685 +               kernel_fpu_begin();
1686                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1687                               nbytes & AES_BLOCK_MASK);
1688 +               kernel_fpu_end();
1689                 nbytes &= AES_BLOCK_SIZE - 1;
1690                 err = blkcipher_walk_done(desc, &walk, nbytes);
1691         }
1692 -       kernel_fpu_end();
1693  
1694         return err;
1695  }
1696 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1697         err = blkcipher_walk_virt(desc, &walk);
1698         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1699  
1700 -       kernel_fpu_begin();
1701         while ((nbytes = walk.nbytes)) {
1702 +               kernel_fpu_begin();
1703                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1704                               nbytes & AES_BLOCK_MASK, walk.iv);
1705 +               kernel_fpu_end();
1706                 nbytes &= AES_BLOCK_SIZE - 1;
1707                 err = blkcipher_walk_done(desc, &walk, nbytes);
1708         }
1709 -       kernel_fpu_end();
1710  
1711         return err;
1712  }
1713 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1714         err = blkcipher_walk_virt(desc, &walk);
1715         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1716  
1717 -       kernel_fpu_begin();
1718         while ((nbytes = walk.nbytes)) {
1719 +               kernel_fpu_begin();
1720                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1721                               nbytes & AES_BLOCK_MASK, walk.iv);
1722 +               kernel_fpu_end();
1723                 nbytes &= AES_BLOCK_SIZE - 1;
1724                 err = blkcipher_walk_done(desc, &walk, nbytes);
1725         }
1726 -       kernel_fpu_end();
1727  
1728         return err;
1729  }
1730 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1731         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1732         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1733  
1734 -       kernel_fpu_begin();
1735         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1736 +               kernel_fpu_begin();
1737                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1738                                       nbytes & AES_BLOCK_MASK, walk.iv);
1739 +               kernel_fpu_end();
1740                 nbytes &= AES_BLOCK_SIZE - 1;
1741                 err = blkcipher_walk_done(desc, &walk, nbytes);
1742         }
1743         if (walk.nbytes) {
1744 +               kernel_fpu_begin();
1745                 ctr_crypt_final(ctx, &walk);
1746 +               kernel_fpu_end();
1747                 err = blkcipher_walk_done(desc, &walk, 0);
1748         }
1749 -       kernel_fpu_end();
1750  
1751         return err;
1752  }
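
The aesni hunks above, like the cast5 and glue_helper hunks that follow, move kernel_fpu_begin()/kernel_fpu_end() inside the per-chunk walk loop, so the FPU-owned (and preemption-disabled) region covers a single blkcipher_walk step instead of the whole request; on PREEMPT_RT this bounds the latency a large crypto request can introduce. A schematic before/after of the pattern, with do_one_chunk() as a stand-in for the aesni/cast5/glue worker calls (a sketch only, not part of the patch):

	/* Before: one long FPU/preempt-off section around the whole request. */
	kernel_fpu_begin();
	while ((nbytes = walk.nbytes)) {
		do_one_chunk(ctx, &walk);       /* placeholder for aesni_ecb_enc() etc. */
		nbytes &= AES_BLOCK_SIZE - 1;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}
	kernel_fpu_end();

	/* After: begin/end per chunk, so preemption is possible between chunks. */
	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();
		do_one_chunk(ctx, &walk);
		kernel_fpu_end();
		nbytes &= AES_BLOCK_SIZE - 1;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}
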
1753 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1754 index 8648158f3916..d7699130ee36 100644
1755 --- a/arch/x86/crypto/cast5_avx_glue.c
1756 +++ b/arch/x86/crypto/cast5_avx_glue.c
1757 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1758  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1759                      bool enc)
1760  {
1761 -       bool fpu_enabled = false;
1762 +       bool fpu_enabled;
1763         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1764         const unsigned int bsize = CAST5_BLOCK_SIZE;
1765         unsigned int nbytes;
1766 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1767                 u8 *wsrc = walk->src.virt.addr;
1768                 u8 *wdst = walk->dst.virt.addr;
1769  
1770 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1771 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1772  
1773                 /* Process multi-block batch */
1774                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1775 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1776                 } while (nbytes >= bsize);
1777  
1778  done:
1779 +               cast5_fpu_end(fpu_enabled);
1780                 err = blkcipher_walk_done(desc, walk, nbytes);
1781         }
1782 -
1783 -       cast5_fpu_end(fpu_enabled);
1784         return err;
1785  }
1786  
1787 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1788  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1789                        struct scatterlist *src, unsigned int nbytes)
1790  {
1791 -       bool fpu_enabled = false;
1792 +       bool fpu_enabled;
1793         struct blkcipher_walk walk;
1794         int err;
1795  
1796 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1797         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1798  
1799         while ((nbytes = walk.nbytes)) {
1800 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1801 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1802                 nbytes = __cbc_decrypt(desc, &walk);
1803 +               cast5_fpu_end(fpu_enabled);
1804                 err = blkcipher_walk_done(desc, &walk, nbytes);
1805         }
1806 -
1807 -       cast5_fpu_end(fpu_enabled);
1808         return err;
1809  }
1810  
1811 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1812  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1813                      struct scatterlist *src, unsigned int nbytes)
1814  {
1815 -       bool fpu_enabled = false;
1816 +       bool fpu_enabled;
1817         struct blkcipher_walk walk;
1818         int err;
1819  
1820 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1821         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1822  
1823         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1824 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1825 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1826                 nbytes = __ctr_crypt(desc, &walk);
1827 +               cast5_fpu_end(fpu_enabled);
1828                 err = blkcipher_walk_done(desc, &walk, nbytes);
1829         }
1830  
1831 -       cast5_fpu_end(fpu_enabled);
1832 -
1833         if (walk.nbytes) {
1834                 ctr_crypt_final(desc, &walk);
1835                 err = blkcipher_walk_done(desc, &walk, 0);
1836 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1837 index 6a85598931b5..3a506ce7ed93 100644
1838 --- a/arch/x86/crypto/glue_helper.c
1839 +++ b/arch/x86/crypto/glue_helper.c
1840 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1841         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1842         const unsigned int bsize = 128 / 8;
1843         unsigned int nbytes, i, func_bytes;
1844 -       bool fpu_enabled = false;
1845 +       bool fpu_enabled;
1846         int err;
1847  
1848         err = blkcipher_walk_virt(desc, walk);
1849 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1850                 u8 *wdst = walk->dst.virt.addr;
1851  
1852                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1853 -                                            desc, fpu_enabled, nbytes);
1854 +                                            desc, false, nbytes);
1855  
1856                 for (i = 0; i < gctx->num_funcs; i++) {
1857                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1858 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1859                 }
1860  
1861  done:
1862 +               glue_fpu_end(fpu_enabled);
1863                 err = blkcipher_walk_done(desc, walk, nbytes);
1864         }
1865  
1866 -       glue_fpu_end(fpu_enabled);
1867         return err;
1868  }
1869  
1870 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1871                             struct scatterlist *src, unsigned int nbytes)
1872  {
1873         const unsigned int bsize = 128 / 8;
1874 -       bool fpu_enabled = false;
1875 +       bool fpu_enabled;
1876         struct blkcipher_walk walk;
1877         int err;
1878  
1879 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1880  
1881         while ((nbytes = walk.nbytes)) {
1882                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1883 -                                            desc, fpu_enabled, nbytes);
1884 +                                            desc, false, nbytes);
1885                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1886 +               glue_fpu_end(fpu_enabled);
1887                 err = blkcipher_walk_done(desc, &walk, nbytes);
1888         }
1889  
1890 -       glue_fpu_end(fpu_enabled);
1891         return err;
1892  }
1893  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1894 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1895                           struct scatterlist *src, unsigned int nbytes)
1896  {
1897         const unsigned int bsize = 128 / 8;
1898 -       bool fpu_enabled = false;
1899 +       bool fpu_enabled;
1900         struct blkcipher_walk walk;
1901         int err;
1902  
1903 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1904  
1905         while ((nbytes = walk.nbytes) >= bsize) {
1906                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1907 -                                            desc, fpu_enabled, nbytes);
1908 +                                            desc, false, nbytes);
1909                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1910 +               glue_fpu_end(fpu_enabled);
1911                 err = blkcipher_walk_done(desc, &walk, nbytes);
1912         }
1913  
1914 -       glue_fpu_end(fpu_enabled);
1915 -
1916         if (walk.nbytes) {
1917                 glue_ctr_crypt_final_128bit(
1918                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1919 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1920                           void *tweak_ctx, void *crypt_ctx)
1921  {
1922         const unsigned int bsize = 128 / 8;
1923 -       bool fpu_enabled = false;
1924 +       bool fpu_enabled;
1925         struct blkcipher_walk walk;
1926         int err;
1927  
1928 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1929  
1930         /* set minimum length to bsize, for tweak_fn */
1931         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1932 -                                    desc, fpu_enabled,
1933 +                                    desc, false,
1934                                      nbytes < bsize ? bsize : nbytes);
1935 -
1936         /* calculate first value of T */
1937         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1938 +       glue_fpu_end(fpu_enabled);
1939  
1940         while (nbytes) {
1941 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1942 +                               desc, false, nbytes);
1943                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1944  
1945 +               glue_fpu_end(fpu_enabled);
1946                 err = blkcipher_walk_done(desc, &walk, nbytes);
1947                 nbytes = walk.nbytes;
1948         }
1949 -
1950 -       glue_fpu_end(fpu_enabled);
1951 -
1952         return err;
1953  }
1954  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1955 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1956 index bdd9cc59d20f..56d01a339ba4 100644
1957 --- a/arch/x86/entry/common.c
1958 +++ b/arch/x86/entry/common.c
1959 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1960  
1961  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1962         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1963 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1964 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1965  
1966  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1967  {
1968 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1969                 /* We have work to do. */
1970                 local_irq_enable();
1971  
1972 -               if (cached_flags & _TIF_NEED_RESCHED)
1973 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
1974                         schedule();
1975  
1976 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
1977 +               if (unlikely(current->forced_info.si_signo)) {
1978 +                       struct task_struct *t = current;
1979 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
1980 +                       t->forced_info.si_signo = 0;
1981 +               }
1982 +#endif
1983                 if (cached_flags & _TIF_UPROBE)
1984                         uprobe_notify_resume(regs);
1985  
1986 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
1987 index edba8606b99a..4a3389535fc6 100644
1988 --- a/arch/x86/entry/entry_32.S
1989 +++ b/arch/x86/entry/entry_32.S
1990 @@ -308,8 +308,25 @@ END(ret_from_exception)
1991  ENTRY(resume_kernel)
1992         DISABLE_INTERRUPTS(CLBR_ANY)
1993  need_resched:
1994 +       # preempt count == 0 and NEED_RESCHED set?
1995         cmpl    $0, PER_CPU_VAR(__preempt_count)
1996 +#ifndef CONFIG_PREEMPT_LAZY
1997         jnz     restore_all
1998 +#else
1999 +       jz test_int_off
2000 +
2001 +       # at least preempt count == 0 ?
2002 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2003 +       jne restore_all
2004 +
2005 +       movl    PER_CPU_VAR(current_task), %ebp
2006 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2007 +       jnz restore_all
2008 +
2009 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2010 +       jz restore_all
2011 +test_int_off:
2012 +#endif
2013         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2014         jz      restore_all
2015         call    preempt_schedule_irq
2016 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2017 index ef766a358b37..28401f826ab1 100644
2018 --- a/arch/x86/entry/entry_64.S
2019 +++ b/arch/x86/entry/entry_64.S
2020 @@ -546,7 +546,23 @@ GLOBAL(retint_user)
2021         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2022         jnc     1f
2023  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2024 +#ifndef CONFIG_PREEMPT_LAZY
2025         jnz     1f
2026 +#else
2027 +       jz      do_preempt_schedule_irq
2028 +
2029 +       # at least preempt count == 0 ?
2030 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2031 +       jnz     1f
2032 +
2033 +       movq    PER_CPU_VAR(current_task), %rcx
2034 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2035 +       jnz     1f
2036 +
2037 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2038 +       jnc     1f
2039 +do_preempt_schedule_irq:
2040 +#endif
2041         call    preempt_schedule_irq
2042         jmp     0b
2043  1:
2044 @@ -894,6 +910,7 @@ EXPORT_SYMBOL(native_load_gs_index)
2045         jmp     2b
2046         .previous
2047  
2048 +#ifndef CONFIG_PREEMPT_RT_FULL
2049  /* Call softirq on interrupt stack. Interrupts are off. */
2050  ENTRY(do_softirq_own_stack)
2051         pushq   %rbp
2052 @@ -906,6 +923,7 @@ ENTRY(do_softirq_own_stack)
2053         decl    PER_CPU_VAR(irq_count)
2054         ret
2055  END(do_softirq_own_stack)
2056 +#endif
2057  
2058  #ifdef CONFIG_XEN
2059  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2060 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2061 index 17f218645701..11bd1b7ee6eb 100644
2062 --- a/arch/x86/include/asm/preempt.h
2063 +++ b/arch/x86/include/asm/preempt.h
2064 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2065   * a decrement which hits zero means we have no preempt_count and should
2066   * reschedule.
2067   */
2068 -static __always_inline bool __preempt_count_dec_and_test(void)
2069 +static __always_inline bool ____preempt_count_dec_and_test(void)
2070  {
2071         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2072  }
2073  
2074 +static __always_inline bool __preempt_count_dec_and_test(void)
2075 +{
2076 +       if (____preempt_count_dec_and_test())
2077 +               return true;
2078 +#ifdef CONFIG_PREEMPT_LAZY
2079 +       if (current_thread_info()->preempt_lazy_count)
2080 +               return false;
2081 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2082 +#else
2083 +       return false;
2084 +#endif
2085 +}
2086 +
2087  /*
2088   * Returns true when we need to resched and can (barring IRQ state).
2089   */
2090  static __always_inline bool should_resched(int preempt_offset)
2091  {
2092 +#ifdef CONFIG_PREEMPT_LAZY
2093 +       u32 tmp;
2094 +
2095 +       tmp = raw_cpu_read_4(__preempt_count);
2096 +       if (tmp == preempt_offset)
2097 +               return true;
2098 +
2099 +       /* preempt count == 0 ? */
2100 +       tmp &= ~PREEMPT_NEED_RESCHED;
2101 +       if (tmp)
2102 +               return false;
2103 +       if (current_thread_info()->preempt_lazy_count)
2104 +               return false;
2105 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2106 +#else
2107         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2108 +#endif
2109  }
2110  
2111  #ifdef CONFIG_PREEMPT
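
Taken together, the x86 helpers above encode when a reschedule request may actually preempt: a regular NEED_RESCHED fires as soon as the preempt count reaches zero, while a lazy request is additionally held off as long as any preempt_lazy section is open. A minimal C restatement of that rule follows; lazy_resched_allowed() is an illustrative name, not a kernel symbol, and the sketch assumes the thread_info fields added by this patch.

/* Sketch of the rule implemented by __preempt_count_dec_and_test() and
 * should_resched() above in the CONFIG_PREEMPT_LAZY case.
 */
static bool lazy_resched_allowed(void)
{
	if (preempt_count())                            /* hard preempt-off section open */
		return false;
	if (current_thread_info()->preempt_lazy_count)  /* lazy section open */
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY); /* lazy request pending? */
}
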
2112 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2113 index 8af22be0fe61..d1328789b759 100644
2114 --- a/arch/x86/include/asm/signal.h
2115 +++ b/arch/x86/include/asm/signal.h
2116 @@ -27,6 +27,19 @@ typedef struct {
2117  #define SA_IA32_ABI    0x02000000u
2118  #define SA_X32_ABI     0x01000000u
2119  
2120 +/*
2121 + * Because some traps use the IST stack, we must keep preemption
2122 + * disabled while calling do_trap(), but do_trap() may call
2123 + * force_sig_info(), which will grab the signal spinlocks for the
2124 + * task; on PREEMPT_RT_FULL those are mutexes.  With
2125 + * ARCH_RT_DELAYS_SIGNAL_SEND defined, force_sig_info() will instead
2126 + * set TIF_NOTIFY_RESUME and arrange for the signal to be sent on
2127 + * exit from the trap.
2128 + */
2129 +#if defined(CONFIG_PREEMPT_RT_FULL)
2130 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2131 +#endif
2132 +
2133  #ifndef CONFIG_COMPAT
2134  typedef sigset_t compat_sigset_t;
2135  #endif
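
The comment above describes the two halves of delayed signal delivery; the consumer is the exit_to_usermode_loop() hunk in arch/x86/entry/common.c earlier in this patch, while the producer lives in the kernel/signal.c part of the patch, which is not shown here. The sketch below condenses both halves for the self-signalling case the comment describes; treat the producer side as a paraphrase, and note that forced_info is the task_struct field this patch adds elsewhere.

/* Producer (paraphrased): from atomic trap context, stash the signal
 * and request a pass through the return-to-user work loop instead of
 * taking the sighand lock, which is a sleeping lock on RT.
 */
	current->forced_info = *info;
	set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);

/* Consumer, as added to exit_to_usermode_loop() above: */
#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
	if (unlikely(current->forced_info.si_signo)) {
		force_sig_info(current->forced_info.si_signo,
			       &current->forced_info, current);
		current->forced_info.si_signo = 0;
	}
#endif
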
2136 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2137 index 58505f01962f..02fa39652cd6 100644
2138 --- a/arch/x86/include/asm/stackprotector.h
2139 +++ b/arch/x86/include/asm/stackprotector.h
2140 @@ -59,7 +59,7 @@
2141   */
2142  static __always_inline void boot_init_stack_canary(void)
2143  {
2144 -       u64 canary;
2145 +       u64 uninitialized_var(canary);
2146         u64 tsc;
2147  
2148  #ifdef CONFIG_X86_64
2149 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2150          * of randomness. The TSC only matters for very early init,
2151          * there it already has some randomness on most systems. Later
2152          * on during the bootup the random pool has true entropy too.
2153 +        *
2154 +        * For preempt-rt we need to weaken the randomness a bit, as
2155 +        * we can't call into the random generator from atomic context
2156 +        * due to locking constraints. We just leave canary
2157 +        * uninitialized and use the TSC based randomness on top of it.
2158          */
2159 +#ifndef CONFIG_PREEMPT_RT_FULL
2160         get_random_bytes(&canary, sizeof(canary));
2161 +#endif
2162         tsc = rdtsc();
2163         canary += tsc + (tsc << 32UL);
2164  
2165 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2166 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2167 --- a/arch/x86/include/asm/thread_info.h
2168 +++ b/arch/x86/include/asm/thread_info.h
2169 @@ -54,11 +54,14 @@ struct task_struct;
2170  
2171  struct thread_info {
2172         unsigned long           flags;          /* low level flags */
2173 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2174 +                                                          <0 => BUG */
2175  };
2176  
2177  #define INIT_THREAD_INFO(tsk)                  \
2178  {                                              \
2179         .flags          = 0,                    \
2180 +       .preempt_lazy_count = 0,                \
2181  }
2182  
2183  #define init_stack             (init_thread_union.stack)
2184 @@ -67,6 +70,10 @@ struct thread_info {
2185  
2186  #include <asm/asm-offsets.h>
2187  
2188 +#define GET_THREAD_INFO(reg) \
2189 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2190 +       _ASM_SUB $(THREAD_SIZE),reg ;
2191 +
2192  #endif
2193  
2194  /*
2195 @@ -85,6 +92,7 @@ struct thread_info {
2196  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2197  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2198  #define TIF_SECCOMP            8       /* secure computing */
2199 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2200  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2201  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2202  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2203 @@ -108,6 +116,7 @@ struct thread_info {
2204  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2206  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2207 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2208  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2209  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2210  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2211 @@ -143,6 +152,8 @@ struct thread_info {
2212  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2213  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2214  
2215 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2216 +
2217  #define STACK_WARN             (THREAD_SIZE/8)
2218  
2219  /*
2220 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2221 index 57ab86d94d64..35d25e27180f 100644
2222 --- a/arch/x86/include/asm/uv/uv_bau.h
2223 +++ b/arch/x86/include/asm/uv/uv_bau.h
2224 @@ -624,9 +624,9 @@ struct bau_control {
2225         cycles_t                send_message;
2226         cycles_t                period_end;
2227         cycles_t                period_time;
2228 -       spinlock_t              uvhub_lock;
2229 -       spinlock_t              queue_lock;
2230 -       spinlock_t              disable_lock;
2231 +       raw_spinlock_t          uvhub_lock;
2232 +       raw_spinlock_t          queue_lock;
2233 +       raw_spinlock_t          disable_lock;
2234         /* tunables */
2235         int                     max_concurr;
2236         int                     max_concurr_const;
2237 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2238   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2239   * on equal.
2240   */
2241 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2242 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2243  {
2244 -       spin_lock(lock);
2245 +       raw_spin_lock(lock);
2246         if (atomic_read(v) >= u) {
2247 -               spin_unlock(lock);
2248 +               raw_spin_unlock(lock);
2249                 return 0;
2250         }
2251         atomic_inc(v);
2252 -       spin_unlock(lock);
2253 +       raw_spin_unlock(lock);
2254         return 1;
2255  }
2256  
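
The three BAU locks become raw_spinlock_t because on PREEMPT_RT_FULL an ordinary spinlock_t is a sleeping lock, and these locks are taken from TLB-shootdown and interrupt paths that must not sleep; the tlb_uv.c hunks later in this patch make the matching lock/unlock conversions. As a rough usage sketch built from the fields visible in this header (not copied verbatim from tlb_uv.c), the throttling helper above would be called like this:

	/* Only start another descriptor while fewer than max_concurr are
	 * active; the check-and-increment holds the raw lock only briefly.
	 */
	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
				  &hmaster->active_descriptor_count,
				  hmaster->max_concurr)) {
		/* over the limit: throttle before sending the next broadcast */
	}
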
2257 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2258 index 931ced8ca345..167975ac8af7 100644
2259 --- a/arch/x86/kernel/acpi/boot.c
2260 +++ b/arch/x86/kernel/acpi/boot.c
2261 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2262   *             ->ioapic_mutex
2263   *                     ->ioapic_lock
2264   */
2265 +#ifdef CONFIG_X86_IO_APIC
2266  static DEFINE_MUTEX(acpi_ioapic_lock);
2267 +#endif
2268  
2269  /* --------------------------------------------------------------------------
2270                                Boot-time Configuration
2271 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2272 index 48e6d84f173e..0b5a8b994f65 100644
2273 --- a/arch/x86/kernel/apic/io_apic.c
2274 +++ b/arch/x86/kernel/apic/io_apic.c
2275 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2276  static inline bool ioapic_irqd_mask(struct irq_data *data)
2277  {
2278         /* If we are moving the irq we need to mask it */
2279 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2280 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2281 +                    !irqd_irq_inprogress(data))) {
2282                 mask_ioapic_irq(data);
2283                 return true;
2284         }
2285 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2286 index c62e015b126c..0cc71257fca6 100644
2287 --- a/arch/x86/kernel/asm-offsets.c
2288 +++ b/arch/x86/kernel/asm-offsets.c
2289 @@ -36,6 +36,7 @@ void common(void) {
2290  
2291         BLANK();
2292         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2293 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2294         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2295  
2296         BLANK();
2297 @@ -91,4 +92,5 @@ void common(void) {
2298  
2299         BLANK();
2300         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2301 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2302  }
2303 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2304 index a7fdf453d895..e3a0e969a66e 100644
2305 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2306 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2307 @@ -41,6 +41,8 @@
2308  #include <linux/debugfs.h>
2309  #include <linux/irq_work.h>
2310  #include <linux/export.h>
2311 +#include <linux/jiffies.h>
2312 +#include <linux/swork.h>
2313  #include <linux/jump_label.h>
2314  
2315  #include <asm/processor.h>
2316 @@ -1317,7 +1319,7 @@ void mce_log_therm_throt_event(__u64 status)
2317  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2318  
2319  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2320 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2321 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2322  
2323  static unsigned long mce_adjust_timer_default(unsigned long interval)
2324  {
2325 @@ -1326,32 +1328,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2326  
2327  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2328  
2329 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2330 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2331  {
2332 -       unsigned long when = jiffies + interval;
2333 -       unsigned long flags;
2334 -
2335 -       local_irq_save(flags);
2336 -
2337 -       if (timer_pending(t)) {
2338 -               if (time_before(when, t->expires))
2339 -                       mod_timer(t, when);
2340 -       } else {
2341 -               t->expires = round_jiffies(when);
2342 -               add_timer_on(t, smp_processor_id());
2343 -       }
2344 -
2345 -       local_irq_restore(flags);
2346 +       if (!interval)
2347 +               return HRTIMER_NORESTART;
2348 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2349 +       return HRTIMER_RESTART;
2350  }
2351  
2352 -static void mce_timer_fn(unsigned long data)
2353 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2354  {
2355 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2356 -       int cpu = smp_processor_id();
2357         unsigned long iv;
2358  
2359 -       WARN_ON(cpu != data);
2360 -
2361         iv = __this_cpu_read(mce_next_interval);
2362  
2363         if (mce_available(this_cpu_ptr(&cpu_info))) {
2364 @@ -1374,7 +1362,7 @@ static void mce_timer_fn(unsigned long data)
2365  
2366  done:
2367         __this_cpu_write(mce_next_interval, iv);
2368 -       __restart_timer(t, iv);
2369 +       return __restart_timer(timer, iv);
2370  }
2371  
2372  /*
2373 @@ -1382,7 +1370,7 @@ static void mce_timer_fn(unsigned long data)
2374   */
2375  void mce_timer_kick(unsigned long interval)
2376  {
2377 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2378 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2379         unsigned long iv = __this_cpu_read(mce_next_interval);
2380  
2381         __restart_timer(t, interval);
2382 @@ -1397,7 +1385,7 @@ static void mce_timer_delete_all(void)
2383         int cpu;
2384  
2385         for_each_online_cpu(cpu)
2386 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2387 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2388  }
2389  
2390  static void mce_do_trigger(struct work_struct *work)
2391 @@ -1407,6 +1395,56 @@ static void mce_do_trigger(struct work_struct *work)
2392  
2393  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2394  
2395 +static void __mce_notify_work(struct swork_event *event)
2396 +{
2397 +       /* Not more than two messages every minute */
2398 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2399 +
2400 +       /* wake processes polling /dev/mcelog */
2401 +       wake_up_interruptible(&mce_chrdev_wait);
2402 +
2403 +       /*
2404 +        * There is no risk of missing notifications because
2405 +        * work_pending is always cleared before the function is
2406 +        * executed.
2407 +        */
2408 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2409 +               schedule_work(&mce_trigger_work);
2410 +
2411 +       if (__ratelimit(&ratelimit))
2412 +               pr_info(HW_ERR "Machine check events logged\n");
2413 +}
2414 +
2415 +#ifdef CONFIG_PREEMPT_RT_FULL
2416 +static bool notify_work_ready __read_mostly;
2417 +static struct swork_event notify_work;
2418 +
2419 +static int mce_notify_work_init(void)
2420 +{
2421 +       int err;
2422 +
2423 +       err = swork_get();
2424 +       if (err)
2425 +               return err;
2426 +
2427 +       INIT_SWORK(&notify_work, __mce_notify_work);
2428 +       notify_work_ready = true;
2429 +       return 0;
2430 +}
2431 +
2432 +static void mce_notify_work(void)
2433 +{
2434 +       if (notify_work_ready)
2435 +               swork_queue(&notify_work);
2436 +}
2437 +#else
2438 +static void mce_notify_work(void)
2439 +{
2440 +       __mce_notify_work(NULL);
2441 +}
2442 +static inline int mce_notify_work_init(void) { return 0; }
2443 +#endif
2444 +
2445  /*
2446   * Notify the user(s) about new machine check events.
2447   * Can be called from interrupt context, but not from machine check/NMI
2448 @@ -1414,19 +1452,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2449   */
2450  int mce_notify_irq(void)
2451  {
2452 -       /* Not more than two messages every minute */
2453 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2454 -
2455         if (test_and_clear_bit(0, &mce_need_notify)) {
2456 -               /* wake processes polling /dev/mcelog */
2457 -               wake_up_interruptible(&mce_chrdev_wait);
2458 -
2459 -               if (mce_helper[0])
2460 -                       schedule_work(&mce_trigger_work);
2461 -
2462 -               if (__ratelimit(&ratelimit))
2463 -                       pr_info(HW_ERR "Machine check events logged\n");
2464 -
2465 +               mce_notify_work();
2466                 return 1;
2467         }
2468         return 0;
2469 @@ -1732,7 +1759,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2470         }
2471  }
2472  
2473 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2474 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2475  {
2476         unsigned long iv = check_interval * HZ;
2477  
2478 @@ -1741,16 +1768,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2479  
2480         per_cpu(mce_next_interval, cpu) = iv;
2481  
2482 -       t->expires = round_jiffies(jiffies + iv);
2483 -       add_timer_on(t, cpu);
2484 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2485 +                       0, HRTIMER_MODE_REL_PINNED);
2486  }
2487  
2488  static void __mcheck_cpu_init_timer(void)
2489  {
2490 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2491 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2492         unsigned int cpu = smp_processor_id();
2493  
2494 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2495 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2496 +       t->function = mce_timer_fn;
2497         mce_start_timer(cpu, t);
2498  }
2499  
2500 @@ -2475,6 +2503,8 @@ static void mce_disable_cpu(void *h)
2501         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2502                 return;
2503  
2504 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2505 +
2506         if (!(action & CPU_TASKS_FROZEN))
2507                 cmci_clear();
2508  
2509 @@ -2497,6 +2527,7 @@ static void mce_reenable_cpu(void *h)
2510                 if (b->init)
2511                         wrmsrl(msr_ops.ctl(i), b->ctl);
2512         }
2513 +       __mcheck_cpu_init_timer();
2514  }
2515  
2516  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2517 @@ -2504,7 +2535,6 @@ static int
2518  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2519  {
2520         unsigned int cpu = (unsigned long)hcpu;
2521 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2522  
2523         switch (action & ~CPU_TASKS_FROZEN) {
2524         case CPU_ONLINE:
2525 @@ -2524,11 +2554,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2526                 break;
2527         case CPU_DOWN_PREPARE:
2528                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2529 -               del_timer_sync(t);
2530                 break;
2531         case CPU_DOWN_FAILED:
2532                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2533 -               mce_start_timer(cpu, t);
2534                 break;
2535         }
2536  
2537 @@ -2567,6 +2595,10 @@ static __init int mcheck_init_device(void)
2538                 goto err_out;
2539         }
2540  
2541 +       err = mce_notify_work_init();
2542 +       if (err)
2543 +               goto err_out;
2544 +
2545         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2546                 err = -ENOMEM;
2547                 goto err_out;
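
The mce changes above do two RT-relevant things: the per-CPU polling timer becomes an hrtimer, and on PREEMPT_RT_FULL the user notification (waking /dev/mcelog pollers and kicking the trigger helper) is pushed out of the timer/exception path through swork, so nothing that may sleep runs from atomic context. The deferral uses exactly the API the hunk introduces; a stripped-down sketch of the pattern, with setup_mce_notify() as an illustrative name:

/* One-time setup from process context. */
static struct swork_event notify_work;

static int setup_mce_notify(void)
{
	int err = swork_get();          /* bring up the swork worker thread */

	if (err)
		return err;
	INIT_SWORK(&notify_work, __mce_notify_work);
	return 0;
}

/* From the timer/exception path, where sleeping is not allowed: */
	swork_queue(&notify_work);      /* __mce_notify_work() runs later, preemptibly */
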
2548 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2549 index 1f38d9a4d9de..053bf3b2ef39 100644
2550 --- a/arch/x86/kernel/irq_32.c
2551 +++ b/arch/x86/kernel/irq_32.c
2552 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2553                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2554  }
2555  
2556 +#ifndef CONFIG_PREEMPT_RT_FULL
2557  void do_softirq_own_stack(void)
2558  {
2559         struct irq_stack *irqstk;
2560 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2561  
2562         call_on_stack(__do_softirq, isp);
2563  }
2564 +#endif
2565  
2566  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2567  {
2568 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2569 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2570 --- a/arch/x86/kernel/process_32.c
2571 +++ b/arch/x86/kernel/process_32.c
2572 @@ -35,6 +35,7 @@
2573  #include <linux/uaccess.h>
2574  #include <linux/io.h>
2575  #include <linux/kdebug.h>
2576 +#include <linux/highmem.h>
2577  
2578  #include <asm/pgtable.h>
2579  #include <asm/ldt.h>
2580 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2581  }
2582  EXPORT_SYMBOL_GPL(start_thread);
2583  
2584 +#ifdef CONFIG_PREEMPT_RT_FULL
2585 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2586 +{
2587 +       int i;
2588 +
2589 +       /*
2590 +        * Clear @prev's kmap_atomic mappings
2591 +        */
2592 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2593 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2594 +               pte_t *ptep = kmap_pte - idx;
2595 +
2596 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2597 +       }
2598 +       /*
2599 +        * Restore @next_p's kmap_atomic mappings
2600 +        */
2601 +       for (i = 0; i < next_p->kmap_idx; i++) {
2602 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2603 +
2604 +               if (!pte_none(next_p->kmap_pte[i]))
2605 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2606 +       }
2607 +}
2608 +#else
2609 +static inline void
2610 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2611 +#endif
2612 +
2613  
2614  /*
2615   *     switch_to(x,y) should switch tasks from x to y.
2616 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2617                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2618                 __switch_to_xtra(prev_p, next_p, tss);
2619  
2620 +       switch_kmaps(prev_p, next_p);
2621 +
2622         /*
2623          * Leave lazy mode, flushing any hypercalls made here.
2624          * This must be done before restoring TLS segments so
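
switch_kmaps() is needed because, with the preempt_disable_nort() change in arch/x86/mm/highmem_32.c further down, kmap_atomic() no longer disables preemption on PREEMPT_RT_FULL, so a task can be switched out while it still owns atomic kmap slots. The mappings are therefore recorded per task and re-installed when the task runs again; the recording side, as added to kmap_atomic_prot() and __kunmap_atomic() below, reduces to:

	/* map: remember the PTE so switch_kmaps() can re-install it */
#ifdef CONFIG_PREEMPT_RT_FULL
	current->kmap_pte[type] = pte;
#endif
	set_pte(kmap_pte - idx, pte);

	/* unmap: clear the per-task slot again */
#ifdef CONFIG_PREEMPT_RT_FULL
	current->kmap_pte[type] = __pte(0);
#endif
	kpte_clear_flush(kmap_pte - idx, vaddr);
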
2625 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2626 index 6f69340f9fa3..d47f204a0fbe 100644
2627 --- a/arch/x86/kvm/lapic.c
2628 +++ b/arch/x86/kvm/lapic.c
2629 @@ -1939,6 +1939,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2630         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2631                      HRTIMER_MODE_ABS_PINNED);
2632         apic->lapic_timer.timer.function = apic_timer_fn;
2633 +       apic->lapic_timer.timer.irqsafe = 1;
2634  
2635         /*
2636          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2637 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2638 index f3648c978d2f..d0d0901d1c56 100644
2639 --- a/arch/x86/kvm/x86.c
2640 +++ b/arch/x86/kvm/x86.c
2641 @@ -5930,6 +5930,13 @@ int kvm_arch_init(void *opaque)
2642                 goto out;
2643         }
2644  
2645 +#ifdef CONFIG_PREEMPT_RT_FULL
2646 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2647 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2648 +               return -EOPNOTSUPP;
2649 +       }
2650 +#endif
2651 +
2652         r = kvm_mmu_module_init();
2653         if (r)
2654                 goto out_free_percpu;
2655 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2656 index 6d18b70ed5a9..f752724c22e8 100644
2657 --- a/arch/x86/mm/highmem_32.c
2658 +++ b/arch/x86/mm/highmem_32.c
2659 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2660   */
2661  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2662  {
2663 +       pte_t pte = mk_pte(page, prot);
2664         unsigned long vaddr;
2665         int idx, type;
2666  
2667 -       preempt_disable();
2668 +       preempt_disable_nort();
2669         pagefault_disable();
2670  
2671         if (!PageHighMem(page))
2672 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2673         idx = type + KM_TYPE_NR*smp_processor_id();
2674         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2675         BUG_ON(!pte_none(*(kmap_pte-idx)));
2676 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2677 +#ifdef CONFIG_PREEMPT_RT_FULL
2678 +       current->kmap_pte[type] = pte;
2679 +#endif
2680 +       set_pte(kmap_pte-idx, pte);
2681         arch_flush_lazy_mmu_mode();
2682  
2683         return (void *)vaddr;
2684 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2685                  * is a bad idea also, in case the page changes cacheability
2686                  * attributes or becomes a protected page in a hypervisor.
2687                  */
2688 +#ifdef CONFIG_PREEMPT_RT_FULL
2689 +               current->kmap_pte[type] = __pte(0);
2690 +#endif
2691                 kpte_clear_flush(kmap_pte-idx, vaddr);
2692                 kmap_atomic_idx_pop();
2693                 arch_flush_lazy_mmu_mode();
2694 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2695  #endif
2696  
2697         pagefault_enable();
2698 -       preempt_enable();
2699 +       preempt_enable_nort();
2700  }
2701  EXPORT_SYMBOL(__kunmap_atomic);
2702  
2703 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2704 index ada98b39b8ad..585f6829653b 100644
2705 --- a/arch/x86/mm/iomap_32.c
2706 +++ b/arch/x86/mm/iomap_32.c
2707 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2708  
2709  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2710  {
2711 +       pte_t pte = pfn_pte(pfn, prot);
2712         unsigned long vaddr;
2713         int idx, type;
2714  
2715 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2716         type = kmap_atomic_idx_push();
2717         idx = type + KM_TYPE_NR * smp_processor_id();
2718         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2719 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2720 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2721 +
2722 +#ifdef CONFIG_PREEMPT_RT_FULL
2723 +       current->kmap_pte[type] = pte;
2724 +#endif
2725 +       set_pte(kmap_pte - idx, pte);
2726         arch_flush_lazy_mmu_mode();
2727  
2728         return (void *)vaddr;
2729 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2730                  * is a bad idea also, in case the page changes cacheability
2731                  * attributes or becomes a protected page in a hypervisor.
2732                  */
2733 +#ifdef CONFIG_PREEMPT_RT_FULL
2734 +               current->kmap_pte[type] = __pte(0);
2735 +#endif
2736                 kpte_clear_flush(kmap_pte-idx, vaddr);
2737                 kmap_atomic_idx_pop();
2738         }
2739 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2740 index 9e42842e924a..5398f97172f9 100644
2741 --- a/arch/x86/platform/uv/tlb_uv.c
2742 +++ b/arch/x86/platform/uv/tlb_uv.c
2743 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2744  
2745                 quiesce_local_uvhub(hmaster);
2746  
2747 -               spin_lock(&hmaster->queue_lock);
2748 +               raw_spin_lock(&hmaster->queue_lock);
2749                 reset_with_ipi(&bau_desc->distribution, bcp);
2750 -               spin_unlock(&hmaster->queue_lock);
2751 +               raw_spin_unlock(&hmaster->queue_lock);
2752  
2753                 end_uvhub_quiesce(hmaster);
2754  
2755 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2756  
2757                 quiesce_local_uvhub(hmaster);
2758  
2759 -               spin_lock(&hmaster->queue_lock);
2760 +               raw_spin_lock(&hmaster->queue_lock);
2761                 reset_with_ipi(&bau_desc->distribution, bcp);
2762 -               spin_unlock(&hmaster->queue_lock);
2763 +               raw_spin_unlock(&hmaster->queue_lock);
2764  
2765                 end_uvhub_quiesce(hmaster);
2766  
2767 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2768         cycles_t tm1;
2769  
2770         hmaster = bcp->uvhub_master;
2771 -       spin_lock(&hmaster->disable_lock);
2772 +       raw_spin_lock(&hmaster->disable_lock);
2773         if (!bcp->baudisabled) {
2774                 stat->s_bau_disabled++;
2775                 tm1 = get_cycles();
2776 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2777                         }
2778                 }
2779         }
2780 -       spin_unlock(&hmaster->disable_lock);
2781 +       raw_spin_unlock(&hmaster->disable_lock);
2782  }
2783  
2784  static void count_max_concurr(int stat, struct bau_control *bcp,
2785 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2786   */
2787  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2788  {
2789 -       spinlock_t *lock = &hmaster->uvhub_lock;
2790 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2791         atomic_t *v;
2792  
2793         v = &hmaster->active_descriptor_count;
2794 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2795         struct bau_control *hmaster;
2796  
2797         hmaster = bcp->uvhub_master;
2798 -       spin_lock(&hmaster->disable_lock);
2799 +       raw_spin_lock(&hmaster->disable_lock);
2800         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2801                 stat->s_bau_reenabled++;
2802                 for_each_present_cpu(tcpu) {
2803 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2804                                 tbcp->period_giveups = 0;
2805                         }
2806                 }
2807 -               spin_unlock(&hmaster->disable_lock);
2808 +               raw_spin_unlock(&hmaster->disable_lock);
2809                 return 0;
2810         }
2811 -       spin_unlock(&hmaster->disable_lock);
2812 +       raw_spin_unlock(&hmaster->disable_lock);
2813         return -1;
2814  }
2815  
2816 @@ -1940,9 +1940,9 @@ static void __init init_per_cpu_tunables(void)
2817                 bcp->cong_reps                  = congested_reps;
2818                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2819                 bcp->giveup_limit               = giveup_limit;
2820 -               spin_lock_init(&bcp->queue_lock);
2821 -               spin_lock_init(&bcp->uvhub_lock);
2822 -               spin_lock_init(&bcp->disable_lock);
2823 +               raw_spin_lock_init(&bcp->queue_lock);
2824 +               raw_spin_lock_init(&bcp->uvhub_lock);
2825 +               raw_spin_lock_init(&bcp->disable_lock);
2826         }
2827  }
2828  
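The tlb_uv.c hunks convert the BAU queue, uvhub and disable locks from spinlock_t to raw_spinlock_t: on PREEMPT_RT a spinlock_t becomes a sleeping lock, which is not acceptable in these short, interrupt-off sections, while raw_spinlock_t keeps busy-waiting. A loose userspace analogue follows, assuming pthreads and not tied to the kernel API: pthread_spinlock_t busy-waits like a raw spinlock, whereas pthread_mutex_t may put the caller to sleep, much like RT's spinlock_t.

/* Loose analogue: a busy-waiting lock vs. a sleeping lock, mirroring
 * raw_spinlock_t vs. spinlock_t on an RT kernel. */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t raw_lock;     /* busy-waits, never sleeps */
static pthread_mutex_t sleep_lock = PTHREAD_MUTEX_INITIALIZER;
static long counter;

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                pthread_spin_lock(&raw_lock);   /* short critical section */
                counter++;
                pthread_spin_unlock(&raw_lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        /* the mutex would be the choice for long or preemptible sections */
        pthread_mutex_lock(&sleep_lock);
        printf("counter = %ld\n", counter);
        pthread_mutex_unlock(&sleep_lock);
        return 0;
}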
2829 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2830 index b333fc45f9ec..8b85916e6986 100644
2831 --- a/arch/x86/platform/uv/uv_time.c
2832 +++ b/arch/x86/platform/uv/uv_time.c
2833 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2834  
2835  /* There is one of these allocated per node */
2836  struct uv_rtc_timer_head {
2837 -       spinlock_t      lock;
2838 +       raw_spinlock_t  lock;
2839         /* next cpu waiting for timer, local node relative: */
2840         int             next_cpu;
2841         /* number of cpus on this node: */
2842 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2843                                 uv_rtc_deallocate_timers();
2844                                 return -ENOMEM;
2845                         }
2846 -                       spin_lock_init(&head->lock);
2847 +                       raw_spin_lock_init(&head->lock);
2848                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2849                         head->next_cpu = -1;
2850                         blade_info[bid] = head;
2851 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2852         unsigned long flags;
2853         int next_cpu;
2854  
2855 -       spin_lock_irqsave(&head->lock, flags);
2856 +       raw_spin_lock_irqsave(&head->lock, flags);
2857  
2858         next_cpu = head->next_cpu;
2859         *t = expires;
2860 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2861                 if (uv_setup_intr(cpu, expires)) {
2862                         *t = ULLONG_MAX;
2863                         uv_rtc_find_next_timer(head, pnode);
2864 -                       spin_unlock_irqrestore(&head->lock, flags);
2865 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2866                         return -ETIME;
2867                 }
2868         }
2869  
2870 -       spin_unlock_irqrestore(&head->lock, flags);
2871 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2872         return 0;
2873  }
2874  
2875 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2876         unsigned long flags;
2877         int rc = 0;
2878  
2879 -       spin_lock_irqsave(&head->lock, flags);
2880 +       raw_spin_lock_irqsave(&head->lock, flags);
2881  
2882         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2883                 rc = 1;
2884 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2885                         uv_rtc_find_next_timer(head, pnode);
2886         }
2887  
2888 -       spin_unlock_irqrestore(&head->lock, flags);
2889 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2890  
2891         return rc;
2892  }
2893 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2894  static cycle_t uv_read_rtc(struct clocksource *cs)
2895  {
2896         unsigned long offset;
2897 +       cycle_t cycles;
2898  
2899 +       preempt_disable();
2900         if (uv_get_min_hub_revision_id() == 1)
2901                 offset = 0;
2902         else
2903                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2904  
2905 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2906 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2907 +       preempt_enable();
2908 +
2909 +       return cycles;
2910  }
2911  
2912  /*
2913 diff --git a/block/blk-core.c b/block/blk-core.c
2914 index 14d7c0740dc0..dfd905bea77c 100644
2915 --- a/block/blk-core.c
2916 +++ b/block/blk-core.c
2917 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2918  
2919         INIT_LIST_HEAD(&rq->queuelist);
2920         INIT_LIST_HEAD(&rq->timeout_list);
2921 +#ifdef CONFIG_PREEMPT_RT_FULL
2922 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2923 +#endif
2924         rq->cpu = -1;
2925         rq->q = q;
2926         rq->__sector = (sector_t) -1;
2927 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2928   **/
2929  void blk_start_queue(struct request_queue *q)
2930  {
2931 -       WARN_ON(!irqs_disabled());
2932 +       WARN_ON_NONRT(!irqs_disabled());
2933  
2934         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2935         __blk_run_queue(q);
2936 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2937                 if (nowait)
2938                         return -EBUSY;
2939  
2940 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2941 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2942                                 !atomic_read(&q->mq_freeze_depth) ||
2943                                 blk_queue_dying(q));
2944                 if (blk_queue_dying(q))
2945 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2946         struct request_queue *q =
2947                 container_of(ref, struct request_queue, q_usage_counter);
2948  
2949 -       wake_up_all(&q->mq_freeze_wq);
2950 +       swake_up_all(&q->mq_freeze_wq);
2951  }
2952  
2953  static void blk_rq_timed_out_timer(unsigned long data)
2954 @@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
2955         q->bypass_depth = 1;
2956         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2957  
2958 -       init_waitqueue_head(&q->mq_freeze_wq);
2959 +       init_swait_queue_head(&q->mq_freeze_wq);
2960  
2961         /*
2962          * Init percpu_ref in atomic mode so that it's faster to shutdown.
2963 @@ -3177,7 +3180,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2964                 blk_run_queue_async(q);
2965         else
2966                 __blk_run_queue(q);
2967 -       spin_unlock(q->queue_lock);
2968 +       spin_unlock_irq(q->queue_lock);
2969  }
2970  
2971  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2972 @@ -3225,7 +3228,6 @@ EXPORT_SYMBOL(blk_check_plugged);
2973  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2974  {
2975         struct request_queue *q;
2976 -       unsigned long flags;
2977         struct request *rq;
2978         LIST_HEAD(list);
2979         unsigned int depth;
2980 @@ -3245,11 +3247,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2981         q = NULL;
2982         depth = 0;
2983  
2984 -       /*
2985 -        * Save and disable interrupts here, to avoid doing it for every
2986 -        * queue lock we have to take.
2987 -        */
2988 -       local_irq_save(flags);
2989         while (!list_empty(&list)) {
2990                 rq = list_entry_rq(list.next);
2991                 list_del_init(&rq->queuelist);
2992 @@ -3262,7 +3259,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2993                                 queue_unplugged(q, depth, from_schedule);
2994                         q = rq->q;
2995                         depth = 0;
2996 -                       spin_lock(q->queue_lock);
2997 +                       spin_lock_irq(q->queue_lock);
2998                 }
2999  
3000                 /*
3001 @@ -3289,8 +3286,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3002          */
3003         if (q)
3004                 queue_unplugged(q, depth, from_schedule);
3005 -
3006 -       local_irq_restore(flags);
3007  }
3008  
3009  void blk_finish_plug(struct blk_plug *plug)
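The blk-core.c hunks above switch mq_freeze_wq from a regular wait queue to a simple wait queue (swait), whose internal lock is a raw spinlock and whose wakeups are therefore usable from contexts that must not sleep on RT, and they drop the irq-save across the whole plug flush in favour of spin_lock_irq() per queue. Below is a minimal userspace sketch of the wait-until-predicate pattern that swait_event()/swake_up_all() implement, assuming pthreads; wait_until_zero() and unfreeze() are made-up names for the example.

/* Sketch of the wait_event()/wake_up_all() pattern: sleep until a
 * predicate holds, re-checking it after every wakeup. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int freeze_depth = 1;            /* stands in for mq_freeze_depth */

static void wait_until_zero(void)
{
        pthread_mutex_lock(&lock);
        while (freeze_depth != 0)       /* re-check predicate, no lost wakeups */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void unfreeze(void)              /* analogue of the swake_up_all() side */
{
        pthread_mutex_lock(&lock);
        freeze_depth = 0;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

static void *waiter(void *arg)
{
        (void)arg;
        wait_until_zero();
        printf("queue unfrozen, proceeding\n");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        unfreeze();
        pthread_join(t, NULL);
        return 0;
}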
3010 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3011 index 381cb50a673c..dc8785233d94 100644
3012 --- a/block/blk-ioc.c
3013 +++ b/block/blk-ioc.c
3014 @@ -7,6 +7,7 @@
3015  #include <linux/bio.h>
3016  #include <linux/blkdev.h>
3017  #include <linux/slab.h>
3018 +#include <linux/delay.h>
3019  
3020  #include "blk.h"
3021  
3022 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3023                         spin_unlock(q->queue_lock);
3024                 } else {
3025                         spin_unlock_irqrestore(&ioc->lock, flags);
3026 -                       cpu_relax();
3027 +                       cpu_chill();
3028                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3029                 }
3030         }
3031 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3032                         spin_unlock(icq->q->queue_lock);
3033                 } else {
3034                         spin_unlock_irqrestore(&ioc->lock, flags);
3035 -                       cpu_relax();
3036 +                       cpu_chill();
3037                         goto retry;
3038                 }
3039         }
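Replacing cpu_relax() with cpu_chill() in the two retry loops above matters on RT: the loop waits for a lock whose holder may itself be preempted, and a higher-priority task spinning on it could livelock; cpu_chill() sleeps briefly instead of burning the CPU (on non-RT kernels it falls back to cpu_relax()). A hedged userspace sketch of the same retry-with-a-nap pattern, using trylock plus nanosleep; the two mutexes stand in for ioc->lock and queue_lock.

/* Sketch of the cpu_chill() idea: if the second lock cannot be taken,
 * drop what we hold and sleep briefly instead of busy-spinning. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t ioc_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

static void chill(void)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* ~1 ms */
        nanosleep(&ts, NULL);
}

static void release_with_retry(void)
{
        for (;;) {
                pthread_mutex_lock(&ioc_lock);
                if (pthread_mutex_trylock(&queue_lock) == 0) {
                        /* both locks held: do the real work here */
                        pthread_mutex_unlock(&queue_lock);
                        pthread_mutex_unlock(&ioc_lock);
                        return;
                }
                /* contended: back off and let the lock holder run */
                pthread_mutex_unlock(&ioc_lock);
                chill();
        }
}

int main(void)
{
        release_with_retry();
        printf("done\n");
        return 0;
}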
3040 diff --git a/block/blk-mq.c b/block/blk-mq.c
3041 index ad459e4e8071..1bfacb205bfa 100644
3042 --- a/block/blk-mq.c
3043 +++ b/block/blk-mq.c
3044 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3045  
3046  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3047  {
3048 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3049 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3050  }
3051  
3052  /*
3053 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3054         WARN_ON_ONCE(freeze_depth < 0);
3055         if (!freeze_depth) {
3056                 percpu_ref_reinit(&q->q_usage_counter);
3057 -               wake_up_all(&q->mq_freeze_wq);
3058 +               swake_up_all(&q->mq_freeze_wq);
3059         }
3060  }
3061  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3062 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3063          * dying, we need to ensure that processes currently waiting on
3064          * the queue are notified as well.
3065          */
3066 -       wake_up_all(&q->mq_freeze_wq);
3067 +       swake_up_all(&q->mq_freeze_wq);
3068  }
3069  
3070  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3071 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3072         rq->resid_len = 0;
3073         rq->sense = NULL;
3074  
3075 +#ifdef CONFIG_PREEMPT_RT_FULL
3076 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3077 +#endif
3078         INIT_LIST_HEAD(&rq->timeout_list);
3079         rq->timeout = 0;
3080  
3081 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3082  }
3083  EXPORT_SYMBOL(blk_mq_end_request);
3084  
3085 +#ifdef CONFIG_PREEMPT_RT_FULL
3086 +
3087 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3088 +{
3089 +       struct request *rq = container_of(work, struct request, work);
3090 +
3091 +       rq->q->softirq_done_fn(rq);
3092 +}
3093 +
3094 +#else
3095 +
3096  static void __blk_mq_complete_request_remote(void *data)
3097  {
3098         struct request *rq = data;
3099 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3100         rq->q->softirq_done_fn(rq);
3101  }
3102  
3103 +#endif
3104 +
3105  static void blk_mq_ipi_complete_request(struct request *rq)
3106  {
3107         struct blk_mq_ctx *ctx = rq->mq_ctx;
3108 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3109                 return;
3110         }
3111  
3112 -       cpu = get_cpu();
3113 +       cpu = get_cpu_light();
3114         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3115                 shared = cpus_share_cache(cpu, ctx->cpu);
3116  
3117         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3118 +#ifdef CONFIG_PREEMPT_RT_FULL
3119 +               schedule_work_on(ctx->cpu, &rq->work);
3120 +#else
3121                 rq->csd.func = __blk_mq_complete_request_remote;
3122                 rq->csd.info = rq;
3123                 rq->csd.flags = 0;
3124                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3125 +#endif
3126         } else {
3127                 rq->q->softirq_done_fn(rq);
3128         }
3129 -       put_cpu();
3130 +       put_cpu_light();
3131  }
3132  
3133  static void __blk_mq_complete_request(struct request *rq)
3134 @@ -917,14 +937,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3135                 return;
3136  
3137         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3138 -               int cpu = get_cpu();
3139 +               int cpu = get_cpu_light();
3140                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3141                         __blk_mq_run_hw_queue(hctx);
3142 -                       put_cpu();
3143 +                       put_cpu_light();
3144                         return;
3145                 }
3146  
3147 -               put_cpu();
3148 +               put_cpu_light();
3149         }
3150  
3151         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
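In the blk-mq.c hunks above the RT build stops using smp_call_function_single_async() for remote completions and instead schedules a work item (rq->work) on the target CPU, while get_cpu()/put_cpu() become the _light variants, which do not disable preemption on RT. The sketch below shows only the general shape of handing a completion callback to another thread instead of running it in the submitting context; it assumes pthreads, and the single-slot queue is a deliberate simplification, not the kernel workqueue API.

/* Sketch: hand a completion callback to a worker thread instead of
 * executing it in the submitting context (single-slot queue for brevity). */
#include <pthread.h>
#include <stdio.h>

struct work {
        void (*fn)(void *);
        void *arg;
};

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  qcond = PTHREAD_COND_INITIALIZER;
static struct work pending;             /* .fn == NULL means "empty" */

static void queue_work_item(void (*fn)(void *), void *arg)
{
        pthread_mutex_lock(&qlock);
        while (pending.fn)              /* wait for the single slot */
                pthread_cond_wait(&qcond, &qlock);
        pending.fn = fn;
        pending.arg = arg;
        pthread_cond_broadcast(&qcond);
        pthread_mutex_unlock(&qlock);
}

static void *worker(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&qlock);
        while (!pending.fn)
                pthread_cond_wait(&qcond, &qlock);
        pending.fn(pending.arg);        /* run the deferred completion */
        pending.fn = NULL;
        pthread_cond_broadcast(&qcond);
        pthread_mutex_unlock(&qlock);
        return NULL;
}

static void complete_request(void *arg)
{
        printf("request %s completed by worker\n", (const char *)arg);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        queue_work_item(complete_request, "rq0");
        pthread_join(t, NULL);
        return 0;
}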
3152 diff --git a/block/blk-mq.h b/block/blk-mq.h
3153 index e5d25249028c..1e846b842eab 100644
3154 --- a/block/blk-mq.h
3155 +++ b/block/blk-mq.h
3156 @@ -72,12 +72,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3157   */
3158  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3159  {
3160 -       return __blk_mq_get_ctx(q, get_cpu());
3161 +       return __blk_mq_get_ctx(q, get_cpu_light());
3162  }
3163  
3164  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3165  {
3166 -       put_cpu();
3167 +       put_cpu_light();
3168  }
3169  
3170  struct blk_mq_alloc_data {
3171 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3172 index 06cf9807f49a..c40342643ca0 100644
3173 --- a/block/blk-softirq.c
3174 +++ b/block/blk-softirq.c
3175 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3176                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3177  
3178         local_irq_restore(flags);
3179 +       preempt_check_resched_rt();
3180  }
3181  
3182  /*
3183 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3184                          this_cpu_ptr(&blk_cpu_done));
3185         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3186         local_irq_enable();
3187 +       preempt_check_resched_rt();
3188  
3189         return 0;
3190  }
3191 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3192                 goto do_local;
3193  
3194         local_irq_restore(flags);
3195 +       preempt_check_resched_rt();
3196  }
3197  
3198  /**
3199 diff --git a/block/bounce.c b/block/bounce.c
3200 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3201 --- a/block/bounce.c
3202 +++ b/block/bounce.c
3203 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3204         unsigned long flags;
3205         unsigned char *vto;
3206  
3207 -       local_irq_save(flags);
3208 +       local_irq_save_nort(flags);
3209         vto = kmap_atomic(to->bv_page);
3210         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3211         kunmap_atomic(vto);
3212 -       local_irq_restore(flags);
3213 +       local_irq_restore_nort(flags);
3214  }
3215  
3216  #else /* CONFIG_HIGHMEM */
3217 diff --git a/crypto/algapi.c b/crypto/algapi.c
3218 index df939b54b09f..efe5e06adcf7 100644
3219 --- a/crypto/algapi.c
3220 +++ b/crypto/algapi.c
3221 @@ -718,13 +718,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3222  
3223  int crypto_register_notifier(struct notifier_block *nb)
3224  {
3225 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3226 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3227  }
3228  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3229  
3230  int crypto_unregister_notifier(struct notifier_block *nb)
3231  {
3232 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3233 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3234  }
3235  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3236  
3237 diff --git a/crypto/api.c b/crypto/api.c
3238 index bbc147cb5dec..bc1a848f02ec 100644
3239 --- a/crypto/api.c
3240 +++ b/crypto/api.c
3241 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3242  DECLARE_RWSEM(crypto_alg_sem);
3243  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3244  
3245 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3246 +SRCU_NOTIFIER_HEAD(crypto_chain);
3247  EXPORT_SYMBOL_GPL(crypto_chain);
3248  
3249  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3250 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3251  {
3252         int ok;
3253  
3254 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3255 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3256         if (ok == NOTIFY_DONE) {
3257                 request_module("cryptomgr");
3258 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3259 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3260         }
3261  
3262         return ok;
3263 diff --git a/crypto/internal.h b/crypto/internal.h
3264 index 7eefcdb00227..0ecc7f5a2f40 100644
3265 --- a/crypto/internal.h
3266 +++ b/crypto/internal.h
3267 @@ -47,7 +47,7 @@ struct crypto_larval {
3268  
3269  extern struct list_head crypto_alg_list;
3270  extern struct rw_semaphore crypto_alg_sem;
3271 -extern struct blocking_notifier_head crypto_chain;
3272 +extern struct srcu_notifier_head crypto_chain;
3273  
3274  #ifdef CONFIG_PROC_FS
3275  void __init crypto_init_proc(void);
3276 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3277  
3278  static inline void crypto_notify(unsigned long val, void *v)
3279  {
3280 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3281 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3282  }
3283  
3284  #endif /* _CRYPTO_INTERNAL_H */
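The three crypto hunks above change the notifier chain from a blocking notifier (protected by an rw-semaphore) to an SRCU notifier, so the call path no longer takes that semaphore; on RT this keeps crypto_notify() usable from contexts where blocking on the semaphore is not acceptable. A rough userspace analogue, assuming pthreads and not the kernel notifier API: callbacks are invoked under a read lock so callers can traverse the chain without blocking each other, while the rare registration path takes the write lock.

/* Rough analogue of a notifier chain whose call path only needs the
 * read side of a lock (registration is the rare, write-locked case). */
#include <pthread.h>
#include <stdio.h>

#define MAX_NOTIFIERS 8

typedef int (*notifier_fn)(unsigned long event, void *data);

static pthread_rwlock_t chain_lock = PTHREAD_RWLOCK_INITIALIZER;
static notifier_fn chain[MAX_NOTIFIERS];
static int chain_len;

static int register_notifier(notifier_fn fn)
{
        pthread_rwlock_wrlock(&chain_lock);
        if (chain_len == MAX_NOTIFIERS) {
                pthread_rwlock_unlock(&chain_lock);
                return -1;
        }
        chain[chain_len++] = fn;
        pthread_rwlock_unlock(&chain_lock);
        return 0;
}

static void call_chain(unsigned long event, void *data)
{
        pthread_rwlock_rdlock(&chain_lock);     /* readers do not block each other */
        for (int i = 0; i < chain_len; i++)
                chain[i](event, data);
        pthread_rwlock_unlock(&chain_lock);
}

static int print_event(unsigned long event, void *data)
{
        printf("notifier saw event %lu (%s)\n", event, (const char *)data);
        return 0;
}

int main(void)
{
        register_notifier(print_event);
        call_chain(1, "alg registered");
        return 0;
}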
3285 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3286 index 750fa824d42c..441edf51484a 100644
3287 --- a/drivers/acpi/acpica/acglobal.h
3288 +++ b/drivers/acpi/acpica/acglobal.h
3289 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3290   * interrupt level
3291   */
3292  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3293 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3294 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3295  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3296  
3297  /* Mutex for _OSI support */
3298 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3299 index 3b7fb99362b6..696bf8e62afb 100644
3300 --- a/drivers/acpi/acpica/hwregs.c
3301 +++ b/drivers/acpi/acpica/hwregs.c
3302 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3303                           ACPI_BITMASK_ALL_FIXED_STATUS,
3304                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3305  
3306 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3307 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3308  
3309         /* Clear the fixed events in PM1 A/B */
3310  
3311         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3312                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3313  
3314 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3315 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3316  
3317         if (ACPI_FAILURE(status)) {
3318                 goto exit;
3319 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3320 index 98c26ff39409..6e236f2ea791 100644
3321 --- a/drivers/acpi/acpica/hwxface.c
3322 +++ b/drivers/acpi/acpica/hwxface.c
3323 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3324                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3325         }
3326  
3327 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3328 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3329  
3330         /*
3331          * At this point, we know that the parent register is one of the
3332 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3333  
3334  unlock_and_exit:
3335  
3336 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3337 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3338         return_ACPI_STATUS(status);
3339  }
3340  
3341 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3342 index 15073375bd00..357e7ca5a587 100644
3343 --- a/drivers/acpi/acpica/utmutex.c
3344 +++ b/drivers/acpi/acpica/utmutex.c
3345 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3346                 return_ACPI_STATUS (status);
3347         }
3348  
3349 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3350 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3351         if (ACPI_FAILURE (status)) {
3352                 return_ACPI_STATUS (status);
3353         }
3354 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3355         /* Delete the spinlocks */
3356  
3357         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3358 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3359 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3360         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3361  
3362         /* Delete the reader/writer lock */
3363 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3364 index 051b6158d1b7..7ad293bef6ed 100644
3365 --- a/drivers/ata/libata-sff.c
3366 +++ b/drivers/ata/libata-sff.c
3367 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3368         unsigned long flags;
3369         unsigned int consumed;
3370  
3371 -       local_irq_save(flags);
3372 +       local_irq_save_nort(flags);
3373         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3374 -       local_irq_restore(flags);
3375 +       local_irq_restore_nort(flags);
3376  
3377         return consumed;
3378  }
3379 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3380                 unsigned long flags;
3381  
3382                 /* FIXME: use a bounce buffer */
3383 -               local_irq_save(flags);
3384 +               local_irq_save_nort(flags);
3385                 buf = kmap_atomic(page);
3386  
3387                 /* do the actual data transfer */
3388 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3389                                        do_write);
3390  
3391                 kunmap_atomic(buf);
3392 -               local_irq_restore(flags);
3393 +               local_irq_restore_nort(flags);
3394         } else {
3395                 buf = page_address(page);
3396                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3397 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3398                 unsigned long flags;
3399  
3400                 /* FIXME: use bounce buffer */
3401 -               local_irq_save(flags);
3402 +               local_irq_save_nort(flags);
3403                 buf = kmap_atomic(page);
3404  
3405                 /* do the actual data transfer */
3406 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3407                                                                 count, rw);
3408  
3409                 kunmap_atomic(buf);
3410 -               local_irq_restore(flags);
3411 +               local_irq_restore_nort(flags);
3412         } else {
3413                 buf = page_address(page);
3414                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3415 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3416 index 4b5cd3a7b2b6..fa8329ad79fd 100644
3417 --- a/drivers/block/zram/zcomp.c
3418 +++ b/drivers/block/zram/zcomp.c
3419 @@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3420  
3421  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3422  {
3423 -       return *get_cpu_ptr(comp->stream);
3424 +       struct zcomp_strm *zstrm;
3425 +
3426 +       zstrm = *this_cpu_ptr(comp->stream);
3427 +       spin_lock(&zstrm->zcomp_lock);
3428 +       return zstrm;
3429  }
3430  
3431  void zcomp_stream_put(struct zcomp *comp)
3432  {
3433 -       put_cpu_ptr(comp->stream);
3434 +       struct zcomp_strm *zstrm;
3435 +
3436 +       zstrm = *this_cpu_ptr(comp->stream);
3437 +       spin_unlock(&zstrm->zcomp_lock);
3438  }
3439  
3440  int zcomp_compress(struct zcomp_strm *zstrm,
3441 @@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3442                         pr_err("Can't allocate a compression stream\n");
3443                         return NOTIFY_BAD;
3444                 }
3445 +               spin_lock_init(&zstrm->zcomp_lock);
3446                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3447                 break;
3448         case CPU_DEAD:
3449 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3450 index 478cac2ed465..f7a6efdc3285 100644
3451 --- a/drivers/block/zram/zcomp.h
3452 +++ b/drivers/block/zram/zcomp.h
3453 @@ -14,6 +14,7 @@ struct zcomp_strm {
3454         /* compression/decompression buffer */
3455         void *buffer;
3456         struct crypto_comp *tfm;
3457 +       spinlock_t zcomp_lock;
3458  };
3459  
3460  /* dynamic per-device compression frontend */
3461 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3462 index 5497f7fc44d0..3826072a23c5 100644
3463 --- a/drivers/block/zram/zram_drv.c
3464 +++ b/drivers/block/zram/zram_drv.c
3465 @@ -519,6 +519,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3466                 goto out_error;
3467         }
3468  
3469 +       zram_meta_init_table_locks(meta, disksize);
3470 +
3471         return meta;
3472  
3473  out_error:
3474 @@ -566,28 +568,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3475         struct zram_meta *meta = zram->meta;
3476         unsigned long handle;
3477         unsigned int size;
3478 +       struct zcomp_strm *zstrm;
3479  
3480 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3481 +       zram_lock_table(&meta->table[index]);
3482         handle = meta->table[index].handle;
3483         size = zram_get_obj_size(meta, index);
3484  
3485         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3486 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3487 +               zram_unlock_table(&meta->table[index]);
3488                 clear_page(mem);
3489                 return 0;
3490         }
3491  
3492 +       zstrm = zcomp_stream_get(zram->comp);
3493         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3494         if (size == PAGE_SIZE) {
3495                 copy_page(mem, cmem);
3496         } else {
3497 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3498 -
3499                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3500 -               zcomp_stream_put(zram->comp);
3501         }
3502         zs_unmap_object(meta->mem_pool, handle);
3503 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3504 +       zcomp_stream_put(zram->comp);
3505 +       zram_unlock_table(&meta->table[index]);
3506  
3507         /* Should NEVER happen. Return bio error if it does. */
3508         if (unlikely(ret)) {
3509 @@ -607,14 +609,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3510         struct zram_meta *meta = zram->meta;
3511         page = bvec->bv_page;
3512  
3513 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3514 +       zram_lock_table(&meta->table[index]);
3515         if (unlikely(!meta->table[index].handle) ||
3516                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3517 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3518 +               zram_unlock_table(&meta->table[index]);
3519                 handle_zero_page(bvec);
3520                 return 0;
3521         }
3522 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3523 +       zram_unlock_table(&meta->table[index]);
3524  
3525         if (is_partial_io(bvec))
3526                 /* Use  a temporary buffer to decompress the page */
3527 @@ -691,10 +693,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3528                 if (user_mem)
3529                         kunmap_atomic(user_mem);
3530                 /* Free memory associated with this sector now. */
3531 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3532 +               zram_lock_table(&meta->table[index]);
3533                 zram_free_page(zram, index);
3534                 zram_set_flag(meta, index, ZRAM_ZERO);
3535 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3536 +               zram_unlock_table(&meta->table[index]);
3537  
3538                 atomic64_inc(&zram->stats.zero_pages);
3539                 ret = 0;
3540 @@ -785,12 +787,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3541          * Free memory associated with this sector
3542          * before overwriting unused sectors.
3543          */
3544 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3545 +       zram_lock_table(&meta->table[index]);
3546         zram_free_page(zram, index);
3547  
3548         meta->table[index].handle = handle;
3549         zram_set_obj_size(meta, index, clen);
3550 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3551 +       zram_unlock_table(&meta->table[index]);
3552  
3553         /* Update stats */
3554         atomic64_add(clen, &zram->stats.compr_data_size);
3555 @@ -833,9 +835,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3556         }
3557  
3558         while (n >= PAGE_SIZE) {
3559 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3560 +               zram_lock_table(&meta->table[index]);
3561                 zram_free_page(zram, index);
3562 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3563 +               zram_unlock_table(&meta->table[index]);
3564                 atomic64_inc(&zram->stats.notify_free);
3565                 index++;
3566                 n -= PAGE_SIZE;
3567 @@ -964,9 +966,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3568         zram = bdev->bd_disk->private_data;
3569         meta = zram->meta;
3570  
3571 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3572 +       zram_lock_table(&meta->table[index]);
3573         zram_free_page(zram, index);
3574 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3575 +       zram_unlock_table(&meta->table[index]);
3576         atomic64_inc(&zram->stats.notify_free);
3577  }
3578  
3579 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3580 index 74fcf10da374..fd4020c99b9e 100644
3581 --- a/drivers/block/zram/zram_drv.h
3582 +++ b/drivers/block/zram/zram_drv.h
3583 @@ -73,6 +73,9 @@ enum zram_pageflags {
3584  struct zram_table_entry {
3585         unsigned long handle;
3586         unsigned long value;
3587 +#ifdef CONFIG_PREEMPT_RT_BASE
3588 +       spinlock_t lock;
3589 +#endif
3590  };
3591  
3592  struct zram_stats {
3593 @@ -120,4 +123,42 @@ struct zram {
3594          */
3595         bool claim; /* Protected by bdev->bd_mutex */
3596  };
3597 +
3598 +#ifndef CONFIG_PREEMPT_RT_BASE
3599 +static inline void zram_lock_table(struct zram_table_entry *table)
3600 +{
3601 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3602 +}
3603 +
3604 +static inline void zram_unlock_table(struct zram_table_entry *table)
3605 +{
3606 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3607 +}
3608 +
3609 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3610 +#else /* CONFIG_PREEMPT_RT_BASE */
3611 +static inline void zram_lock_table(struct zram_table_entry *table)
3612 +{
3613 +       spin_lock(&table->lock);
3614 +       __set_bit(ZRAM_ACCESS, &table->value);
3615 +}
3616 +
3617 +static inline void zram_unlock_table(struct zram_table_entry *table)
3618 +{
3619 +       __clear_bit(ZRAM_ACCESS, &table->value);
3620 +       spin_unlock(&table->lock);
3621 +}
3622 +
3623 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3624 +{
3625 +        size_t num_pages = disksize >> PAGE_SHIFT;
3626 +        size_t index;
3627 +
3628 +        for (index = 0; index < num_pages; index++) {
3629 +               spinlock_t *lock = &meta->table[index].lock;
3630 +               spin_lock_init(lock);
3631 +        }
3632 +}
3633 +#endif /* CONFIG_PREEMPT_RT_BASE */
3634 +
3635  #endif
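The zram hunks remove two preempt-disabling constructs: zcomp_stream_get() now serialises the per-CPU stream with a spinlock instead of get_cpu_ptr(), and on CONFIG_PREEMPT_RT_BASE the per-table-entry bit spinlock in table[index].value is replaced by a real spinlock_t, since a bit spinlock can neither sleep nor be substituted by RT. Below is a hedged userspace sketch of the per-entry lock layout and lock/unlock helpers, assuming pthreads; zram_lock_table()/zram_unlock_table() are the patch's names, everything else is illustrative.

/* Sketch: give every table entry its own lock instead of a lock bit
 * inside the flags word, so the lock can be a sleeping lock on RT. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct table_entry {
        unsigned long   handle;
        unsigned long   value;          /* flags; no longer doubles as a lock */
        pthread_mutex_t lock;           /* analogue of the added spinlock_t   */
};

static struct table_entry *table_alloc(size_t num_pages)
{
        struct table_entry *t = calloc(num_pages, sizeof(*t));

        if (!t)
                return NULL;
        for (size_t i = 0; i < num_pages; i++)
                pthread_mutex_init(&t[i].lock, NULL);   /* init_table_locks() */
        return t;
}

static void table_lock(struct table_entry *e)   { pthread_mutex_lock(&e->lock); }
static void table_unlock(struct table_entry *e) { pthread_mutex_unlock(&e->lock); }

int main(void)
{
        struct table_entry *t = table_alloc(16);

        if (!t)
                return 1;
        table_lock(&t[3]);              /* zram_lock_table(&meta->table[index]) */
        t[3].handle = 0xabc;
        table_unlock(&t[3]);
        printf("entry 3 handle = %#lx\n", t[3].handle);
        free(t);
        return 0;
}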
3636 diff --git a/drivers/char/random.c b/drivers/char/random.c
3637 index d6876d506220..0c60b1e54579 100644
3638 --- a/drivers/char/random.c
3639 +++ b/drivers/char/random.c
3640 @@ -1028,8 +1028,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3641         } sample;
3642         long delta, delta2, delta3;
3643  
3644 -       preempt_disable();
3645 -
3646         sample.jiffies = jiffies;
3647         sample.cycles = random_get_entropy();
3648         sample.num = num;
3649 @@ -1070,7 +1068,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3650                  */
3651                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3652         }
3653 -       preempt_enable();
3654  }
3655  
3656  void add_input_randomness(unsigned int type, unsigned int code,
3657 @@ -1123,28 +1120,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3658         return *(ptr + f->reg_idx++);
3659  }
3660  
3661 -void add_interrupt_randomness(int irq, int irq_flags)
3662 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3663  {
3664         struct entropy_store    *r;
3665         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3666 -       struct pt_regs          *regs = get_irq_regs();
3667         unsigned long           now = jiffies;
3668         cycles_t                cycles = random_get_entropy();
3669         __u32                   c_high, j_high;
3670 -       __u64                   ip;
3671         unsigned long           seed;
3672         int                     credit = 0;
3673  
3674         if (cycles == 0)
3675 -               cycles = get_reg(fast_pool, regs);
3676 +               cycles = get_reg(fast_pool, NULL);
3677         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3678         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3679         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3680         fast_pool->pool[1] ^= now ^ c_high;
3681 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3682 +       if (!ip)
3683 +               ip = _RET_IP_;
3684         fast_pool->pool[2] ^= ip;
3685         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3686 -               get_reg(fast_pool, regs);
3687 +               get_reg(fast_pool, NULL);
3688  
3689         fast_mix(fast_pool);
3690         add_interrupt_bench(cycles);
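add_interrupt_randomness() gains an explicit ip argument because, with interrupt handlers forced into threads on RT, the original interrupt register frame is no longer available via get_irq_regs(); the caller passes the instruction pointer it captured earlier, and 0 falls back to the return address. A small sketch of that "explicit ip, fall back to the return address" idiom; mix_event() is a made-up name, and __builtin_return_address() is a GCC/Clang builtin.

/* Sketch: accept the interrupted instruction pointer explicitly and
 * fall back to the caller's return address when none is supplied. */
#include <stdint.h>
#include <stdio.h>

static void mix_event(uint64_t ip)
{
        if (!ip)        /* no captured ip: use our return address instead */
                ip = (uint64_t)(uintptr_t)__builtin_return_address(0);
        printf("mixing ip %#llx into the pool\n", (unsigned long long)ip);
}

int main(void)
{
        mix_event(0x400123);    /* caller captured an ip earlier */
        mix_event(0);           /* caller had none; fallback path */
        return 0;
}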
3691 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3692 index 4da2af9694a2..5b6f57f500b8 100644
3693 --- a/drivers/clocksource/tcb_clksrc.c
3694 +++ b/drivers/clocksource/tcb_clksrc.c
3695 @@ -23,8 +23,7 @@
3696   *     this 32 bit free-running counter. the second channel is not used.
3697   *
3698   *   - The third channel may be used to provide a 16-bit clockevent
3699 - *     source, used in either periodic or oneshot mode.  This runs
3700 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3701 + *     source, used in either periodic or oneshot mode.
3702   *
3703   * A boot clocksource and clockevent source are also currently needed,
3704   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3705 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3706  struct tc_clkevt_device {
3707         struct clock_event_device       clkevt;
3708         struct clk                      *clk;
3709 +       bool                            clk_enabled;
3710 +       u32                             freq;
3711         void __iomem                    *regs;
3712  };
3713  
3714 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3715         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3716  }
3717  
3718 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3719 - * because using one of the divided clocks would usually mean the
3720 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3721 - *
3722 - * A divided clock could be good for high resolution timers, since
3723 - * 30.5 usec resolution can seem "low".
3724 - */
3725  static u32 timer_clock;
3726  
3727 +static void tc_clk_disable(struct clock_event_device *d)
3728 +{
3729 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3730 +
3731 +       clk_disable(tcd->clk);
3732 +       tcd->clk_enabled = false;
3733 +}
3734 +
3735 +static void tc_clk_enable(struct clock_event_device *d)
3736 +{
3737 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3738 +
3739 +       if (tcd->clk_enabled)
3740 +               return;
3741 +       clk_enable(tcd->clk);
3742 +       tcd->clk_enabled = true;
3743 +}
3744 +
3745  static int tc_shutdown(struct clock_event_device *d)
3746  {
3747         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3748 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3749  
3750         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3751         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3752 +       return 0;
3753 +}
3754 +
3755 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3756 +{
3757 +       tc_shutdown(d);
3758         if (!clockevent_state_detached(d))
3759 -               clk_disable(tcd->clk);
3760 +               tc_clk_disable(d);
3761  
3762         return 0;
3763  }
3764 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3765         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3766                 tc_shutdown(d);
3767  
3768 -       clk_enable(tcd->clk);
3769 +       tc_clk_enable(d);
3770  
3771 -       /* slow clock, count up to RC, then irq and stop */
3772 +       /* count up to RC, then irq and stop */
3773         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3774                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3775         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3776 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3777         /* By not making the gentime core emulate periodic mode on top
3778          * of oneshot, we get lower overhead and improved accuracy.
3779          */
3780 -       clk_enable(tcd->clk);
3781 +       tc_clk_enable(d);
3782  
3783 -       /* slow clock, count up to RC, then irq and restart */
3784 +       /* count up to RC, then irq and restart */
3785         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3786                      regs + ATMEL_TC_REG(2, CMR));
3787 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3788 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3789  
3790         /* Enable clock and interrupts on RC compare */
3791         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3792 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3793                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3794                                           CLOCK_EVT_FEAT_ONESHOT,
3795                 /* Should be lower than at91rm9200's system timer */
3796 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3797                 .rating                 = 125,
3798 +#else
3799 +               .rating                 = 200,
3800 +#endif
3801                 .set_next_event         = tc_next_event,
3802 -               .set_state_shutdown     = tc_shutdown,
3803 +               .set_state_shutdown     = tc_shutdown_clk_off,
3804                 .set_state_periodic     = tc_set_periodic,
3805                 .set_state_oneshot      = tc_set_oneshot,
3806         },
3807 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3808         return IRQ_NONE;
3809  }
3810  
3811 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3812 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3813  {
3814 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3815         int ret;
3816         struct clk *t2_clk = tc->clk[2];
3817         int irq = tc->irq[2];
3818 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3819         clkevt.regs = tc->regs;
3820         clkevt.clk = t2_clk;
3821  
3822 -       timer_clock = clk32k_divisor_idx;
3823 +       timer_clock = divisor_idx;
3824 +       if (!divisor)
3825 +               clkevt.freq = 32768;
3826 +       else
3827 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3828  
3829         clkevt.clkevt.cpumask = cpumask_of(0);
3830  
3831 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3832                 return ret;
3833         }
3834  
3835 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3836 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3837  
3838         return ret;
3839  }
3840 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3841                 goto err_disable_t1;
3842  
3843         /* channel 2:  periodic and oneshot timer support */
3844 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3845         ret = setup_clkevents(tc, clk32k_divisor_idx);
3846 +#else
3847 +       ret = setup_clkevents(tc, best_divisor_idx);
3848 +#endif
3849         if (ret)
3850                 goto err_unregister_clksrc;
3851  
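With the clockevent in tcb_clksrc no longer tied to the 32 kHz slow clock, the periodic reload value is computed from the actual timer frequency as RC = (freq + HZ/2) / HZ, i.e. rounded to the nearest count, exactly as the changed __raw_writel() line does. A tiny worked example follows; HZ = 100 and the 2.5 MHz divided MCK rate are assumptions for illustration only.

/* Worked example of the reload-value computation used above:
 * RC = (freq + HZ/2) / HZ, rounding to the nearest timer count. */
#include <stdio.h>

#define HZ 100          /* assumption for the example */

static unsigned int reload(unsigned int freq)
{
        return (freq + HZ / 2) / HZ;
}

int main(void)
{
        printf("32768 Hz slow clock -> RC = %u\n", reload(32768));    /* 328   */
        printf("2.5 MHz divided MCK -> RC = %u\n", reload(2500000));  /* 25000 */
        return 0;
}

Both values fit the 16-bit RC register implied by the 0xffff limit passed to clockevents_config_and_register() above.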
3852 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
3853 index 6555821bbdae..93288849b2bd 100644
3854 --- a/drivers/clocksource/timer-atmel-pit.c
3855 +++ b/drivers/clocksource/timer-atmel-pit.c
3856 @@ -46,6 +46,7 @@ struct pit_data {
3857         u32             cycle;
3858         u32             cnt;
3859         unsigned int    irq;
3860 +       bool            irq_requested;
3861         struct clk      *mck;
3862  };
3863  
3864 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
3865  
3866         /* disable irq, leaving the clocksource active */
3867         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3868 +       if (data->irq_requested) {
3869 +               free_irq(data->irq, data);
3870 +               data->irq_requested = false;
3871 +       }
3872         return 0;
3873  }
3874  
3875 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3876  /*
3877   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
3878   */
3879  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3880  {
3881         struct pit_data *data = clkevt_to_pit_data(dev);
3882 +       int ret;
3883 +
3884 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3885 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3886 +                         "at91_tick", data);
3887 +       if (ret)
3888 +               panic(pr_fmt("Unable to setup IRQ\n"));
3889 +
3890 +       data->irq_requested = true;
3891  
3892         /* update clocksource counter */
3893         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3894 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
3895                 return ret;
3896         }
3897  
3898 -       /* Set up irq handler */
3899 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3900 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3901 -                         "at91_tick", data);
3902 -       if (ret) {
3903 -               pr_err("Unable to setup IRQ\n");
3904 -               return ret;
3905 -       }
3906 -
3907         /* Set up and register clockevents */
3908         data->clkevt.name = "pit";
3909         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3910 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
3911 index e90ab5b63a90..9e124087c55f 100644
3912 --- a/drivers/clocksource/timer-atmel-st.c
3913 +++ b/drivers/clocksource/timer-atmel-st.c
3914 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
3915         last_crtr = read_CRTR();
3916  }
3917  
3918 +static int atmel_st_irq;
3919 +
3920  static int clkevt32k_shutdown(struct clock_event_device *evt)
3921  {
3922         clkdev32k_disable_and_flush_irq();
3923         irqmask = 0;
3924         regmap_write(regmap_st, AT91_ST_IER, irqmask);
3925 +       free_irq(atmel_st_irq, regmap_st);
3926         return 0;
3927  }
3928  
3929  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3930  {
3931 +       int ret;
3932 +
3933         clkdev32k_disable_and_flush_irq();
3934  
3935 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3936 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3937 +                         "at91_tick", regmap_st);
3938 +       if (ret)
3939 +               panic(pr_fmt("Unable to setup IRQ\n"));
3940 +
3941         /*
3942          * ALM for oneshot irqs, set by next_event()
3943          * before 32 seconds have passed.
3944 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3945  
3946  static int clkevt32k_set_periodic(struct clock_event_device *dev)
3947  {
3948 +       int ret;
3949 +
3950         clkdev32k_disable_and_flush_irq();
3951  
3952 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3953 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3954 +                         "at91_tick", regmap_st);
3955 +       if (ret)
3956 +               panic(pr_fmt("Unable to setup IRQ\n"));
3957 +
3958         /* PIT for periodic irqs; fixed rate of 1/HZ */
3959         irqmask = AT91_ST_PITS;
3960         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3961 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
3962  {
3963         struct clk *sclk;
3964         unsigned int sclk_rate, val;
3965 -       int irq, ret;
3966 +       int ret;
3967  
3968         regmap_st = syscon_node_to_regmap(node);
3969         if (IS_ERR(regmap_st)) {
3970 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
3971         regmap_read(regmap_st, AT91_ST_SR, &val);
3972  
3973         /* Get the interrupts property */
3974 -       irq  = irq_of_parse_and_map(node, 0);
3975 -       if (!irq) {
3976 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
3977 +       if (!atmel_st_irq) {
3978                 pr_err("Unable to get IRQ from DT\n");
3979                 return -EINVAL;
3980         }
3981  
3982 -       /* Make IRQs happen for the system timer */
3983 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
3984 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3985 -                         "at91_tick", regmap_st);
3986 -       if (ret) {
3987 -               pr_err("Unable to setup IRQ\n");
3988 -               return ret;
3989 -       }
3990 -
3991         sclk = of_clk_get(node, 0);
3992         if (IS_ERR(sclk)) {
3993                 pr_err("Unable to get slow clock\n");
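Both Atmel timer drivers above stop requesting their tick interrupt at probe time; the IRQ is now requested when the clockevent is switched into periodic or oneshot mode and freed again on shutdown, presumably so that the (on RT, possibly threaded) handler is only installed while the device is actually delivering ticks. A hedged sketch of that acquire-on-enable / release-on-shutdown pattern; the "IRQ" here is just a flag and two stub functions, not the kernel API.

/* Sketch: acquire a resource lazily in the "enable" callback and
 * release it in "shutdown", as the timer drivers above now do. */
#include <stdbool.h>
#include <stdio.h>

struct timer_dev {
        bool irq_requested;
};

static int fake_request_irq(struct timer_dev *d)
{
        printf("request_irq()\n");
        d->irq_requested = true;
        return 0;
}

static void fake_free_irq(struct timer_dev *d)
{
        printf("free_irq()\n");
        d->irq_requested = false;
}

static int set_periodic(struct timer_dev *d)
{
        if (!d->irq_requested && fake_request_irq(d))
                return -1;
        printf("timer programmed for periodic mode\n");
        return 0;
}

static int shutdown(struct timer_dev *d)
{
        if (d->irq_requested)
                fake_free_irq(d);
        printf("timer stopped\n");
        return 0;
}

int main(void)
{
        struct timer_dev d = { .irq_requested = false };

        set_periodic(&d);
        shutdown(&d);
        return 0;
}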
3994 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
3995 index a782ce87715c..19d265948526 100644
3996 --- a/drivers/connector/cn_proc.c
3997 +++ b/drivers/connector/cn_proc.c
3998 @@ -32,6 +32,7 @@
3999  #include <linux/pid_namespace.h>
4000  
4001  #include <linux/cn_proc.h>
4002 +#include <linux/locallock.h>
4003  
4004  /*
4005   * Size of a cn_msg followed by a proc_event structure.  Since the
4006 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4007  
4008  /* proc_event_counts is used as the sequence number of the netlink message */
4009  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4010 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4011  
4012  static inline void send_msg(struct cn_msg *msg)
4013  {
4014 -       preempt_disable();
4015 +       local_lock(send_msg_lock);
4016  
4017         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4018         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4019 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4020          */
4021         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4022  
4023 -       preempt_enable();
4024 +       local_unlock(send_msg_lock);
4025  }
4026  
4027  void proc_fork_connector(struct task_struct *task)
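send_msg() above used preempt_disable() only to stay on one CPU while bumping the per-CPU sequence counter, but the call into the netlink layer takes locks that may sleep on RT, so the patch switches to a local lock: on RT a per-CPU sleeping lock, on non-RT builds it collapses back to preempt_disable(). A loose userspace analogue, assuming pthreads: one lock per "CPU" slot guards that slot's counter, with the caller naming the slot explicitly (a real local lock also keeps the task on that CPU, which this sketch does not model).

/* Loose analogue of a local lock: one lock per per-CPU data slot, so
 * the critical section may sleep while still serialising that slot. */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t send_msg_lock[NR_CPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static unsigned int event_count[NR_CPUS];

static void send_msg(int cpu, const char *what)
{
        pthread_mutex_lock(&send_msg_lock[cpu]);        /* local_lock()   */
        unsigned int seq = event_count[cpu]++;
        printf("cpu%d seq %u: %s\n", cpu, seq, what);   /* may sleep here */
        pthread_mutex_unlock(&send_msg_lock[cpu]);      /* local_unlock() */
}

int main(void)
{
        send_msg(0, "fork");
        send_msg(0, "exec");
        send_msg(1, "exit");
        return 0;
}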
4028 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4029 index adbd1de1cea5..1fac5074f2cf 100644
4030 --- a/drivers/cpufreq/Kconfig.x86
4031 +++ b/drivers/cpufreq/Kconfig.x86
4032 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4033  
4034  config X86_POWERNOW_K8
4035         tristate "AMD Opteron/Athlon64 PowerNow!"
4036 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4037 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4038         help
4039           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4040           Support for K10 and newer processors is now in acpi-cpufreq.
4041 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4042 index a218c2e395e7..5273d8f1d5dd 100644
4043 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4044 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4045 @@ -1537,7 +1537,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4046         if (ret)
4047                 return ret;
4048  
4049 +#ifndef CONFIG_PREEMPT_RT_BASE
4050         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4051 +#endif
4052  
4053         i915_gem_execbuffer_move_to_active(vmas, params->request);
4054  
4055 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4056 index 1c237d02f30b..9e9b4404c0d7 100644
4057 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4058 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4059 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4060         if (!mutex_is_locked(mutex))
4061                 return false;
4062  
4063 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4064 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4065         return mutex->owner == task;
4066  #else
4067         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4068 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4069 index 3fc286cd1157..252a1117b103 100644
4070 --- a/drivers/gpu/drm/i915/i915_irq.c
4071 +++ b/drivers/gpu/drm/i915/i915_irq.c
4072 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4073         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4074  
4075         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4076 +       preempt_disable_rt();
4077  
4078         /* Get optional system timestamp before query. */
4079         if (stime)
4080 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4081                 *etime = ktime_get();
4082  
4083         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4084 +       preempt_enable_rt();
4085  
4086         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4087  
4088 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4089 index c9e83f39ec0a..6b0caae38076 100644
4090 --- a/drivers/gpu/drm/i915/intel_display.c
4091 +++ b/drivers/gpu/drm/i915/intel_display.c
4092 @@ -12131,7 +12131,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4093         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4094         struct intel_flip_work *work;
4095  
4096 -       WARN_ON(!in_interrupt());
4097 +       WARN_ON_NONRT(!in_interrupt());
4098  
4099         if (crtc == NULL)
4100                 return;
4101 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4102 index dbed12c484c9..5c540b78e8b5 100644
4103 --- a/drivers/gpu/drm/i915/intel_sprite.c
4104 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4105 @@ -35,6 +35,7 @@
4106  #include <drm/drm_rect.h>
4107  #include <drm/drm_atomic.h>
4108  #include <drm/drm_plane_helper.h>
4109 +#include <linux/locallock.h>
4110  #include "intel_drv.h"
4111  #include "intel_frontbuffer.h"
4112  #include <drm/i915_drm.h>
4113 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4114                             1000 * adjusted_mode->crtc_htotal);
4115  }
4116  
4117 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4118 +
4119  /**
4120   * intel_pipe_update_start() - start update of a set of display registers
4121   * @crtc: the crtc of which the registers are going to be updated
4122 @@ -95,7 +98,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4123         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4124         max = vblank_start - 1;
4125  
4126 -       local_irq_disable();
4127 +       local_lock_irq(pipe_update_lock);
4128  
4129         if (min <= 0 || max <= 0)
4130                 return;
4131 @@ -125,11 +128,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4132                         break;
4133                 }
4134  
4135 -               local_irq_enable();
4136 +               local_unlock_irq(pipe_update_lock);
4137  
4138                 timeout = schedule_timeout(timeout);
4139  
4140 -               local_irq_disable();
4141 +               local_lock_irq(pipe_update_lock);
4142         }
4143  
4144         finish_wait(wq, &wait);
4145 @@ -181,7 +184,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4146                 crtc->base.state->event = NULL;
4147         }
4148  
4149 -       local_irq_enable();
4150 +       local_unlock_irq(pipe_update_lock);
4151  
4152         if (crtc->debug.start_vbl_count &&
4153             crtc->debug.start_vbl_count != end_vbl_count) {
4154 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4155 index 192b2d3a79cb..d5372a207326 100644
4156 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4157 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4158 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4159         if (!mutex_is_locked(mutex))
4160                 return false;
4161  
4162 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4163 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4164         return mutex->owner == task;
4165  #else
4166         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4167 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4168 index cdb8cb568c15..b6d7fd964cbc 100644
4169 --- a/drivers/gpu/drm/radeon/radeon_display.c
4170 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4171 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4172         struct radeon_device *rdev = dev->dev_private;
4173  
4174         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4175 +       preempt_disable_rt();
4176  
4177         /* Get optional system timestamp before query. */
4178         if (stime)
4179 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4180                 *etime = ktime_get();
4181  
4182         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4183 +       preempt_enable_rt();
4184  
4185         /* Decode into vertical and horizontal scanout position. */
4186         *vpos = position & 0x1fff;
4187 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4188 index 0276d2ef06ee..8868045eabde 100644
4189 --- a/drivers/hv/vmbus_drv.c
4190 +++ b/drivers/hv/vmbus_drv.c
4191 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4192         void *page_addr;
4193         struct hv_message *msg;
4194         union hv_synic_event_flags *event;
4195 +       struct pt_regs *regs = get_irq_regs();
4196 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4197         bool handled = false;
4198  
4199         page_addr = hv_context.synic_event_page[cpu];
4200 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4201                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4202         }
4203  
4204 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4205 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4206  }
4207  
4208  
4209 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4210 index 36f76e28a0bf..394f142f90c7 100644
4211 --- a/drivers/ide/alim15x3.c
4212 +++ b/drivers/ide/alim15x3.c
4213 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4214  
4215         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4216  
4217 -       local_irq_save(flags);
4218 +       local_irq_save_nort(flags);
4219  
4220         if (m5229_revision < 0xC2) {
4221                 /*
4222 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4223         }
4224         pci_dev_put(north);
4225         pci_dev_put(isa_dev);
4226 -       local_irq_restore(flags);
4227 +       local_irq_restore_nort(flags);
4228         return 0;
4229  }
4230  
4231 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4232 index 0ceae5cbd89a..c212e85d7f3e 100644
4233 --- a/drivers/ide/hpt366.c
4234 +++ b/drivers/ide/hpt366.c
4235 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4236  
4237         dma_old = inb(base + 2);
4238  
4239 -       local_irq_save(flags);
4240 +       local_irq_save_nort(flags);
4241  
4242         dma_new = dma_old;
4243         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4244 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4245         if (dma_new != dma_old)
4246                 outb(dma_new, base + 2);
4247  
4248 -       local_irq_restore(flags);
4249 +       local_irq_restore_nort(flags);
4250  
4251         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4252                          hwif->name, base, base + 7);
4253 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4254 index 19763977568c..4169433faab5 100644
4255 --- a/drivers/ide/ide-io-std.c
4256 +++ b/drivers/ide/ide-io-std.c
4257 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4258                 unsigned long uninitialized_var(flags);
4259  
4260                 if ((io_32bit & 2) && !mmio) {
4261 -                       local_irq_save(flags);
4262 +                       local_irq_save_nort(flags);
4263                         ata_vlb_sync(io_ports->nsect_addr);
4264                 }
4265  
4266 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4267                         insl(data_addr, buf, words);
4268  
4269                 if ((io_32bit & 2) && !mmio)
4270 -                       local_irq_restore(flags);
4271 +                       local_irq_restore_nort(flags);
4272  
4273                 if (((len + 1) & 3) < 2)
4274                         return;
4275 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4276                 unsigned long uninitialized_var(flags);
4277  
4278                 if ((io_32bit & 2) && !mmio) {
4279 -                       local_irq_save(flags);
4280 +                       local_irq_save_nort(flags);
4281                         ata_vlb_sync(io_ports->nsect_addr);
4282                 }
4283  
4284 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4285                         outsl(data_addr, buf, words);
4286  
4287                 if ((io_32bit & 2) && !mmio)
4288 -                       local_irq_restore(flags);
4289 +                       local_irq_restore_nort(flags);
4290  
4291                 if (((len + 1) & 3) < 2)
4292                         return;
4293 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4294 index 669ea1e45795..e12e43e62245 100644
4295 --- a/drivers/ide/ide-io.c
4296 +++ b/drivers/ide/ide-io.c
4297 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4298                 /* disable_irq_nosync ?? */
4299                 disable_irq(hwif->irq);
4300                 /* local CPU only, as if we were handling an interrupt */
4301 -               local_irq_disable();
4302 +               local_irq_disable_nort();
4303                 if (hwif->polling) {
4304                         startstop = handler(drive);
4305                 } else if (drive_is_ready(drive)) {
4306 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4307 index 376f2dc410c5..f014dd1b73dc 100644
4308 --- a/drivers/ide/ide-iops.c
4309 +++ b/drivers/ide/ide-iops.c
4310 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4311                                 if ((stat & ATA_BUSY) == 0)
4312                                         break;
4313  
4314 -                               local_irq_restore(flags);
4315 +                               local_irq_restore_nort(flags);
4316                                 *rstat = stat;
4317                                 return -EBUSY;
4318                         }
4319                 }
4320 -               local_irq_restore(flags);
4321 +               local_irq_restore_nort(flags);
4322         }
4323         /*
4324          * Allow status to settle, then read it again.
4325 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4326 index 0b63facd1d87..4ceba37afc0c 100644
4327 --- a/drivers/ide/ide-probe.c
4328 +++ b/drivers/ide/ide-probe.c
4329 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4330         int bswap = 1;
4331  
4332         /* local CPU only; some systems need this */
4333 -       local_irq_save(flags);
4334 +       local_irq_save_nort(flags);
4335         /* read 512 bytes of id info */
4336         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4337 -       local_irq_restore(flags);
4338 +       local_irq_restore_nort(flags);
4339  
4340         drive->dev_flags |= IDE_DFLAG_ID_READ;
4341  #ifdef DEBUG
4342 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4343 index a716693417a3..be0568c722d6 100644
4344 --- a/drivers/ide/ide-taskfile.c
4345 +++ b/drivers/ide/ide-taskfile.c
4346 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4347  
4348                 page_is_high = PageHighMem(page);
4349                 if (page_is_high)
4350 -                       local_irq_save(flags);
4351 +                       local_irq_save_nort(flags);
4352  
4353                 buf = kmap_atomic(page) + offset;
4354  
4355 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4356                 kunmap_atomic(buf);
4357  
4358                 if (page_is_high)
4359 -                       local_irq_restore(flags);
4360 +                       local_irq_restore_nort(flags);
4361  
4362                 len -= nr_bytes;
4363         }
4364 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4365         }
4366  
4367         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4368 -               local_irq_disable();
4369 +               local_irq_disable_nort();
4370  
4371         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4372  
4373 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4374 index fddff403d5d2..cca1bb4fbfe3 100644
4375 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4376 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4377 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4378  
4379         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4380  
4381 -       local_irq_save(flags);
4382 +       local_irq_save_nort(flags);
4383         netif_addr_lock(dev);
4384         spin_lock(&priv->lock);
4385  
4386 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4387  
4388         spin_unlock(&priv->lock);
4389         netif_addr_unlock(dev);
4390 -       local_irq_restore(flags);
4391 +       local_irq_restore_nort(flags);
4392  
4393         /*
4394          * make sure the in-flight joins have finished before we attempt
4395 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4396 index 4a2a9e370be7..e970d9afd179 100644
4397 --- a/drivers/input/gameport/gameport.c
4398 +++ b/drivers/input/gameport/gameport.c
4399 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4400         tx = ~0;
4401  
4402         for (i = 0; i < 50; i++) {
4403 -               local_irq_save(flags);
4404 +               local_irq_save_nort(flags);
4405                 t1 = ktime_get_ns();
4406                 for (t = 0; t < 50; t++)
4407                         gameport_read(gameport);
4408                 t2 = ktime_get_ns();
4409                 t3 = ktime_get_ns();
4410 -               local_irq_restore(flags);
4411 +               local_irq_restore_nort(flags);
4412                 udelay(i * 10);
4413                 t = (t2 - t1) - (t3 - t2);
4414                 if (t < tx)
4415 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4416         tx = 1 << 30;
4417  
4418         for(i = 0; i < 50; i++) {
4419 -               local_irq_save(flags);
4420 +               local_irq_save_nort(flags);
4421                 GET_TIME(t1);
4422                 for (t = 0; t < 50; t++) gameport_read(gameport);
4423                 GET_TIME(t2);
4424                 GET_TIME(t3);
4425 -               local_irq_restore(flags);
4426 +               local_irq_restore_nort(flags);
4427                 udelay(i * 10);
4428                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4429         }
4430 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4431         tx = 1 << 30;
4432  
4433         for(i = 0; i < 50; i++) {
4434 -               local_irq_save(flags);
4435 +               local_irq_save_nort(flags);
4436                 t1 = rdtsc();
4437                 for (t = 0; t < 50; t++) gameport_read(gameport);
4438                 t2 = rdtsc();
4439 -               local_irq_restore(flags);
4440 +               local_irq_restore_nort(flags);
4441                 udelay(i * 10);
4442                 if (t2 - t1 < tx) tx = t2 - t1;
4443         }
4444 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4445 index 11a13b5be73a..baaed0ac274b 100644
4446 --- a/drivers/iommu/amd_iommu.c
4447 +++ b/drivers/iommu/amd_iommu.c
4448 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4449         int ret;
4450  
4451         /*
4452 -        * Must be called with IRQs disabled. Warn here to detect early
4453 -        * when its not.
4454 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4455 +        * detect early when it's not.
4456          */
4457 -       WARN_ON(!irqs_disabled());
4458 +       WARN_ON_NONRT(!irqs_disabled());
4459  
4460         /* lock domain */
4461         spin_lock(&domain->lock);
4462 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4463         struct protection_domain *domain;
4464  
4465         /*
4466 -        * Must be called with IRQs disabled. Warn here to detect early
4467 -        * when its not.
4468 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4469 +        * detect early when it's not.
4470          */
4471 -       WARN_ON(!irqs_disabled());
4472 +       WARN_ON_NONRT(!irqs_disabled());
4473  
4474         if (WARN_ON(!dev_data->domain))
4475                 return;
4476 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4477 index d82637ab09fd..ebe41d30c093 100644
4478 --- a/drivers/iommu/intel-iommu.c
4479 +++ b/drivers/iommu/intel-iommu.c
4480 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4481         struct deferred_flush_table *tables;
4482  };
4483  
4484 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4485 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4486  
4487  /* bitmap for indexing intel_iommus */
4488  static int g_num_of_iommus;
4489 @@ -3715,10 +3715,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4490         struct intel_iommu *iommu;
4491         struct deferred_flush_entry *entry;
4492         struct deferred_flush_data *flush_data;
4493 -       unsigned int cpuid;
4494  
4495 -       cpuid = get_cpu();
4496 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4497 +       flush_data = raw_cpu_ptr(&deferred_flush);
4498  
4499         /* Flush all CPUs' entries to avoid deferring too much.  If
4500          * this becomes a bottleneck, can just flush us, and rely on
4501 @@ -3751,8 +3749,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4502         }
4503         flush_data->size++;
4504         spin_unlock_irqrestore(&flush_data->lock, flags);
4505 -
4506 -       put_cpu();
4507  }
4508  
4509  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4510 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4511 index e23001bfcfee..359d5d169ec0 100644
4512 --- a/drivers/iommu/iova.c
4513 +++ b/drivers/iommu/iova.c
4514 @@ -22,6 +22,7 @@
4515  #include <linux/slab.h>
4516  #include <linux/smp.h>
4517  #include <linux/bitops.h>
4518 +#include <linux/cpu.h>
4519  
4520  static bool iova_rcache_insert(struct iova_domain *iovad,
4521                                unsigned long pfn,
4522 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4523  
4524                 /* Try replenishing IOVAs by flushing rcache. */
4525                 flushed_rcache = true;
4526 -               preempt_disable();
4527                 for_each_online_cpu(cpu)
4528                         free_cpu_cached_iovas(cpu, iovad);
4529 -               preempt_enable();
4530                 goto retry;
4531         }
4532  
4533 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4534         bool can_insert = false;
4535         unsigned long flags;
4536  
4537 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4538 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4539         spin_lock_irqsave(&cpu_rcache->lock, flags);
4540  
4541         if (!iova_magazine_full(cpu_rcache->loaded)) {
4542 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4543                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4544  
4545         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4546 -       put_cpu_ptr(rcache->cpu_rcaches);
4547  
4548         if (mag_to_free) {
4549                 iova_magazine_free_pfns(mag_to_free, iovad);
4550 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4551         bool has_pfn = false;
4552         unsigned long flags;
4553  
4554 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4555 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4556         spin_lock_irqsave(&cpu_rcache->lock, flags);
4557  
4558         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4559 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4560                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4561  
4562         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4563 -       put_cpu_ptr(rcache->cpu_rcaches);
4564  
4565         return iova_pfn;
4566  }
4567 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4568 index 3f9ddb9fafa7..09da5b6b44a1 100644
4569 --- a/drivers/leds/trigger/Kconfig
4570 +++ b/drivers/leds/trigger/Kconfig
4571 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4572  
4573  config LEDS_TRIGGER_CPU
4574         bool "LED CPU Trigger"
4575 -       depends on LEDS_TRIGGERS
4576 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4577         help
4578           This allows LEDs to be controlled by active CPUs. This shows
4579           the active CPUs across an array of LEDs so you can see which
4580 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4581 index 4d200883c505..98b64ed5cb81 100644
4582 --- a/drivers/md/bcache/Kconfig
4583 +++ b/drivers/md/bcache/Kconfig
4584 @@ -1,6 +1,7 @@
4585  
4586  config BCACHE
4587         tristate "Block device as cache"
4588 +       depends on !PREEMPT_RT_FULL
4589         ---help---
4590         Allows a block device to be used as cache for other devices; uses
4591         a btree for indexing and the layout is optimized for SSDs.
4592 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4593 index 31a89c8832c0..c3a7e8a9f761 100644
4594 --- a/drivers/md/dm-rq.c
4595 +++ b/drivers/md/dm-rq.c
4596 @@ -838,7 +838,7 @@ static void dm_old_request_fn(struct request_queue *q)
4597                 /* Establish tio->ti before queuing work (map_tio_request) */
4598                 tio->ti = ti;
4599                 kthread_queue_work(&md->kworker, &tio->work);
4600 -               BUG_ON(!irqs_disabled());
4601 +               BUG_ON_NONRT(!irqs_disabled());
4602         }
4603  }
4604  
4605 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4606 index cce6057b9aca..fa2c4de32a64 100644
4607 --- a/drivers/md/raid5.c
4608 +++ b/drivers/md/raid5.c
4609 @@ -1928,8 +1928,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4610         struct raid5_percpu *percpu;
4611         unsigned long cpu;
4612  
4613 -       cpu = get_cpu();
4614 +       cpu = get_cpu_light();
4615         percpu = per_cpu_ptr(conf->percpu, cpu);
4616 +       spin_lock(&percpu->lock);
4617         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4618                 ops_run_biofill(sh);
4619                 overlap_clear++;
4620 @@ -1985,7 +1986,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4621                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4622                                 wake_up(&sh->raid_conf->wait_for_overlap);
4623                 }
4624 -       put_cpu();
4625 +       spin_unlock(&percpu->lock);
4626 +       put_cpu_light();
4627  }
4628  
4629  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4630 @@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4631                        __func__, cpu);
4632                 return -ENOMEM;
4633         }
4634 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4635         return 0;
4636  }
4637  
4638 @@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4639         conf->percpu = alloc_percpu(struct raid5_percpu);
4640         if (!conf->percpu)
4641                 return -ENOMEM;
4642 -
4643         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4644         if (!err) {
4645                 conf->scribble_disks = max(conf->raid_disks,
4646 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4647 index 57ec49f0839e..0739604990b7 100644
4648 --- a/drivers/md/raid5.h
4649 +++ b/drivers/md/raid5.h
4650 @@ -504,6 +504,7 @@ struct r5conf {
4651         int                     recovery_disabled;
4652         /* per cpu variables */
4653         struct raid5_percpu {
4654 +               spinlock_t      lock;           /* Protection for -RT */
4655                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4656                 struct flex_array *scribble;   /* space for constructing buffer
4657                                               * lists and performing address
4658 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4659 index 64971baf11fa..215e91e36198 100644
4660 --- a/drivers/misc/Kconfig
4661 +++ b/drivers/misc/Kconfig
4662 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4663  config ATMEL_TCLIB
4664         bool "Atmel AT32/AT91 Timer/Counter Library"
4665         depends on (AVR32 || ARCH_AT91)
4666 +       default y if PREEMPT_RT_FULL
4667         help
4668           Select this if you want a library to allocate the Timer/Counter
4669           blocks found on many Atmel processors.  This facilitates using
4670 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4671           are combined to make a single 32-bit timer.
4672  
4673           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4674 -         may be used as a clock event device supporting oneshot mode
4675 -         (delays of up to two seconds) based on the 32 KiHz clock.
4676 +         may be used as a clock event device supporting oneshot mode.
4677  
4678  config ATMEL_TCB_CLKSRC_BLOCK
4679         int
4680 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4681           TC can be used for other purposes, such as PWM generation and
4682           interval timing.
4683  
4684 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4685 +       bool "TC Block uses 32 KiHz clock"
4686 +       depends on ATMEL_TCB_CLKSRC
4687 +       default y if !PREEMPT_RT_FULL
4688 +       help
4689 +         Select this to use 32 KiHz base clock rate as TC block clock
4690 +         source for clock events.
4691 +
4692 +
4693  config DUMMY_IRQ
4694         tristate "Dummy IRQ handler"
4695         default n
4696 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4697 index df990bb8c873..1a162709a85e 100644
4698 --- a/drivers/mmc/host/mmci.c
4699 +++ b/drivers/mmc/host/mmci.c
4700 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4701         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4702         struct variant_data *variant = host->variant;
4703         void __iomem *base = host->base;
4704 -       unsigned long flags;
4705         u32 status;
4706  
4707         status = readl(base + MMCISTATUS);
4708  
4709         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4710  
4711 -       local_irq_save(flags);
4712 -
4713         do {
4714                 unsigned int remain, len;
4715                 char *buffer;
4716 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4717  
4718         sg_miter_stop(sg_miter);
4719  
4720 -       local_irq_restore(flags);
4721 -
4722         /*
4723          * If we have less than the fifo 'half-full' threshold to transfer,
4724          * trigger a PIO interrupt as soon as any data is available.
4725 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4726 index 9133e7926da5..63afb921ed40 100644
4727 --- a/drivers/net/ethernet/3com/3c59x.c
4728 +++ b/drivers/net/ethernet/3com/3c59x.c
4729 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4730  {
4731         struct vortex_private *vp = netdev_priv(dev);
4732         unsigned long flags;
4733 -       local_irq_save(flags);
4734 +       local_irq_save_nort(flags);
4735         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4736 -       local_irq_restore(flags);
4737 +       local_irq_restore_nort(flags);
4738  }
4739  #endif
4740  
4741 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4742                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4743                          */
4744                         unsigned long flags;
4745 -                       local_irq_save(flags);
4746 +                       local_irq_save_nort(flags);
4747                         if (vp->full_bus_master_tx)
4748                                 boomerang_interrupt(dev->irq, dev);
4749                         else
4750                                 vortex_interrupt(dev->irq, dev);
4751 -                       local_irq_restore(flags);
4752 +                       local_irq_restore_nort(flags);
4753                 }
4754         }
4755  
4756 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4757 index da4c2d8a4173..1420dfb56bac 100644
4758 --- a/drivers/net/ethernet/realtek/8139too.c
4759 +++ b/drivers/net/ethernet/realtek/8139too.c
4760 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4761         struct rtl8139_private *tp = netdev_priv(dev);
4762         const int irq = tp->pci_dev->irq;
4763  
4764 -       disable_irq(irq);
4765 +       disable_irq_nosync(irq);
4766         rtl8139_interrupt(irq, dev);
4767         enable_irq(irq);
4768  }
4769 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4770 index bca6935a94db..d7a35ee34d03 100644
4771 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4772 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4773 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4774                         while (!ctx->done.done && msecs--)
4775                                 udelay(1000);
4776                 } else {
4777 -                       wait_event_interruptible(ctx->done.wait,
4778 +                       swait_event_interruptible(ctx->done.wait,
4779                                                  ctx->done.done);
4780                 }
4781                 break;
4782 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
4783 index d11cdbb8fba3..223bbb9acb03 100644
4784 --- a/drivers/pci/access.c
4785 +++ b/drivers/pci/access.c
4786 @@ -672,7 +672,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
4787         WARN_ON(!dev->block_cfg_access);
4788  
4789         dev->block_cfg_access = 0;
4790 -       wake_up_all(&pci_cfg_wait);
4791 +       wake_up_all_locked(&pci_cfg_wait);
4792         raw_spin_unlock_irqrestore(&pci_lock, flags);
4793  }
4794  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4795 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
4796 index 9bd41a35a78a..8e2d436c2e3f 100644
4797 --- a/drivers/scsi/fcoe/fcoe.c
4798 +++ b/drivers/scsi/fcoe/fcoe.c
4799 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
4800  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
4801  {
4802         struct fcoe_percpu_s *fps;
4803 -       int rc;
4804 +       int rc, cpu = get_cpu_light();
4805  
4806 -       fps = &get_cpu_var(fcoe_percpu);
4807 +       fps = &per_cpu(fcoe_percpu, cpu);
4808         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
4809 -       put_cpu_var(fcoe_percpu);
4810 +       put_cpu_light();
4811  
4812         return rc;
4813  }
4814 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
4815                 return 0;
4816         }
4817  
4818 -       stats = per_cpu_ptr(lport->stats, get_cpu());
4819 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
4820         stats->InvalidCRCCount++;
4821         if (stats->InvalidCRCCount < 5)
4822                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
4823 -       put_cpu();
4824 +       put_cpu_light();
4825         return -EINVAL;
4826  }
4827  
4828 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
4829          */
4830         hp = (struct fcoe_hdr *) skb_network_header(skb);
4831  
4832 -       stats = per_cpu_ptr(lport->stats, get_cpu());
4833 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
4834         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
4835                 if (stats->ErrorFrames < 5)
4836                         printk(KERN_WARNING "fcoe: FCoE version "
4837 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
4838                 goto drop;
4839  
4840         if (!fcoe_filter_frames(lport, fp)) {
4841 -               put_cpu();
4842 +               put_cpu_light();
4843                 fc_exch_recv(lport, fp);
4844                 return;
4845         }
4846  drop:
4847         stats->ErrorFrames++;
4848 -       put_cpu();
4849 +       put_cpu_light();
4850         kfree_skb(skb);
4851  }
4852  
4853 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
4854 index dcf36537a767..1a1f2e46452c 100644
4855 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
4856 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
4857 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
4858  
4859         INIT_LIST_HEAD(&del_list);
4860  
4861 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
4862 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
4863  
4864         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
4865                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
4866 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
4867                                 sel_time = fcf->time;
4868                 }
4869         }
4870 -       put_cpu();
4871 +       put_cpu_light();
4872  
4873         list_for_each_entry_safe(fcf, next, &del_list, list) {
4874                 /* Removes fcf from current list */
4875 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
4876 index 16ca31ad5ec0..c3987347e762 100644
4877 --- a/drivers/scsi/libfc/fc_exch.c
4878 +++ b/drivers/scsi/libfc/fc_exch.c
4879 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
4880         }
4881         memset(ep, 0, sizeof(*ep));
4882  
4883 -       cpu = get_cpu();
4884 +       cpu = get_cpu_light();
4885         pool = per_cpu_ptr(mp->pool, cpu);
4886         spin_lock_bh(&pool->lock);
4887 -       put_cpu();
4888 +       put_cpu_light();
4889  
4890         /* peek cache of free slot */
4891         if (pool->left != FC_XID_UNKNOWN) {
4892 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
4893 index 763f012fdeca..d0f61b595470 100644
4894 --- a/drivers/scsi/libsas/sas_ata.c
4895 +++ b/drivers/scsi/libsas/sas_ata.c
4896 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
4897         /* TODO: audit callers to ensure they are ready for qc_issue to
4898          * unconditionally re-enable interrupts
4899          */
4900 -       local_irq_save(flags);
4901 +       local_irq_save_nort(flags);
4902         spin_unlock(ap->lock);
4903  
4904         /* If the device fell off, no sense in issuing commands */
4905 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
4906  
4907   out:
4908         spin_lock(ap->lock);
4909 -       local_irq_restore(flags);
4910 +       local_irq_restore_nort(flags);
4911         return ret;
4912  }
4913  
4914 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
4915 index edc48f3b8230..ee5c6f9dfb6f 100644
4916 --- a/drivers/scsi/qla2xxx/qla_inline.h
4917 +++ b/drivers/scsi/qla2xxx/qla_inline.h
4918 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
4919  {
4920         unsigned long flags;
4921         struct qla_hw_data *ha = rsp->hw;
4922 -       local_irq_save(flags);
4923 +       local_irq_save_nort(flags);
4924         if (IS_P3P_TYPE(ha))
4925                 qla82xx_poll(0, rsp);
4926         else
4927                 ha->isp_ops->intr_handler(0, rsp);
4928 -       local_irq_restore(flags);
4929 +       local_irq_restore_nort(flags);
4930  }
4931  
4932  static inline uint8_t *
4933 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
4934 index 068c4e47fac9..a2090f640397 100644
4935 --- a/drivers/scsi/qla2xxx/qla_isr.c
4936 +++ b/drivers/scsi/qla2xxx/qla_isr.c
4937 @@ -3125,7 +3125,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
4938                 * kref_put().
4939                 */
4940                 kref_get(&qentry->irq_notify.kref);
4941 +#ifdef CONFIG_PREEMPT_RT_BASE
4942 +               swork_queue(&qentry->irq_notify.swork);
4943 +#else
4944                 schedule_work(&qentry->irq_notify.work);
4945 +#endif
4946         }
4947  
4948         /*
4949 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
4950 index 95f4c1bcdb4c..0be934799bff 100644
4951 --- a/drivers/thermal/x86_pkg_temp_thermal.c
4952 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
4953 @@ -29,6 +29,7 @@
4954  #include <linux/pm.h>
4955  #include <linux/thermal.h>
4956  #include <linux/debugfs.h>
4957 +#include <linux/swork.h>
4958  #include <asm/cpu_device_id.h>
4959  #include <asm/mce.h>
4960  
4961 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
4962         }
4963  }
4964  
4965 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4966 +static void platform_thermal_notify_work(struct swork_event *event)
4967  {
4968         unsigned long flags;
4969         int cpu = smp_processor_id();
4970 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4971                         pkg_work_scheduled[phy_id]) {
4972                 disable_pkg_thres_interrupt();
4973                 spin_unlock_irqrestore(&pkg_work_lock, flags);
4974 -               return -EINVAL;
4975 +               return;
4976         }
4977         pkg_work_scheduled[phy_id] = 1;
4978         spin_unlock_irqrestore(&pkg_work_lock, flags);
4979 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4980         schedule_delayed_work_on(cpu,
4981                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
4982                                 msecs_to_jiffies(notify_delay_ms));
4983 +}
4984 +
4985 +#ifdef CONFIG_PREEMPT_RT_FULL
4986 +static struct swork_event notify_work;
4987 +
4988 +static int thermal_notify_work_init(void)
4989 +{
4990 +       int err;
4991 +
4992 +       err = swork_get();
4993 +       if (err)
4994 +               return err;
4995 +
4996 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
4997         return 0;
4998  }
4999  
5000 +static void thermal_notify_work_cleanup(void)
5001 +{
5002 +       swork_put();
5003 +}
5004 +
5005 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5006 +{
5007 +       swork_queue(&notify_work);
5008 +       return 0;
5009 +}
5010 +
5011 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5012 +
5013 +static int thermal_notify_work_init(void) { return 0; }
5014 +
5015 +static void thermal_notify_work_cleanup(void) {  }
5016 +
5017 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5018 +{
5019 +       platform_thermal_notify_work(NULL);
5020 +
5021 +       return 0;
5022 +}
5023 +#endif /* CONFIG_PREEMPT_RT_FULL */
5024 +
5025  static int find_siblings_cpu(int cpu)
5026  {
5027         int i;
5028 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5029         if (!x86_match_cpu(pkg_temp_thermal_ids))
5030                 return -ENODEV;
5031  
5032 +       if (thermal_notify_work_init())
5033 +               return -ENODEV;
5034 +
5035         spin_lock_init(&pkg_work_lock);
5036         platform_thermal_package_notify =
5037                         pkg_temp_thermal_platform_thermal_notify;
5038 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5039         kfree(pkg_work_scheduled);
5040         platform_thermal_package_notify = NULL;
5041         platform_thermal_package_rate_control = NULL;
5042 -
5043 +       thermal_notify_work_cleanup();
5044         return -ENODEV;
5045  }
5046  
5047 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5048         mutex_unlock(&phy_dev_list_mutex);
5049         platform_thermal_package_notify = NULL;
5050         platform_thermal_package_rate_control = NULL;
5051 +       thermal_notify_work_cleanup();
5052         for_each_online_cpu(i)
5053                 cancel_delayed_work_sync(
5054                         &per_cpu(pkg_temp_thermal_threshold_work, i));
5055 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5056 index 240a361b674f..55e249267144 100644
5057 --- a/drivers/tty/serial/8250/8250_core.c
5058 +++ b/drivers/tty/serial/8250/8250_core.c
5059 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5060  
5061  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5062  
5063 -#define PASS_LIMIT     512
5064 +/*
5065 + * On -rt we can have more delays, and legitimately
5066 + * so - so don't drop work spuriously and spam the
5067 + * syslog:
5068 + */
5069 +#ifdef CONFIG_PREEMPT_RT_FULL
5070 +# define PASS_LIMIT    1000000
5071 +#else
5072 +# define PASS_LIMIT    512
5073 +#endif
5074  
5075  #include <asm/serial.h>
5076  /*
5077 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5078 index 1731b98d2471..5cc62301e840 100644
5079 --- a/drivers/tty/serial/8250/8250_port.c
5080 +++ b/drivers/tty/serial/8250/8250_port.c
5081 @@ -35,6 +35,7 @@
5082  #include <linux/nmi.h>
5083  #include <linux/mutex.h>
5084  #include <linux/slab.h>
5085 +#include <linux/kdb.h>
5086  #include <linux/uaccess.h>
5087  #include <linux/pm_runtime.h>
5088  #include <linux/timer.h>
5089 @@ -3144,9 +3145,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5090  
5091         serial8250_rpm_get(up);
5092  
5093 -       if (port->sysrq)
5094 +       if (port->sysrq || oops_in_progress)
5095                 locked = 0;
5096 -       else if (oops_in_progress)
5097 +       else if (in_kdb_printk())
5098                 locked = spin_trylock_irqsave(&port->lock, flags);
5099         else
5100                 spin_lock_irqsave(&port->lock, flags);
5101 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5102 index e2c33b9528d8..53af53c43e8c 100644
5103 --- a/drivers/tty/serial/amba-pl011.c
5104 +++ b/drivers/tty/serial/amba-pl011.c
5105 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5106  
5107         clk_enable(uap->clk);
5108  
5109 -       local_irq_save(flags);
5110 +       /*
5111 +        * local_irq_save(flags);
5112 +        *
5113 +        * This local_irq_save() is nonsense. If we come in via sysrq
5114 +        * handling then interrupts are already disabled. Aside from
5115 +        * that, the port.sysrq check is racy on SMP regardless.
5116 +       */
5117         if (uap->port.sysrq)
5118                 locked = 0;
5119         else if (oops_in_progress)
5120 -               locked = spin_trylock(&uap->port.lock);
5121 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5122         else
5123 -               spin_lock(&uap->port.lock);
5124 +               spin_lock_irqsave(&uap->port.lock, flags);
5125  
5126         /*
5127          *      First save the CR then disable the interrupts
5128 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5129                 pl011_write(old_cr, uap, REG_CR);
5130  
5131         if (locked)
5132 -               spin_unlock(&uap->port.lock);
5133 -       local_irq_restore(flags);
5134 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5135  
5136         clk_disable(uap->clk);
5137  }
5138 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5139 index a2a529994ba5..0ee7c4c518df 100644
5140 --- a/drivers/tty/serial/omap-serial.c
5141 +++ b/drivers/tty/serial/omap-serial.c
5142 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5143  
5144         pm_runtime_get_sync(up->dev);
5145  
5146 -       local_irq_save(flags);
5147 -       if (up->port.sysrq)
5148 -               locked = 0;
5149 -       else if (oops_in_progress)
5150 -               locked = spin_trylock(&up->port.lock);
5151 +       if (up->port.sysrq || oops_in_progress)
5152 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5153         else
5154 -               spin_lock(&up->port.lock);
5155 +               spin_lock_irqsave(&up->port.lock, flags);
5156  
5157         /*
5158          * First save the IER then disable the interrupts
5159 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5160         pm_runtime_mark_last_busy(up->dev);
5161         pm_runtime_put_autosuspend(up->dev);
5162         if (locked)
5163 -               spin_unlock(&up->port.lock);
5164 -       local_irq_restore(flags);
5165 +               spin_unlock_irqrestore(&up->port.lock, flags);
5166  }
5167  
5168  static int __init
5169 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5170 index 479e223f9cff..3418a54b4131 100644
5171 --- a/drivers/usb/core/hcd.c
5172 +++ b/drivers/usb/core/hcd.c
5173 @@ -1761,9 +1761,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5174          * and no one may trigger the above deadlock situation when
5175          * running complete() in tasklet.
5176          */
5177 -       local_irq_save(flags);
5178 +       local_irq_save_nort(flags);
5179         urb->complete(urb);
5180 -       local_irq_restore(flags);
5181 +       local_irq_restore_nort(flags);
5182  
5183         usb_anchor_resume_wakeups(anchor);
5184         atomic_dec(&urb->use_count);
5185 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5186 index 17989b72cdae..88c6574b5992 100644
5187 --- a/drivers/usb/gadget/function/f_fs.c
5188 +++ b/drivers/usb/gadget/function/f_fs.c
5189 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5190                 pr_info("%s(): freeing\n", __func__);
5191                 ffs_data_clear(ffs);
5192                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5193 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5194 +                      swait_active(&ffs->ep0req_completion.wait));
5195                 kfree(ffs->dev_name);
5196                 kfree(ffs);
5197         }
5198 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5199 index 1468d8f085a3..6aae3ae25c18 100644
5200 --- a/drivers/usb/gadget/legacy/inode.c
5201 +++ b/drivers/usb/gadget/legacy/inode.c
5202 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5203         spin_unlock_irq (&epdata->dev->lock);
5204  
5205         if (likely (value == 0)) {
5206 -               value = wait_event_interruptible (done.wait, done.done);
5207 +               value = swait_event_interruptible (done.wait, done.done);
5208                 if (value != 0) {
5209                         spin_lock_irq (&epdata->dev->lock);
5210                         if (likely (epdata->ep != NULL)) {
5211 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5212                                 usb_ep_dequeue (epdata->ep, epdata->req);
5213                                 spin_unlock_irq (&epdata->dev->lock);
5214  
5215 -                               wait_event (done.wait, done.done);
5216 +                               swait_event (done.wait, done.done);
5217                                 if (epdata->status == -ECONNRESET)
5218                                         epdata->status = -EINTR;
5219                         } else {
5220 diff --git a/fs/aio.c b/fs/aio.c
5221 index 428484f2f841..2b02e2eb2158 100644
5222 --- a/fs/aio.c
5223 +++ b/fs/aio.c
5224 @@ -40,6 +40,7 @@
5225  #include <linux/ramfs.h>
5226  #include <linux/percpu-refcount.h>
5227  #include <linux/mount.h>
5228 +#include <linux/swork.h>
5229  
5230  #include <asm/kmap_types.h>
5231  #include <asm/uaccess.h>
5232 @@ -115,7 +116,7 @@ struct kioctx {
5233         struct page             **ring_pages;
5234         long                    nr_pages;
5235  
5236 -       struct work_struct      free_work;
5237 +       struct swork_event      free_work;
5238  
5239         /*
5240          * signals when all in-flight requests are done
5241 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5242                 .mount          = aio_mount,
5243                 .kill_sb        = kill_anon_super,
5244         };
5245 +       BUG_ON(swork_get());
5246         aio_mnt = kern_mount(&aio_fs);
5247         if (IS_ERR(aio_mnt))
5248                 panic("Failed to create aio fs mount.");
5249 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5250         return cancel(&kiocb->common);
5251  }
5252  
5253 -static void free_ioctx(struct work_struct *work)
5254 +static void free_ioctx(struct swork_event *sev)
5255  {
5256 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5257 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5258  
5259         pr_debug("freeing %p\n", ctx);
5260  
5261 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5262         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5263                 complete(&ctx->rq_wait->comp);
5264  
5265 -       INIT_WORK(&ctx->free_work, free_ioctx);
5266 -       schedule_work(&ctx->free_work);
5267 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5268 +       swork_queue(&ctx->free_work);
5269  }
5270  
5271  /*
5272 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5273   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5274   * now it's safe to cancel any that need to be.
5275   */
5276 -static void free_ioctx_users(struct percpu_ref *ref)
5277 +static void free_ioctx_users_work(struct swork_event *sev)
5278  {
5279 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5280 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5281         struct aio_kiocb *req;
5282  
5283         spin_lock_irq(&ctx->ctx_lock);
5284 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5285         percpu_ref_put(&ctx->reqs);
5286  }
5287  
5288 +static void free_ioctx_users(struct percpu_ref *ref)
5289 +{
5290 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5291 +
5292 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5293 +       swork_queue(&ctx->free_work);
5294 +}
5295 +
5296  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5297  {
5298         unsigned i, new_nr;
5299 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5300 index a1fba4285277..3796769b4cd1 100644
5301 --- a/fs/autofs4/autofs_i.h
5302 +++ b/fs/autofs4/autofs_i.h
5303 @@ -31,6 +31,7 @@
5304  #include <linux/sched.h>
5305  #include <linux/mount.h>
5306  #include <linux/namei.h>
5307 +#include <linux/delay.h>
5308  #include <asm/current.h>
5309  #include <linux/uaccess.h>
5310  
5311 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5312 index d8e6d421c27f..2e689ab1306b 100644
5313 --- a/fs/autofs4/expire.c
5314 +++ b/fs/autofs4/expire.c
5315 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5316                         parent = p->d_parent;
5317                         if (!spin_trylock(&parent->d_lock)) {
5318                                 spin_unlock(&p->d_lock);
5319 -                               cpu_relax();
5320 +                               cpu_chill();
5321                                 goto relock;
5322                         }
5323                         spin_unlock(&p->d_lock);
5324 diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
5325 index 63d197724519..b8e479c5ad83 100644
5326 --- a/fs/btrfs/async-thread.c
5327 +++ b/fs/btrfs/async-thread.c
5328 @@ -306,8 +306,8 @@
5329                  * because the callback could free the structure.
5330                  */
5331                 wtag = work;
5332 -               work->ordered_free(work);
5333                 trace_btrfs_all_work_done(wq->fs_info, wtag);
5334 +               work->ordered_free(work);
5335         }
5336         spin_unlock_irqrestore(lock, flags);
5337  }
5338 @@ -339,8 +339,6 @@
5339                 set_bit(WORK_DONE_BIT, &work->flags);
5340                 run_ordered_work(wq);
5341         }
5342 -       if (!need_order)
5343 -               trace_btrfs_all_work_done(wq->fs_info, wtag);
5344  }
5345  
5346  void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
5347 diff --git a/fs/buffer.c b/fs/buffer.c
5348 index b205a629001d..5646afc022ba 100644
5349 --- a/fs/buffer.c
5350 +++ b/fs/buffer.c
5351 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5352          * decide that the page is now completely done.
5353          */
5354         first = page_buffers(page);
5355 -       local_irq_save(flags);
5356 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5357 +       flags = bh_uptodate_lock_irqsave(first);
5358         clear_buffer_async_read(bh);
5359         unlock_buffer(bh);
5360         tmp = bh;
5361 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5362                 }
5363                 tmp = tmp->b_this_page;
5364         } while (tmp != bh);
5365 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5366 -       local_irq_restore(flags);
5367 +       bh_uptodate_unlock_irqrestore(first, flags);
5368  
5369         /*
5370          * If none of the buffers had errors and they are all
5371 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5372         return;
5373  
5374  still_busy:
5375 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5376 -       local_irq_restore(flags);
5377 -       return;
5378 +       bh_uptodate_unlock_irqrestore(first, flags);
5379  }
5380  
5381  /*
5382 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5383         }
5384  
5385         first = page_buffers(page);
5386 -       local_irq_save(flags);
5387 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5388 +       flags = bh_uptodate_lock_irqsave(first);
5389  
5390         clear_buffer_async_write(bh);
5391         unlock_buffer(bh);
5392 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5393                 }
5394                 tmp = tmp->b_this_page;
5395         }
5396 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5397 -       local_irq_restore(flags);
5398 +       bh_uptodate_unlock_irqrestore(first, flags);
5399         end_page_writeback(page);
5400         return;
5401  
5402  still_busy:
5403 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5404 -       local_irq_restore(flags);
5405 -       return;
5406 +       bh_uptodate_unlock_irqrestore(first, flags);
5407  }
5408  EXPORT_SYMBOL(end_buffer_async_write);
5409  
5410 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5411         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5412         if (ret) {
5413                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5414 +               buffer_head_init_locks(ret);
5415                 preempt_disable();
5416                 __this_cpu_inc(bh_accounting.nr);
5417                 recalc_bh_state();
5418 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5419 index 8f6a2a5863b9..4217828d0b68 100644
5420 --- a/fs/cifs/readdir.c
5421 +++ b/fs/cifs/readdir.c
5422 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5423         struct inode *inode;
5424         struct super_block *sb = parent->d_sb;
5425         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5426 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5427 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5428  
5429         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5430  
5431 diff --git a/fs/dcache.c b/fs/dcache.c
5432 index 5c7cc953ac81..a9bb31f1c1af 100644
5433 --- a/fs/dcache.c
5434 +++ b/fs/dcache.c
5435 @@ -19,6 +19,7 @@
5436  #include <linux/mm.h>
5437  #include <linux/fs.h>
5438  #include <linux/fsnotify.h>
5439 +#include <linux/delay.h>
5440  #include <linux/slab.h>
5441  #include <linux/init.h>
5442  #include <linux/hash.h>
5443 @@ -750,6 +751,8 @@ static inline bool fast_dput(struct dentry *dentry)
5444   */
5445  void dput(struct dentry *dentry)
5446  {
5447 +       struct dentry *parent;
5448 +
5449         if (unlikely(!dentry))
5450                 return;
5451  
5452 @@ -788,9 +791,18 @@ void dput(struct dentry *dentry)
5453         return;
5454  
5455  kill_it:
5456 -       dentry = dentry_kill(dentry);
5457 -       if (dentry) {
5458 -               cond_resched();
5459 +       parent = dentry_kill(dentry);
5460 +       if (parent) {
5461 +               int r;
5462 +
5463 +               if (parent == dentry) {
5464 +                       /* the task with the highest priority won't schedule */
5465 +                       r = cond_resched();
5466 +                       if (!r)
5467 +                               cpu_chill();
5468 +               } else {
5469 +                       dentry = parent;
5470 +               }
5471                 goto repeat;
5472         }
5473  }
5474 @@ -2321,7 +2333,7 @@ void d_delete(struct dentry * dentry)
5475         if (dentry->d_lockref.count == 1) {
5476                 if (!spin_trylock(&inode->i_lock)) {
5477                         spin_unlock(&dentry->d_lock);
5478 -                       cpu_relax();
5479 +                       cpu_chill();
5480                         goto again;
5481                 }
5482                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5483 @@ -2381,21 +2393,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
5484  
5485  static void d_wait_lookup(struct dentry *dentry)
5486  {
5487 -       if (d_in_lookup(dentry)) {
5488 -               DECLARE_WAITQUEUE(wait, current);
5489 -               add_wait_queue(dentry->d_wait, &wait);
5490 -               do {
5491 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5492 -                       spin_unlock(&dentry->d_lock);
5493 -                       schedule();
5494 -                       spin_lock(&dentry->d_lock);
5495 -               } while (d_in_lookup(dentry));
5496 -       }
5497 +       struct swait_queue __wait;
5498 +
5499 +       if (!d_in_lookup(dentry))
5500 +               return;
5501 +
5502 +       INIT_LIST_HEAD(&__wait.task_list);
5503 +       do {
5504 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5505 +               spin_unlock(&dentry->d_lock);
5506 +               schedule();
5507 +               spin_lock(&dentry->d_lock);
5508 +       } while (d_in_lookup(dentry));
5509 +       finish_swait(dentry->d_wait, &__wait);
5510  }
5511  
5512  struct dentry *d_alloc_parallel(struct dentry *parent,
5513                                 const struct qstr *name,
5514 -                               wait_queue_head_t *wq)
5515 +                               struct swait_queue_head *wq)
5516  {
5517         unsigned int hash = name->hash;
5518         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5519 @@ -2504,7 +2519,7 @@ void __d_lookup_done(struct dentry *dentry)
5520         hlist_bl_lock(b);
5521         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5522         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5523 -       wake_up_all(dentry->d_wait);
5524 +       swake_up_all(dentry->d_wait);
5525         dentry->d_wait = NULL;
5526         hlist_bl_unlock(b);
5527         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5528 @@ -3601,6 +3616,11 @@ EXPORT_SYMBOL(d_genocide);
5529  
5530  void __init vfs_caches_init_early(void)
5531  {
5532 +       int i;
5533 +
5534 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5535 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5536 +
5537         dcache_init_early();
5538         inode_init_early();
5539  }
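
A minimal sketch of the simple-waitqueue pattern these dcache hunks (and the later namei/proc/NFS hunks) switch to: swait queues keep a raw spinlock internally and do bounded work on wakeup, which is what the RT tree wants on these lookup paths. All demo_* names are illustrative, not from the patch.

    #include <linux/swait.h>
    #include <linux/sched.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_ready;

    static void demo_wait_for_ready(void)
    {
            DECLARE_SWAITQUEUE(wait);

            for (;;) {
                    prepare_to_swait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
                    if (READ_ONCE(demo_ready))
                            break;
                    schedule();
            }
            finish_swait(&demo_wq, &wait);
    }

    static void demo_set_ready(void)
    {
            WRITE_ONCE(demo_ready, true);
            swake_up_all(&demo_wq);     /* replaces wake_up_all() on the old waitqueue */
    }

The rewritten d_wait_lookup() above is this same loop, open-coded around dentry->d_lock.
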
5540 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5541 index 10db91218933..42af0a06f657 100644
5542 --- a/fs/eventpoll.c
5543 +++ b/fs/eventpoll.c
5544 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5545   */
5546  static void ep_poll_safewake(wait_queue_head_t *wq)
5547  {
5548 -       int this_cpu = get_cpu();
5549 +       int this_cpu = get_cpu_light();
5550  
5551         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5552                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5553  
5554 -       put_cpu();
5555 +       put_cpu_light();
5556  }
5557  
5558  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5559 diff --git a/fs/exec.c b/fs/exec.c
5560 index 67e86571685a..fe14cdd84016 100644
5561 --- a/fs/exec.c
5562 +++ b/fs/exec.c
5563 @@ -1017,12 +1017,14 @@ static int exec_mmap(struct mm_struct *mm)
5564                 }
5565         }
5566         task_lock(tsk);
5567 +       preempt_disable_rt();
5568         active_mm = tsk->active_mm;
5569         tsk->mm = mm;
5570         tsk->active_mm = mm;
5571         activate_mm(active_mm, mm);
5572         tsk->mm->vmacache_seqnum = 0;
5573         vmacache_flush(tsk);
5574 +       preempt_enable_rt();
5575         task_unlock(tsk);
5576         if (old_mm) {
5577                 up_read(&old_mm->mmap_sem);
5578 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
5579 index 096f79997f75..310e2aabbb0d 100644
5580 --- a/fs/fuse/dir.c
5581 +++ b/fs/fuse/dir.c
5582 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
5583         struct inode *dir = d_inode(parent);
5584         struct fuse_conn *fc;
5585         struct inode *inode;
5586 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5587 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5588  
5589         if (!o->nodeid) {
5590                 /*
5591 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
5592 index 684996c8a3a4..6e18a06aaabe 100644
5593 --- a/fs/jbd2/checkpoint.c
5594 +++ b/fs/jbd2/checkpoint.c
5595 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
5596         nblocks = jbd2_space_needed(journal);
5597         while (jbd2_log_space_left(journal) < nblocks) {
5598                 write_unlock(&journal->j_state_lock);
5599 +               if (current->plug)
5600 +                       io_schedule();
5601                 mutex_lock(&journal->j_checkpoint_mutex);
5602  
5603                 /*
5604 diff --git a/fs/locks.c b/fs/locks.c
5605 index 22c5b4aa4961..269c6a44449a 100644
5606 --- a/fs/locks.c
5607 +++ b/fs/locks.c
5608 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5609                         return -ENOMEM;
5610         }
5611  
5612 -       percpu_down_read_preempt_disable(&file_rwsem);
5613 +       percpu_down_read(&file_rwsem);
5614         spin_lock(&ctx->flc_lock);
5615         if (request->fl_flags & FL_ACCESS)
5616                 goto find_conflict;
5617 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5618  
5619  out:
5620         spin_unlock(&ctx->flc_lock);
5621 -       percpu_up_read_preempt_enable(&file_rwsem);
5622 +       percpu_up_read(&file_rwsem);
5623         if (new_fl)
5624                 locks_free_lock(new_fl);
5625         locks_dispose_list(&dispose);
5626 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5627                 new_fl2 = locks_alloc_lock();
5628         }
5629  
5630 -       percpu_down_read_preempt_disable(&file_rwsem);
5631 +       percpu_down_read(&file_rwsem);
5632         spin_lock(&ctx->flc_lock);
5633         /*
5634          * New lock request. Walk all POSIX locks and look for conflicts. If
5635 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5636         }
5637   out:
5638         spin_unlock(&ctx->flc_lock);
5639 -       percpu_up_read_preempt_enable(&file_rwsem);
5640 +       percpu_up_read(&file_rwsem);
5641         /*
5642          * Free any unused locks.
5643          */
5644 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5645                 return error;
5646         }
5647  
5648 -       percpu_down_read_preempt_disable(&file_rwsem);
5649 +       percpu_down_read(&file_rwsem);
5650         spin_lock(&ctx->flc_lock);
5651  
5652         time_out_leases(inode, &dispose);
5653 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5654         locks_insert_block(fl, new_fl);
5655         trace_break_lease_block(inode, new_fl);
5656         spin_unlock(&ctx->flc_lock);
5657 -       percpu_up_read_preempt_enable(&file_rwsem);
5658 +       percpu_up_read(&file_rwsem);
5659  
5660         locks_dispose_list(&dispose);
5661         error = wait_event_interruptible_timeout(new_fl->fl_wait,
5662                                                 !new_fl->fl_next, break_time);
5663  
5664 -       percpu_down_read_preempt_disable(&file_rwsem);
5665 +       percpu_down_read(&file_rwsem);
5666         spin_lock(&ctx->flc_lock);
5667         trace_break_lease_unblock(inode, new_fl);
5668         locks_delete_block(new_fl);
5669 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5670         }
5671  out:
5672         spin_unlock(&ctx->flc_lock);
5673 -       percpu_up_read_preempt_enable(&file_rwsem);
5674 +       percpu_up_read(&file_rwsem);
5675         locks_dispose_list(&dispose);
5676         locks_free_lock(new_fl);
5677         return error;
5678 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
5679  
5680         ctx = smp_load_acquire(&inode->i_flctx);
5681         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5682 -               percpu_down_read_preempt_disable(&file_rwsem);
5683 +               percpu_down_read(&file_rwsem);
5684                 spin_lock(&ctx->flc_lock);
5685                 time_out_leases(inode, &dispose);
5686                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5687 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
5688                         break;
5689                 }
5690                 spin_unlock(&ctx->flc_lock);
5691 -               percpu_up_read_preempt_enable(&file_rwsem);
5692 +               percpu_up_read(&file_rwsem);
5693  
5694                 locks_dispose_list(&dispose);
5695         }
5696 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5697                 return -EINVAL;
5698         }
5699  
5700 -       percpu_down_read_preempt_disable(&file_rwsem);
5701 +       percpu_down_read(&file_rwsem);
5702         spin_lock(&ctx->flc_lock);
5703         time_out_leases(inode, &dispose);
5704         error = check_conflicting_open(dentry, arg, lease->fl_flags);
5705 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5706                 lease->fl_lmops->lm_setup(lease, priv);
5707  out:
5708         spin_unlock(&ctx->flc_lock);
5709 -       percpu_up_read_preempt_enable(&file_rwsem);
5710 +       percpu_up_read(&file_rwsem);
5711         locks_dispose_list(&dispose);
5712         if (is_deleg)
5713                 inode_unlock(inode);
5714 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5715                 return error;
5716         }
5717  
5718 -       percpu_down_read_preempt_disable(&file_rwsem);
5719 +       percpu_down_read(&file_rwsem);
5720         spin_lock(&ctx->flc_lock);
5721         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5722                 if (fl->fl_file == filp &&
5723 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5724         if (victim)
5725                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5726         spin_unlock(&ctx->flc_lock);
5727 -       percpu_up_read_preempt_enable(&file_rwsem);
5728 +       percpu_up_read(&file_rwsem);
5729         locks_dispose_list(&dispose);
5730         return error;
5731  }
5732 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
5733         if (list_empty(&ctx->flc_lease))
5734                 return;
5735  
5736 -       percpu_down_read_preempt_disable(&file_rwsem);
5737 +       percpu_down_read(&file_rwsem);
5738         spin_lock(&ctx->flc_lock);
5739         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5740                 if (filp == fl->fl_file)
5741                         lease_modify(fl, F_UNLCK, &dispose);
5742         spin_unlock(&ctx->flc_lock);
5743 -       percpu_up_read_preempt_enable(&file_rwsem);
5744 +       percpu_up_read(&file_rwsem);
5745  
5746         locks_dispose_list(&dispose);
5747  }
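
The fs/locks.c hunks drop the _preempt_disable/_preempt_enable variants so the per-CPU rwsem read side stays preemptible on RT. A sketch of the resulting caller pattern, assuming an illustrative demo_rwsem (fs/locks.c itself uses file_rwsem, declared the same way):

    #include <linux/percpu-rwsem.h>

    static DEFINE_STATIC_PERCPU_RWSEM(demo_rwsem);

    static void demo_read_side(void)
    {
            percpu_down_read(&demo_rwsem);
            /* read-side critical section; may block, no preemption disabled */
            percpu_up_read(&demo_rwsem);
    }
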
5748 diff --git a/fs/namei.c b/fs/namei.c
5749 index 5b4eed221530..9c8dd3c83a80 100644
5750 --- a/fs/namei.c
5751 +++ b/fs/namei.c
5752 @@ -1629,7 +1629,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
5753  {
5754         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5755         struct inode *inode = dir->d_inode;
5756 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5757 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5758  
5759         inode_lock_shared(inode);
5760         /* Don't go there if it's already dead */
5761 @@ -3086,7 +3086,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
5762         struct dentry *dentry;
5763         int error, create_error = 0;
5764         umode_t mode = op->mode;
5765 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5766 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5767  
5768         if (unlikely(IS_DEADDIR(dir_inode)))
5769                 return -ENOENT;
5770 diff --git a/fs/namespace.c b/fs/namespace.c
5771 index e6c234b1a645..c9dbe5e56347 100644
5772 --- a/fs/namespace.c
5773 +++ b/fs/namespace.c
5774 @@ -14,6 +14,7 @@
5775  #include <linux/mnt_namespace.h>
5776  #include <linux/user_namespace.h>
5777  #include <linux/namei.h>
5778 +#include <linux/delay.h>
5779  #include <linux/security.h>
5780  #include <linux/idr.h>
5781  #include <linux/init.h>                /* init_rootfs */
5782 @@ -356,8 +357,11 @@ int __mnt_want_write(struct vfsmount *m)
5783          * incremented count after it has set MNT_WRITE_HOLD.
5784          */
5785         smp_mb();
5786 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5787 -               cpu_relax();
5788 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5789 +               preempt_enable();
5790 +               cpu_chill();
5791 +               preempt_disable();
5792 +       }
5793         /*
5794          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
5795          * be set to match its requirements. So we must not load that until
5796 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
5797 index dff600ae0d74..d726d2e09353 100644
5798 --- a/fs/nfs/delegation.c
5799 +++ b/fs/nfs/delegation.c
5800 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
5801                 sp = state->owner;
5802                 /* Block nfs4_proc_unlck */
5803                 mutex_lock(&sp->so_delegreturn_mutex);
5804 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5805 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
5806                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
5807                 if (!err)
5808                         err = nfs_delegation_claim_locks(ctx, state, stateid);
5809 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5810 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
5811                         err = -EAGAIN;
5812                 mutex_unlock(&sp->so_delegreturn_mutex);
5813                 put_nfs_open_context(ctx);
5814 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
5815 index 5f1af4cd1a33..436c27eb9d4f 100644
5816 --- a/fs/nfs/dir.c
5817 +++ b/fs/nfs/dir.c
5818 @@ -485,7 +485,7 @@ static
5819  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
5820  {
5821         struct qstr filename = QSTR_INIT(entry->name, entry->len);
5822 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5823 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5824         struct dentry *dentry;
5825         struct dentry *alias;
5826         struct inode *dir = d_inode(parent);
5827 @@ -1498,7 +1498,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
5828                     struct file *file, unsigned open_flags,
5829                     umode_t mode, int *opened)
5830  {
5831 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5832 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5833         struct nfs_open_context *ctx;
5834         struct dentry *res;
5835         struct iattr attr = { .ia_valid = ATTR_OPEN };
5836 @@ -1813,7 +1813,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
5837  
5838         trace_nfs_rmdir_enter(dir, dentry);
5839         if (d_really_is_positive(dentry)) {
5840 +#ifdef CONFIG_PREEMPT_RT_BASE
5841 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
5842 +#else
5843                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5844 +#endif
5845                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5846                 /* Ensure the VFS deletes this inode */
5847                 switch (error) {
5848 @@ -1823,7 +1827,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
5849                 case -ENOENT:
5850                         nfs_dentry_handle_enoent(dentry);
5851                 }
5852 +#ifdef CONFIG_PREEMPT_RT_BASE
5853 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
5854 +#else
5855                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5856 +#endif
5857         } else
5858                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5859         trace_nfs_rmdir_exit(dir, dentry, error);
5860 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
5861 index bf4ec5ecc97e..36cd5fc9192c 100644
5862 --- a/fs/nfs/inode.c
5863 +++ b/fs/nfs/inode.c
5864 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
5865         nfsi->nrequests = 0;
5866         nfsi->commit_info.ncommit = 0;
5867         atomic_set(&nfsi->commit_info.rpcs_out, 0);
5868 +#ifdef CONFIG_PREEMPT_RT_BASE
5869 +       sema_init(&nfsi->rmdir_sem, 1);
5870 +#else
5871         init_rwsem(&nfsi->rmdir_sem);
5872 +#endif
5873         nfs4_init_once(nfsi);
5874  }
5875  
5876 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
5877 index 1452177c822d..f43b01d54c59 100644
5878 --- a/fs/nfs/nfs4_fs.h
5879 +++ b/fs/nfs/nfs4_fs.h
5880 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
5881         unsigned long        so_flags;
5882         struct list_head     so_states;
5883         struct nfs_seqid_counter so_seqid;
5884 -       seqcount_t           so_reclaim_seqcount;
5885 +       seqlock_t            so_reclaim_seqlock;
5886         struct mutex         so_delegreturn_mutex;
5887  };
5888  
5889 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
5890 index 241da19b7da4..8f9636cc298f 100644
5891 --- a/fs/nfs/nfs4proc.c
5892 +++ b/fs/nfs/nfs4proc.c
5893 @@ -2697,7 +2697,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
5894         unsigned int seq;
5895         int ret;
5896  
5897 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5898 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5899  
5900         ret = _nfs4_proc_open(opendata);
5901         if (ret != 0)
5902 @@ -2735,7 +2735,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
5903         ctx->state = state;
5904         if (d_inode(dentry) == state->inode) {
5905                 nfs_inode_attach_open_context(ctx);
5906 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5907 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
5908                         nfs4_schedule_stateid_recovery(server, state);
5909         }
5910  out:
5911 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
5912 index 0959c9661662..dabd834d7686 100644
5913 --- a/fs/nfs/nfs4state.c
5914 +++ b/fs/nfs/nfs4state.c
5915 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
5916         nfs4_init_seqid_counter(&sp->so_seqid);
5917         atomic_set(&sp->so_count, 1);
5918         INIT_LIST_HEAD(&sp->so_lru);
5919 -       seqcount_init(&sp->so_reclaim_seqcount);
5920 +       seqlock_init(&sp->so_reclaim_seqlock);
5921         mutex_init(&sp->so_delegreturn_mutex);
5922         return sp;
5923  }
5924 @@ -1497,8 +1497,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
5925          * recovering after a network partition or a reboot from a
5926          * server that doesn't support a grace period.
5927          */
5928 +#ifdef CONFIG_PREEMPT_RT_FULL
5929 +       write_seqlock(&sp->so_reclaim_seqlock);
5930 +#else
5931 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5932 +#endif
5933         spin_lock(&sp->so_lock);
5934 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
5935  restart:
5936         list_for_each_entry(state, &sp->so_states, open_states) {
5937                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
5938 @@ -1567,14 +1571,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
5939                 spin_lock(&sp->so_lock);
5940                 goto restart;
5941         }
5942 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5943         spin_unlock(&sp->so_lock);
5944 +#ifdef CONFIG_PREEMPT_RT_FULL
5945 +       write_sequnlock(&sp->so_reclaim_seqlock);
5946 +#else
5947 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5948 +#endif
5949         return 0;
5950  out_err:
5951         nfs4_put_open_state(state);
5952 -       spin_lock(&sp->so_lock);
5953 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5954 -       spin_unlock(&sp->so_lock);
5955 +#ifdef CONFIG_PREEMPT_RT_FULL
5956 +       write_sequnlock(&sp->so_reclaim_seqlock);
5957 +#else
5958 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5959 +#endif
5960         return status;
5961  }
5962  
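
These NFS hunks turn so_reclaim_seqcount into a full seqlock_t: the write side then holds a real lock and can be preempted safely, while readers keep the begin/retry loop. A minimal sketch of the plain seqlock pattern the code now follows; demo_seqlock and demo_value are illustrative names only.

    #include <linux/types.h>
    #include <linux/seqlock.h>

    static DEFINE_SEQLOCK(demo_seqlock);
    static u64 demo_value;

    static void demo_write(u64 v)
    {
            write_seqlock(&demo_seqlock);   /* serializes writers, bumps the sequence */
            demo_value = v;
            write_sequnlock(&demo_seqlock);
    }

    static u64 demo_read(void)
    {
            unsigned int seq;
            u64 v;

            do {
                    seq = read_seqbegin(&demo_seqlock);
                    v = demo_value;
            } while (read_seqretry(&demo_seqlock, seq));

            return v;
    }
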
5963 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
5964 index 191aa577dd1f..58990c8f52e0 100644
5965 --- a/fs/nfs/unlink.c
5966 +++ b/fs/nfs/unlink.c
5967 @@ -12,7 +12,7 @@
5968  #include <linux/sunrpc/clnt.h>
5969  #include <linux/nfs_fs.h>
5970  #include <linux/sched.h>
5971 -#include <linux/wait.h>
5972 +#include <linux/swait.h>
5973  #include <linux/namei.h>
5974  #include <linux/fsnotify.h>
5975  
5976 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
5977                 rpc_restart_call_prepare(task);
5978  }
5979  
5980 +#ifdef CONFIG_PREEMPT_RT_BASE
5981 +static void nfs_down_anon(struct semaphore *sema)
5982 +{
5983 +       down(sema);
5984 +}
5985 +
5986 +static void nfs_up_anon(struct semaphore *sema)
5987 +{
5988 +       up(sema);
5989 +}
5990 +
5991 +#else
5992 +static void nfs_down_anon(struct rw_semaphore *rwsem)
5993 +{
5994 +       down_read_non_owner(rwsem);
5995 +}
5996 +
5997 +static void nfs_up_anon(struct rw_semaphore *rwsem)
5998 +{
5999 +       up_read_non_owner(rwsem);
6000 +}
6001 +#endif
6002 +
6003  /**
6004   * nfs_async_unlink_release - Release the sillydelete data.
6005   * @task: rpc_task of the sillydelete
6006 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6007         struct dentry *dentry = data->dentry;
6008         struct super_block *sb = dentry->d_sb;
6009  
6010 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6011 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6012         d_lookup_done(dentry);
6013         nfs_free_unlinkdata(data);
6014         dput(dentry);
6015 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6016         struct inode *dir = d_inode(dentry->d_parent);
6017         struct dentry *alias;
6018  
6019 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6020 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6021         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6022         if (IS_ERR(alias)) {
6023 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6024 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6025                 return 0;
6026         }
6027         if (!d_in_lookup(alias)) {
6028 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6029                         ret = 0;
6030                 spin_unlock(&alias->d_lock);
6031                 dput(alias);
6032 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6033 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6034                 /*
6035                  * If we'd displaced old cached devname, free it.  At that
6036                  * point dentry is definitely not a root, so we won't need
6037 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6038                 goto out_free_name;
6039         }
6040         data->res.dir_attr = &data->dir_attr;
6041 -       init_waitqueue_head(&data->wq);
6042 +       init_swait_queue_head(&data->wq);
6043  
6044         status = -EBUSY;
6045         spin_lock(&dentry->d_lock);
6046 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6047 index fe251f187ff8..e89da4fb14c2 100644
6048 --- a/fs/ntfs/aops.c
6049 +++ b/fs/ntfs/aops.c
6050 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6051                         ofs = 0;
6052                         if (file_ofs < init_size)
6053                                 ofs = init_size - file_ofs;
6054 -                       local_irq_save(flags);
6055 +                       local_irq_save_nort(flags);
6056                         kaddr = kmap_atomic(page);
6057                         memset(kaddr + bh_offset(bh) + ofs, 0,
6058                                         bh->b_size - ofs);
6059                         flush_dcache_page(page);
6060                         kunmap_atomic(kaddr);
6061 -                       local_irq_restore(flags);
6062 +                       local_irq_restore_nort(flags);
6063                 }
6064         } else {
6065                 clear_buffer_uptodate(bh);
6066 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6067                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6068         }
6069         first = page_buffers(page);
6070 -       local_irq_save(flags);
6071 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6072 +       flags = bh_uptodate_lock_irqsave(first);
6073         clear_buffer_async_read(bh);
6074         unlock_buffer(bh);
6075         tmp = bh;
6076 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6077                 }
6078                 tmp = tmp->b_this_page;
6079         } while (tmp != bh);
6080 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6081 -       local_irq_restore(flags);
6082 +       bh_uptodate_unlock_irqrestore(first, flags);
6083         /*
6084          * If none of the buffers had errors then we can set the page uptodate,
6085          * but we first have to perform the post read mst fixups, if the
6086 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6087                 recs = PAGE_SIZE / rec_size;
6088                 /* Should have been verified before we got here... */
6089                 BUG_ON(!recs);
6090 -               local_irq_save(flags);
6091 +               local_irq_save_nort(flags);
6092                 kaddr = kmap_atomic(page);
6093                 for (i = 0; i < recs; i++)
6094                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6095                                         i * rec_size), rec_size);
6096                 kunmap_atomic(kaddr);
6097 -               local_irq_restore(flags);
6098 +               local_irq_restore_nort(flags);
6099                 flush_dcache_page(page);
6100                 if (likely(page_uptodate && !PageError(page)))
6101                         SetPageUptodate(page);
6102 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6103         unlock_page(page);
6104         return;
6105  still_busy:
6106 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6107 -       local_irq_restore(flags);
6108 -       return;
6109 +       bh_uptodate_unlock_irqrestore(first, flags);
6110  }
6111  
6112  /**
6113 diff --git a/fs/proc/base.c b/fs/proc/base.c
6114 index ca651ac00660..41d9dc789285 100644
6115 --- a/fs/proc/base.c
6116 +++ b/fs/proc/base.c
6117 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6118  
6119         child = d_hash_and_lookup(dir, &qname);
6120         if (!child) {
6121 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6122 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6123                 child = d_alloc_parallel(dir, &qname, &wq);
6124                 if (IS_ERR(child))
6125                         goto end_instantiate;
6126 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6127 index 55313d994895..bdfc493721e9 100644
6128 --- a/fs/proc/proc_sysctl.c
6129 +++ b/fs/proc/proc_sysctl.c
6130 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6131  
6132         child = d_lookup(dir, &qname);
6133         if (!child) {
6134 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6135 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6136                 child = d_alloc_parallel(dir, &qname, &wq);
6137                 if (IS_ERR(child))
6138                         return false;
6139 diff --git a/fs/timerfd.c b/fs/timerfd.c
6140 index 9ae4abb4110b..8644b67c48fd 100644
6141 --- a/fs/timerfd.c
6142 +++ b/fs/timerfd.c
6143 @@ -460,7 +460,10 @@ static int do_timerfd_settime(int ufd, int flags,
6144                                 break;
6145                 }
6146                 spin_unlock_irq(&ctx->wqh.lock);
6147 -               cpu_relax();
6148 +               if (isalarm(ctx))
6149 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6150 +               else
6151 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6152         }
6153  
6154         /*
6155 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6156 index e861a24f06f2..b5c97d3059c7 100644
6157 --- a/include/acpi/platform/aclinux.h
6158 +++ b/include/acpi/platform/aclinux.h
6159 @@ -133,6 +133,7 @@
6160  
6161  #define acpi_cache_t                        struct kmem_cache
6162  #define acpi_spinlock                       spinlock_t *
6163 +#define acpi_raw_spinlock              raw_spinlock_t *
6164  #define acpi_cpu_flags                      unsigned long
6165  
6166  /* Use native linux version of acpi_os_allocate_zeroed */
6167 @@ -151,6 +152,20 @@
6168  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6169  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6170  
6171 +#define acpi_os_create_raw_lock(__handle)                      \
6172 +({                                                             \
6173 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6174 +                                                               \
6175 +        if (lock) {                                            \
6176 +               *(__handle) = lock;                             \
6177 +               raw_spin_lock_init(*(__handle));                \
6178 +        }                                                      \
6179 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6180 + })
6181 +
6182 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6183 +
6184 +
6185  /*
6186   * OSL interfaces used by debugger/disassembler
6187   */
6188 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6189 index 6f96247226a4..fa53a21263c2 100644
6190 --- a/include/asm-generic/bug.h
6191 +++ b/include/asm-generic/bug.h
6192 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6193  # define WARN_ON_SMP(x)                        ({0;})
6194  #endif
6195  
6196 +#ifdef CONFIG_PREEMPT_RT_BASE
6197 +# define BUG_ON_RT(c)                  BUG_ON(c)
6198 +# define BUG_ON_NONRT(c)               do { } while (0)
6199 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6200 +# define WARN_ON_NONRT(condition)      do { } while (0)
6201 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6202 +#else
6203 +# define BUG_ON_RT(c)                  do { } while (0)
6204 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6205 +# define WARN_ON_RT(condition)         do { } while (0)
6206 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6207 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6208 +#endif
6209 +
6210  #endif /* __ASSEMBLY__ */
6211  
6212  #endif
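
A small usage sketch for the new RT/NONRT assertion macros, under the assumption of an illustrative demo function: the point is that a check which is only valid on one kernel flavour compiles away on the other.

    #include <linux/bug.h>
    #include <linux/irqflags.h>

    static void demo_assert_irqs_off(void)
    {
            /* On !RT this path really runs with interrupts off, so warn if not;
             * on PREEMPT_RT the same path is preemptible and the check is a no-op. */
            WARN_ON_NONRT(!irqs_disabled());
    }
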
6213 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6214 index 535ab2e13d2e..cfc246899473 100644
6215 --- a/include/linux/blk-mq.h
6216 +++ b/include/linux/blk-mq.h
6217 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6218         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6219  }
6220  
6221 -
6222 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6223  int blk_mq_request_started(struct request *rq);
6224  void blk_mq_start_request(struct request *rq);
6225  void blk_mq_end_request(struct request *rq, int error);
6226 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6227 index c47c358ba052..a99c23735725 100644
6228 --- a/include/linux/blkdev.h
6229 +++ b/include/linux/blkdev.h
6230 @@ -89,6 +89,7 @@ struct request {
6231         struct list_head queuelist;
6232         union {
6233                 struct call_single_data csd;
6234 +               struct work_struct work;
6235                 u64 fifo_time;
6236         };
6237  
6238 @@ -467,7 +468,7 @@ struct request_queue {
6239         struct throtl_data *td;
6240  #endif
6241         struct rcu_head         rcu_head;
6242 -       wait_queue_head_t       mq_freeze_wq;
6243 +       struct swait_queue_head mq_freeze_wq;
6244         struct percpu_ref       q_usage_counter;
6245         struct list_head        all_q_node;
6246  
6247 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6248 index 8fdcb783197d..d07dbeec7bc1 100644
6249 --- a/include/linux/bottom_half.h
6250 +++ b/include/linux/bottom_half.h
6251 @@ -3,6 +3,39 @@
6252  
6253  #include <linux/preempt.h>
6254  
6255 +#ifdef CONFIG_PREEMPT_RT_FULL
6256 +
6257 +extern void __local_bh_disable(void);
6258 +extern void _local_bh_enable(void);
6259 +extern void __local_bh_enable(void);
6260 +
6261 +static inline void local_bh_disable(void)
6262 +{
6263 +       __local_bh_disable();
6264 +}
6265 +
6266 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6267 +{
6268 +       __local_bh_disable();
6269 +}
6270 +
6271 +static inline void local_bh_enable(void)
6272 +{
6273 +       __local_bh_enable();
6274 +}
6275 +
6276 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6277 +{
6278 +       __local_bh_enable();
6279 +}
6280 +
6281 +static inline void local_bh_enable_ip(unsigned long ip)
6282 +{
6283 +       __local_bh_enable();
6284 +}
6285 +
6286 +#else
6287 +
6288  #ifdef CONFIG_TRACE_IRQFLAGS
6289  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6290  #else
6291 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6292  {
6293         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6294  }
6295 +#endif
6296  
6297  #endif /* _LINUX_BH_H */
6298 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6299 index ebbacd14d450..be5e87f6360a 100644
6300 --- a/include/linux/buffer_head.h
6301 +++ b/include/linux/buffer_head.h
6302 @@ -75,8 +75,50 @@ struct buffer_head {
6303         struct address_space *b_assoc_map;      /* mapping this buffer is
6304                                                    associated with */
6305         atomic_t b_count;               /* users using this buffer_head */
6306 +#ifdef CONFIG_PREEMPT_RT_BASE
6307 +       spinlock_t b_uptodate_lock;
6308 +#if IS_ENABLED(CONFIG_JBD2)
6309 +       spinlock_t b_state_lock;
6310 +       spinlock_t b_journal_head_lock;
6311 +#endif
6312 +#endif
6313  };
6314  
6315 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6316 +{
6317 +       unsigned long flags;
6318 +
6319 +#ifndef CONFIG_PREEMPT_RT_BASE
6320 +       local_irq_save(flags);
6321 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6322 +#else
6323 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6324 +#endif
6325 +       return flags;
6326 +}
6327 +
6328 +static inline void
6329 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6330 +{
6331 +#ifndef CONFIG_PREEMPT_RT_BASE
6332 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6333 +       local_irq_restore(flags);
6334 +#else
6335 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6336 +#endif
6337 +}
6338 +
6339 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6340 +{
6341 +#ifdef CONFIG_PREEMPT_RT_BASE
6342 +       spin_lock_init(&bh->b_uptodate_lock);
6343 +#if IS_ENABLED(CONFIG_JBD2)
6344 +       spin_lock_init(&bh->b_state_lock);
6345 +       spin_lock_init(&bh->b_journal_head_lock);
6346 +#endif
6347 +#endif
6348 +}
6349 +
6350  /*
6351   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6352   * and buffer_foo() functions.
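
A sketch of how an end_io handler uses the helpers defined just above, mirroring the converted code in fs/buffer.c and fs/ntfs/aops.c earlier in this patch; demo_end_io() is an illustrative name. On !RT the helper still takes the BH_Uptodate_Lock bit spinlock with interrupts off, on RT it takes the new per-buffer_head spinlock and stays preemptible.

    #include <linux/buffer_head.h>

    static void demo_end_io(struct buffer_head *first)
    {
            unsigned long flags;

            flags = bh_uptodate_lock_irqsave(first);
            /* walk the buffers on the page under the uptodate lock */
            bh_uptodate_unlock_irqrestore(first, flags);
    }
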
6353 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6354 index 5b17de62c962..56027cc01a56 100644
6355 --- a/include/linux/cgroup-defs.h
6356 +++ b/include/linux/cgroup-defs.h
6357 @@ -16,6 +16,7 @@
6358  #include <linux/percpu-refcount.h>
6359  #include <linux/percpu-rwsem.h>
6360  #include <linux/workqueue.h>
6361 +#include <linux/swork.h>
6362  
6363  #ifdef CONFIG_CGROUPS
6364  
6365 @@ -137,6 +138,7 @@ struct cgroup_subsys_state {
6366         /* percpu_ref killing and RCU release */
6367         struct rcu_head rcu_head;
6368         struct work_struct destroy_work;
6369 +       struct swork_event destroy_swork;
6370  };
6371  
6372  /*
6373 diff --git a/include/linux/completion.h b/include/linux/completion.h
6374 index 5d5aaae3af43..3bca1590e29f 100644
6375 --- a/include/linux/completion.h
6376 +++ b/include/linux/completion.h
6377 @@ -7,8 +7,7 @@
6378   * Atomic wait-for-completion handler data structures.
6379   * See kernel/sched/completion.c for details.
6380   */
6381 -
6382 -#include <linux/wait.h>
6383 +#include <linux/swait.h>
6384  
6385  /*
6386   * struct completion - structure used to maintain state for a "completion"
6387 @@ -24,11 +23,11 @@
6388   */
6389  struct completion {
6390         unsigned int done;
6391 -       wait_queue_head_t wait;
6392 +       struct swait_queue_head wait;
6393  };
6394  
6395  #define COMPLETION_INITIALIZER(work) \
6396 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6397 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6398  
6399  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6400         ({ init_completion(&work); work; })
6401 @@ -73,7 +72,7 @@ struct completion {
6402  static inline void init_completion(struct completion *x)
6403  {
6404         x->done = 0;
6405 -       init_waitqueue_head(&x->wait);
6406 +       init_swait_queue_head(&x->wait);
6407  }
6408  
6409  /**
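
Callers of the completion API are unchanged by the swait conversion; only the wait queue inside struct completion becomes a simple waitqueue. A sketch with an illustrative demo_done completion:

    #include <linux/completion.h>

    static DECLARE_COMPLETION(demo_done);

    static void demo_producer(void)
    {
            /* ... finish the work ... */
            complete(&demo_done);
    }

    static void demo_consumer(void)
    {
            wait_for_completion(&demo_done);
    }
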
6410 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6411 index e571128ad99a..5e52d28c20c1 100644
6412 --- a/include/linux/cpu.h
6413 +++ b/include/linux/cpu.h
6414 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6415  extern void put_online_cpus(void);
6416  extern void cpu_hotplug_disable(void);
6417  extern void cpu_hotplug_enable(void);
6418 +extern void pin_current_cpu(void);
6419 +extern void unpin_current_cpu(void);
6420  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6421  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6422  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6423 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6424  #define put_online_cpus()      do { } while (0)
6425  #define cpu_hotplug_disable()  do { } while (0)
6426  #define cpu_hotplug_enable()   do { } while (0)
6427 +static inline void pin_current_cpu(void) { }
6428 +static inline void unpin_current_cpu(void) { }
6429  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6430  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6431  /* These aren't inline functions due to a GCC bug. */
6432 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6433 index 5beed7b30561..61cab7ef458e 100644
6434 --- a/include/linux/dcache.h
6435 +++ b/include/linux/dcache.h
6436 @@ -11,6 +11,7 @@
6437  #include <linux/rcupdate.h>
6438  #include <linux/lockref.h>
6439  #include <linux/stringhash.h>
6440 +#include <linux/wait.h>
6441  
6442  struct path;
6443  struct vfsmount;
6444 @@ -100,7 +101,7 @@ struct dentry {
6445  
6446         union {
6447                 struct list_head d_lru;         /* LRU list */
6448 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6449 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6450         };
6451         struct list_head d_child;       /* child of parent list */
6452         struct list_head d_subdirs;     /* our children */
6453 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6454  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6455  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6456  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6457 -                                       wait_queue_head_t *);
6458 +                                       struct swait_queue_head *);
6459  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6460  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6461  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6462 diff --git a/include/linux/delay.h b/include/linux/delay.h
6463 index a6ecb34cf547..37caab306336 100644
6464 --- a/include/linux/delay.h
6465 +++ b/include/linux/delay.h
6466 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6467         msleep(seconds * 1000);
6468  }
6469  
6470 +#ifdef CONFIG_PREEMPT_RT_FULL
6471 +extern void cpu_chill(void);
6472 +#else
6473 +# define cpu_chill()   cpu_relax()
6474 +#endif
6475 +
6476  #endif /* defined(_LINUX_DELAY_H) */
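
A sketch of the trylock-retry pattern that the dcache and namespace hunks convert to cpu_chill(): on !RT it is just cpu_relax(), on RT it sleeps briefly so a preempted lock holder can run instead of being spun against. demo_lock and demo_lock_both() are illustrative; d_delete() above is the real instance.

    #include <linux/delay.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);

    static void demo_lock_both(spinlock_t *other)
    {
    again:
            spin_lock(&demo_lock);
            if (!spin_trylock(other)) {
                    spin_unlock(&demo_lock);
                    cpu_chill();            /* was cpu_relax() before the patch */
                    goto again;
            }
            /* both locks held */
            spin_unlock(other);
            spin_unlock(&demo_lock);
    }
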
6477 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6478 index bb3f3297062a..a117a33ef72c 100644
6479 --- a/include/linux/highmem.h
6480 +++ b/include/linux/highmem.h
6481 @@ -7,6 +7,7 @@
6482  #include <linux/mm.h>
6483  #include <linux/uaccess.h>
6484  #include <linux/hardirq.h>
6485 +#include <linux/sched.h>
6486  
6487  #include <asm/cacheflush.h>
6488  
6489 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6490  
6491  static inline void *kmap_atomic(struct page *page)
6492  {
6493 -       preempt_disable();
6494 +       preempt_disable_nort();
6495         pagefault_disable();
6496         return page_address(page);
6497  }
6498 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6499  static inline void __kunmap_atomic(void *addr)
6500  {
6501         pagefault_enable();
6502 -       preempt_enable();
6503 +       preempt_enable_nort();
6504  }
6505  
6506  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
6507 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
6508  
6509  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6510  
6511 +#ifndef CONFIG_PREEMPT_RT_FULL
6512  DECLARE_PER_CPU(int, __kmap_atomic_idx);
6513 +#endif
6514  
6515  static inline int kmap_atomic_idx_push(void)
6516  {
6517 +#ifndef CONFIG_PREEMPT_RT_FULL
6518         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6519  
6520 -#ifdef CONFIG_DEBUG_HIGHMEM
6521 +# ifdef CONFIG_DEBUG_HIGHMEM
6522         WARN_ON_ONCE(in_irq() && !irqs_disabled());
6523         BUG_ON(idx >= KM_TYPE_NR);
6524 -#endif
6525 +# endif
6526         return idx;
6527 +#else
6528 +       current->kmap_idx++;
6529 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
6530 +       return current->kmap_idx - 1;
6531 +#endif
6532  }
6533  
6534  static inline int kmap_atomic_idx(void)
6535  {
6536 +#ifndef CONFIG_PREEMPT_RT_FULL
6537         return __this_cpu_read(__kmap_atomic_idx) - 1;
6538 +#else
6539 +       return current->kmap_idx - 1;
6540 +#endif
6541  }
6542  
6543  static inline void kmap_atomic_idx_pop(void)
6544  {
6545 -#ifdef CONFIG_DEBUG_HIGHMEM
6546 +#ifndef CONFIG_PREEMPT_RT_FULL
6547 +# ifdef CONFIG_DEBUG_HIGHMEM
6548         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6549  
6550         BUG_ON(idx < 0);
6551 -#else
6552 +# else
6553         __this_cpu_dec(__kmap_atomic_idx);
6554 +# endif
6555 +#else
6556 +       current->kmap_idx--;
6557 +# ifdef CONFIG_DEBUG_HIGHMEM
6558 +       BUG_ON(current->kmap_idx < 0);
6559 +# endif
6560  #endif
6561  }
6562  
6563 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
6564 index 5e00f80b1535..65d0671f20b4 100644
6565 --- a/include/linux/hrtimer.h
6566 +++ b/include/linux/hrtimer.h
6567 @@ -87,6 +87,9 @@ enum hrtimer_restart {
6568   * @function:  timer expiry callback function
6569   * @base:      pointer to the timer base (per cpu and per clock)
6570   * @state:     state information (See bit values above)
6571 + * @cb_entry:  list entry to defer timers from hardirq context
6572 + * @irqsafe:   timer can run in hardirq context
6573 + * @praecox:   timer expiry time if expired at the time of programming
6574   * @is_rel:    Set if the timer was armed relative
6575   * @start_pid:  timer statistics field to store the pid of the task which
6576   *             started the timer
6577 @@ -103,6 +106,11 @@ struct hrtimer {
6578         enum hrtimer_restart            (*function)(struct hrtimer *);
6579         struct hrtimer_clock_base       *base;
6580         u8                              state;
6581 +       struct list_head                cb_entry;
6582 +       int                             irqsafe;
6583 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6584 +       ktime_t                         praecox;
6585 +#endif
6586         u8                              is_rel;
6587  #ifdef CONFIG_TIMER_STATS
6588         int                             start_pid;
6589 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
6590         struct task_struct *task;
6591  };
6592  
6593 -#ifdef CONFIG_64BIT
6594  # define HRTIMER_CLOCK_BASE_ALIGN      64
6595 -#else
6596 -# define HRTIMER_CLOCK_BASE_ALIGN      32
6597 -#endif
6598  
6599  /**
6600   * struct hrtimer_clock_base - the timer base for a specific clock
6601 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
6602   *                     timer to a base on another cpu.
6603   * @clockid:           clock id for per_cpu support
6604   * @active:            red black tree root node for the active timers
6605 + * @expired:           list head for deferred timers.
6606   * @get_time:          function to retrieve the current time of the clock
6607   * @offset:            offset of this clock to the monotonic base
6608   */
6609 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
6610         int                     index;
6611         clockid_t               clockid;
6612         struct timerqueue_head  active;
6613 +       struct list_head        expired;
6614         ktime_t                 (*get_time)(void);
6615         ktime_t                 offset;
6616  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6617 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
6618         raw_spinlock_t                  lock;
6619         seqcount_t                      seq;
6620         struct hrtimer                  *running;
6621 +       struct hrtimer                  *running_soft;
6622         unsigned int                    cpu;
6623         unsigned int                    active_bases;
6624         unsigned int                    clock_was_set_seq;
6625 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
6626         unsigned int                    nr_hangs;
6627         unsigned int                    max_hang_time;
6628  #endif
6629 +#ifdef CONFIG_PREEMPT_RT_BASE
6630 +       wait_queue_head_t               wait;
6631 +#endif
6632         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
6633  } ____cacheline_aligned;
6634  
6635 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
6636         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6637  }
6638  
6639 +/* Softirq preemption could deadlock timer removal */
6640 +#ifdef CONFIG_PREEMPT_RT_BASE
6641 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6642 +#else
6643 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6644 +#endif
6645 +
6646  /* Query timers: */
6647  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6648  
6649 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
6650   * Helper function to check, whether the timer is running the callback
6651   * function
6652   */
6653 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6654 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6655  {
6656         return timer->base->cpu_base->running == timer;
6657  }
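
A sketch of the cancel-and-wait loop enabled by hrtimer_wait_for_timer(), matching what the fs/timerfd.c hunk does per retry: instead of busy-waiting with cpu_relax() while the callback runs (which on RT could starve the preemptible softirq running it), the caller blocks until the callback finishes. demo_cancel_sync() is an illustrative name.

    #include <linux/hrtimer.h>

    static void demo_cancel_sync(struct hrtimer *timer)
    {
            /* hrtimer_try_to_cancel() returns -1 while the callback is running */
            while (hrtimer_try_to_cancel(timer) < 0)
                    hrtimer_wait_for_timer(timer);
    }
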
6658 diff --git a/include/linux/idr.h b/include/linux/idr.h
6659 index 083d61e92706..5899796f50cb 100644
6660 --- a/include/linux/idr.h
6661 +++ b/include/linux/idr.h
6662 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
6663   * Each idr_preload() should be matched with an invocation of this
6664   * function.  See idr_preload() for details.
6665   */
6666 +#ifdef CONFIG_PREEMPT_RT_FULL
6667 +void idr_preload_end(void);
6668 +#else
6669  static inline void idr_preload_end(void)
6670  {
6671         preempt_enable();
6672  }
6673 +#endif
6674  
6675  /**
6676   * idr_find - return pointer for given id
6677 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
6678 index 325f649d77ff..8af70bcc799b 100644
6679 --- a/include/linux/init_task.h
6680 +++ b/include/linux/init_task.h
6681 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
6682  # define INIT_PERF_EVENTS(tsk)
6683  #endif
6684  
6685 +#ifdef CONFIG_PREEMPT_RT_BASE
6686 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
6687 +#else
6688 +# define INIT_TIMER_LIST
6689 +#endif
6690 +
6691  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6692  # define INIT_VTIME(tsk)                                               \
6693         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
6694 @@ -250,6 +256,7 @@ extern struct task_group root_task_group;
6695         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
6696         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
6697         .timer_slack_ns = 50000, /* 50 usec default slack */            \
6698 +       INIT_TIMER_LIST                                                 \
6699         .pids = {                                                       \
6700                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
6701                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
6702 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
6703 index 72f0721f75e7..480972ae47d3 100644
6704 --- a/include/linux/interrupt.h
6705 +++ b/include/linux/interrupt.h
6706 @@ -14,6 +14,7 @@
6707  #include <linux/hrtimer.h>
6708  #include <linux/kref.h>
6709  #include <linux/workqueue.h>
6710 +#include <linux/swork.h>
6711  
6712  #include <linux/atomic.h>
6713  #include <asm/ptrace.h>
6714 @@ -61,6 +62,7 @@
6715   *                interrupt handler after suspending interrupts. For system
6716   *                wakeup devices users need to implement wakeup detection in
6717   *                their interrupt handlers.
6718 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6719   */
6720  #define IRQF_SHARED            0x00000080
6721  #define IRQF_PROBE_SHARED      0x00000100
6722 @@ -74,6 +76,7 @@
6723  #define IRQF_NO_THREAD         0x00010000
6724  #define IRQF_EARLY_RESUME      0x00020000
6725  #define IRQF_COND_SUSPEND      0x00040000
6726 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
6727  
6728  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6729  
6730 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
6731  #ifdef CONFIG_LOCKDEP
6732  # define local_irq_enable_in_hardirq() do { } while (0)
6733  #else
6734 -# define local_irq_enable_in_hardirq() local_irq_enable()
6735 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6736  #endif
6737  
6738  extern void disable_irq_nosync(unsigned int irq);
6739 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
6740   * struct irq_affinity_notify - context for notification of IRQ affinity changes
6741   * @irq:               Interrupt to which notification applies
6742   * @kref:              Reference count, for internal use
6743 + * @swork:             Swork item, for internal use
6744   * @work:              Work item, for internal use
6745   * @notify:            Function to be called on change.  This will be
6746   *                     called in process context.
6747 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
6748  struct irq_affinity_notify {
6749         unsigned int irq;
6750         struct kref kref;
6751 +#ifdef CONFIG_PREEMPT_RT_BASE
6752 +       struct swork_event swork;
6753 +#else
6754         struct work_struct work;
6755 +#endif
6756         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6757         void (*release)(struct kref *ref);
6758  };
6759 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
6760                                  bool state);
6761  
6762  #ifdef CONFIG_IRQ_FORCED_THREADING
6763 +# ifndef CONFIG_PREEMPT_RT_BASE
6764  extern bool force_irqthreads;
6765 +# else
6766 +#  define force_irqthreads     (true)
6767 +# endif
6768  #else
6769 -#define force_irqthreads       (0)
6770 +#define force_irqthreads       (false)
6771  #endif
6772  
6773  #ifndef __ARCH_SET_SOFTIRQ_PENDING
6774 @@ -465,9 +477,10 @@ struct softirq_action
6775         void    (*action)(struct softirq_action *);
6776  };
6777  
6778 +#ifndef CONFIG_PREEMPT_RT_FULL
6779  asmlinkage void do_softirq(void);
6780  asmlinkage void __do_softirq(void);
6781 -
6782 +static inline void thread_do_softirq(void) { do_softirq(); }
6783  #ifdef __ARCH_HAS_DO_SOFTIRQ
6784  void do_softirq_own_stack(void);
6785  #else
6786 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
6787         __do_softirq();
6788  }
6789  #endif
6790 +#else
6791 +extern void thread_do_softirq(void);
6792 +#endif
6793  
6794  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
6795  extern void softirq_init(void);
6796  extern void __raise_softirq_irqoff(unsigned int nr);
6797 +#ifdef CONFIG_PREEMPT_RT_FULL
6798 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
6799 +#else
6800 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
6801 +{
6802 +       __raise_softirq_irqoff(nr);
6803 +}
6804 +#endif
6805  
6806  extern void raise_softirq_irqoff(unsigned int nr);
6807  extern void raise_softirq(unsigned int nr);
6808 +extern void softirq_check_pending_idle(void);
6809  
6810  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
6811  
6812 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
6813       to be executed on some cpu at least once after this.
6814     * If the tasklet is already scheduled, but its execution is still not
6815       started, it will be executed only once.
6816 -   * If this tasklet is already running on another CPU (or schedule is called
6817 -     from tasklet itself), it is rescheduled for later.
6818 +   * If this tasklet is already running on another CPU, it is rescheduled
6819 +     for later.
6820 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
6821     * Tasklet is strictly serialized wrt itself, but not
6822       wrt another tasklets. If client needs some intertask synchronization,
6823       he makes it with spinlocks.
6824 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
6825  enum
6826  {
6827         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
6828 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
6829 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
6830 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
6831  };
6832  
6833 -#ifdef CONFIG_SMP
6834 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
6835 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
6836 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
6837 +
6838 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
6839  static inline int tasklet_trylock(struct tasklet_struct *t)
6840  {
6841         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
6842  }
6843  
6844 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
6845 +{
6846 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
6847 +}
6848 +
6849  static inline void tasklet_unlock(struct tasklet_struct *t)
6850  {
6851         smp_mb__before_atomic();
6852         clear_bit(TASKLET_STATE_RUN, &(t)->state);
6853  }
6854  
6855 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
6856 -{
6857 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
6858 -}
6859 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
6860 +
6861  #else
6862  #define tasklet_trylock(t) 1
6863 +#define tasklet_tryunlock(t)   1
6864  #define tasklet_unlock_wait(t) do { } while (0)
6865  #define tasklet_unlock(t) do { } while (0)
6866  #endif
6867 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
6868         smp_mb();
6869  }
6870  
6871 -static inline void tasklet_enable(struct tasklet_struct *t)
6872 -{
6873 -       smp_mb__before_atomic();
6874 -       atomic_dec(&t->count);
6875 -}
6876 -
6877 +extern void tasklet_enable(struct tasklet_struct *t);
6878  extern void tasklet_kill(struct tasklet_struct *t);
6879  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
6880  extern void tasklet_init(struct tasklet_struct *t,
6881 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
6882         tasklet_kill(&ttimer->tasklet);
6883  }
6884  
6885 +#ifdef CONFIG_PREEMPT_RT_FULL
6886 +extern void softirq_early_init(void);
6887 +#else
6888 +static inline void softirq_early_init(void) { }
6889 +#endif
6890 +
6891  /*
6892   * Autoprobing for irqs:
6893   *
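
A minimal usage sketch of the tasklet rules documented above: scheduling from
(threaded) interrupt context is fine, a tasklet that is scheduled but not yet
running still executes only once, and the handler never reschedules itself.
Illustration only; my_tasklet_fn and my_irq_handler are hypothetical names,
not part of this patch.

#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)
{
        /* Deferred work runs here.  Per the rule above, never call
         * tasklet_schedule() on this tasklet from inside its own handler. */
        pr_debug("tasklet ran, data=%lu\n", data);
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static irqreturn_t my_irq_handler(int irq, void *cookie)
{
        /* If the tasklet is already scheduled but has not started yet,
         * it still runs only once. */
        tasklet_schedule(&my_tasklet);
        return IRQ_HANDLED;
}

static void my_teardown(void)
{
        tasklet_kill(&my_tasklet);      /* waits for a running instance */
}
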
6894 diff --git a/include/linux/irq.h b/include/linux/irq.h
6895 index e79875574b39..177cee0c3305 100644
6896 --- a/include/linux/irq.h
6897 +++ b/include/linux/irq.h
6898 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
6899   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
6900   *                               it from the spurious interrupt detection
6901   *                               mechanism and from core side polling.
6902 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
6903   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
6904   */
6905  enum {
6906 @@ -99,13 +100,14 @@ enum {
6907         IRQ_PER_CPU_DEVID       = (1 << 17),
6908         IRQ_IS_POLLED           = (1 << 18),
6909         IRQ_DISABLE_UNLAZY      = (1 << 19),
6910 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
6911  };
6912  
6913  #define IRQF_MODIFY_MASK       \
6914         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
6915          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
6916          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
6917 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
6918 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
6919  
6920  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
6921  
6922 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
6923 index 47b9ebd4a74f..2543aab05daa 100644
6924 --- a/include/linux/irq_work.h
6925 +++ b/include/linux/irq_work.h
6926 @@ -16,6 +16,7 @@
6927  #define IRQ_WORK_BUSY          2UL
6928  #define IRQ_WORK_FLAGS         3UL
6929  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
6930 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
6931  
6932  struct irq_work {
6933         unsigned long flags;
6934 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
6935  static inline void irq_work_run(void) { }
6936  #endif
6937  
6938 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
6939 +void irq_work_tick_soft(void);
6940 +#else
6941 +static inline void irq_work_tick_soft(void) { }
6942 +#endif
6943 +
6944  #endif /* _LINUX_IRQ_WORK_H */
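
A sketch of how the new IRQ_WORK_HARD_IRQ flag might be used, assuming the
usual struct irq_work layout (flags, func): work carrying this flag is run
from the hard-interrupt irq_work path even on PREEMPT_RT_FULL, while other
work may be deferred so it can run in a preemptible context. The names below
are hypothetical.

#include <linux/irq_work.h>

static void my_hard_work_fn(struct irq_work *work)
{
        /* Runs in hard interrupt context even on PREEMPT_RT_FULL,
         * so keep it short and non-sleeping. */
}

static struct irq_work my_hard_work = {
        .flags = IRQ_WORK_HARD_IRQ,
        .func  = my_hard_work_fn,
};

static void my_poke(void)
{
        /* Safe from NMI/IRQ context; does nothing if already queued. */
        irq_work_queue(&my_hard_work);
}
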
6945 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
6946 index c9be57931b58..eeeb540971ae 100644
6947 --- a/include/linux/irqdesc.h
6948 +++ b/include/linux/irqdesc.h
6949 @@ -66,6 +66,7 @@ struct irq_desc {
6950         unsigned int            irqs_unhandled;
6951         atomic_t                threads_handled;
6952         int                     threads_handled_last;
6953 +       u64                     random_ip;
6954         raw_spinlock_t          lock;
6955         struct cpumask          *percpu_enabled;
6956         const struct cpumask    *percpu_affinity;
6957 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
6958 index 5dd1272d1ab2..9b77034f7c5e 100644
6959 --- a/include/linux/irqflags.h
6960 +++ b/include/linux/irqflags.h
6961 @@ -25,8 +25,6 @@
6962  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
6963  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
6964  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
6965 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
6966 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
6967  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
6968  #else
6969  # define trace_hardirqs_on()           do { } while (0)
6970 @@ -39,9 +37,15 @@
6971  # define trace_softirqs_enabled(p)     0
6972  # define trace_hardirq_enter()         do { } while (0)
6973  # define trace_hardirq_exit()          do { } while (0)
6974 +# define INIT_TRACE_IRQFLAGS
6975 +#endif
6976 +
6977 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
6978 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
6979 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
6980 +#else
6981  # define lockdep_softirq_enter()       do { } while (0)
6982  # define lockdep_softirq_exit()                do { } while (0)
6983 -# define INIT_TRACE_IRQFLAGS
6984  #endif
6985  
6986  #if defined(CONFIG_IRQSOFF_TRACER) || \
6987 @@ -148,4 +152,23 @@
6988  
6989  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
6990  
6991 +/*
6992 + * local_irq* variants depending on RT/!RT
6993 + */
6994 +#ifdef CONFIG_PREEMPT_RT_FULL
6995 +# define local_irq_disable_nort()      do { } while (0)
6996 +# define local_irq_enable_nort()       do { } while (0)
6997 +# define local_irq_save_nort(flags)    local_save_flags(flags)
6998 +# define local_irq_restore_nort(flags) (void)(flags)
6999 +# define local_irq_disable_rt()                local_irq_disable()
7000 +# define local_irq_enable_rt()         local_irq_enable()
7001 +#else
7002 +# define local_irq_disable_nort()      local_irq_disable()
7003 +# define local_irq_enable_nort()       local_irq_enable()
7004 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7005 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7006 +# define local_irq_disable_rt()                do { } while (0)
7007 +# define local_irq_enable_rt()         do { } while (0)
7008 +#endif
7009 +
7010  #endif
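
A sketch of how the local_irq_*_nort() helpers above are meant to be used:
code that needs interrupts disabled only on !RT (because the same data is
also touched from hard IRQ context there), while on RT the data is already
serialized by sleeping or per-CPU locks. struct my_stats and my_update_stats
are hypothetical.

#include <linux/irqflags.h>

struct my_stats {
        unsigned long packets;
        unsigned long bytes;
};

/* The caller already holds the owning device's lock, so the update is
 * serialized; interrupts only need to be off on !RT, where the stats
 * are also updated from hard interrupt context. */
static void my_update_stats(struct my_stats *s, unsigned int len)
{
        unsigned long flags;

        local_irq_save_nort(flags);     /* RT: only saves the flags */
        s->packets++;
        s->bytes += len;
        local_irq_restore_nort(flags);
}
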
7011 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7012 index dfaa1f4dcb0c..d57dd06544a1 100644
7013 --- a/include/linux/jbd2.h
7014 +++ b/include/linux/jbd2.h
7015 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7016  
7017  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7018  {
7019 +#ifndef CONFIG_PREEMPT_RT_BASE
7020         bit_spin_lock(BH_State, &bh->b_state);
7021 +#else
7022 +       spin_lock(&bh->b_state_lock);
7023 +#endif
7024  }
7025  
7026  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7027  {
7028 +#ifndef CONFIG_PREEMPT_RT_BASE
7029         return bit_spin_trylock(BH_State, &bh->b_state);
7030 +#else
7031 +       return spin_trylock(&bh->b_state_lock);
7032 +#endif
7033  }
7034  
7035  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7036  {
7037 +#ifndef CONFIG_PREEMPT_RT_BASE
7038         return bit_spin_is_locked(BH_State, &bh->b_state);
7039 +#else
7040 +       return spin_is_locked(&bh->b_state_lock);
7041 +#endif
7042  }
7043  
7044  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7045  {
7046 +#ifndef CONFIG_PREEMPT_RT_BASE
7047         bit_spin_unlock(BH_State, &bh->b_state);
7048 +#else
7049 +       spin_unlock(&bh->b_state_lock);
7050 +#endif
7051  }
7052  
7053  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7054  {
7055 +#ifndef CONFIG_PREEMPT_RT_BASE
7056         bit_spin_lock(BH_JournalHead, &bh->b_state);
7057 +#else
7058 +       spin_lock(&bh->b_journal_head_lock);
7059 +#endif
7060  }
7061  
7062  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7063  {
7064 +#ifndef CONFIG_PREEMPT_RT_BASE
7065         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7066 +#else
7067 +       spin_unlock(&bh->b_journal_head_lock);
7068 +#endif
7069  }
7070  
7071  #define J_ASSERT(assert)       BUG_ON(!(assert))
7072 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7073 index 410decacff8f..0861bebfc188 100644
7074 --- a/include/linux/kdb.h
7075 +++ b/include/linux/kdb.h
7076 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7077  extern __printf(1, 2) int kdb_printf(const char *, ...);
7078  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7079  
7080 +#define in_kdb_printk()        (kdb_trap_printk)
7081  extern void kdb_init(int level);
7082  
7083  /* Access to kdb specific polling devices */
7084 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7085  extern int kdb_unregister(char *);
7086  #else /* ! CONFIG_KGDB_KDB */
7087  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7088 +#define in_kdb_printk() (0)
7089  static inline void kdb_init(int level) {}
7090  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7091                                char *help, short minlen) { return 0; }
7092 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7093 index bc6ed52a39b9..7894d55e4998 100644
7094 --- a/include/linux/kernel.h
7095 +++ b/include/linux/kernel.h
7096 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7097   */
7098  # define might_sleep() \
7099         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7100 +
7101 +# define might_sleep_no_state_check() \
7102 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7103  # define sched_annotate_sleep()        (current->task_state_change = 0)
7104  #else
7105    static inline void ___might_sleep(const char *file, int line,
7106 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7107    static inline void __might_sleep(const char *file, int line,
7108                                    int preempt_offset) { }
7109  # define might_sleep() do { might_resched(); } while (0)
7110 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7111  # define sched_annotate_sleep() do { } while (0)
7112  #endif
7113  
7114 @@ -488,6 +492,7 @@ extern enum system_states {
7115         SYSTEM_HALT,
7116         SYSTEM_POWER_OFF,
7117         SYSTEM_RESTART,
7118 +       SYSTEM_SUSPEND,
7119  } system_state;
7120  
7121  #define TAINT_PROPRIETARY_MODULE       0
7122 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7123 index cb483305e1f5..4e5062316bb6 100644
7124 --- a/include/linux/list_bl.h
7125 +++ b/include/linux/list_bl.h
7126 @@ -2,6 +2,7 @@
7127  #define _LINUX_LIST_BL_H
7128  
7129  #include <linux/list.h>
7130 +#include <linux/spinlock.h>
7131  #include <linux/bit_spinlock.h>
7132  
7133  /*
7134 @@ -32,13 +33,24 @@
7135  
7136  struct hlist_bl_head {
7137         struct hlist_bl_node *first;
7138 +#ifdef CONFIG_PREEMPT_RT_BASE
7139 +       raw_spinlock_t lock;
7140 +#endif
7141  };
7142  
7143  struct hlist_bl_node {
7144         struct hlist_bl_node *next, **pprev;
7145  };
7146 -#define INIT_HLIST_BL_HEAD(ptr) \
7147 -       ((ptr)->first = NULL)
7148 +
7149 +#ifdef CONFIG_PREEMPT_RT_BASE
7150 +#define INIT_HLIST_BL_HEAD(h)          \
7151 +do {                                   \
7152 +       (h)->first = NULL;              \
7153 +       raw_spin_lock_init(&(h)->lock); \
7154 +} while (0)
7155 +#else
7156 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7157 +#endif
7158  
7159  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7160  {
7161 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7162  
7163  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7164  {
7165 +#ifndef CONFIG_PREEMPT_RT_BASE
7166         bit_spin_lock(0, (unsigned long *)b);
7167 +#else
7168 +       raw_spin_lock(&b->lock);
7169 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7170 +       __set_bit(0, (unsigned long *)b);
7171 +#endif
7172 +#endif
7173  }
7174  
7175  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7176  {
7177 +#ifndef CONFIG_PREEMPT_RT_BASE
7178         __bit_spin_unlock(0, (unsigned long *)b);
7179 +#else
7180 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7181 +       __clear_bit(0, (unsigned long *)b);
7182 +#endif
7183 +       raw_spin_unlock(&b->lock);
7184 +#endif
7185  }
7186  
7187  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
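
A usage sketch for the bit-locked hash lists above: callers look the same on
RT and !RT, only the implementation behind hlist_bl_lock()/hlist_bl_unlock()
changes. my_hash and my_hash_add are hypothetical.

#include <linux/list_bl.h>

#define MY_HASH_SIZE 64
static struct hlist_bl_head my_hash[MY_HASH_SIZE];

static void my_hash_init(void)
{
        int i;

        /* On RT this also initializes the per-bucket raw spinlock. */
        for (i = 0; i < MY_HASH_SIZE; i++)
                INIT_HLIST_BL_HEAD(&my_hash[i]);
}

static void my_hash_add(struct hlist_bl_node *n, unsigned int slot)
{
        struct hlist_bl_head *head = &my_hash[slot % MY_HASH_SIZE];

        /* !RT: bit spinlock on bit 0 of ->first.
         * RT:  the raw spinlock added above; bit 0 is only mirrored so
         *      that hlist_bl_is_locked() keeps working. */
        hlist_bl_lock(head);
        hlist_bl_add_head(n, head);
        hlist_bl_unlock(head);
}
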
7188 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7189 new file mode 100644
7190 index 000000000000..845c77f1a5ca
7191 --- /dev/null
7192 +++ b/include/linux/locallock.h
7193 @@ -0,0 +1,278 @@
7194 +#ifndef _LINUX_LOCALLOCK_H
7195 +#define _LINUX_LOCALLOCK_H
7196 +
7197 +#include <linux/percpu.h>
7198 +#include <linux/spinlock.h>
7199 +
7200 +#ifdef CONFIG_PREEMPT_RT_BASE
7201 +
7202 +#ifdef CONFIG_DEBUG_SPINLOCK
7203 +# define LL_WARN(cond) WARN_ON(cond)
7204 +#else
7205 +# define LL_WARN(cond) do { } while (0)
7206 +#endif
7207 +
7208 +/*
7209 + * per cpu lock based substitute for local_irq_*()
7210 + */
7211 +struct local_irq_lock {
7212 +       spinlock_t              lock;
7213 +       struct task_struct      *owner;
7214 +       int                     nestcnt;
7215 +       unsigned long           flags;
7216 +};
7217 +
7218 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7219 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7220 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7221 +
7222 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7223 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7224 +
7225 +#define local_irq_lock_init(lvar)                                      \
7226 +       do {                                                            \
7227 +               int __cpu;                                              \
7228 +               for_each_possible_cpu(__cpu)                            \
7229 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7230 +       } while (0)
7231 +
7232 +/*
7233 + * spin_lock|trylock|unlock_local flavours that do not migrate_disable().
7234 + * They are used for __local_lock|trylock|unlock, where get_local_var/
7235 + * put_local_var already takes care of migrate_disable/enable.
7236 + * Without CONFIG_PREEMPT_RT_FULL they map to the normal spin_* calls.
7237 + */
7238 +#ifdef CONFIG_PREEMPT_RT_FULL
7239 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7240 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7241 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7242 +#else
7243 +# define spin_lock_local(lock)                 spin_lock(lock)
7244 +# define spin_trylock_local(lock)              spin_trylock(lock)
7245 +# define spin_unlock_local(lock)               spin_unlock(lock)
7246 +#endif
7247 +
7248 +static inline void __local_lock(struct local_irq_lock *lv)
7249 +{
7250 +       if (lv->owner != current) {
7251 +               spin_lock_local(&lv->lock);
7252 +               LL_WARN(lv->owner);
7253 +               LL_WARN(lv->nestcnt);
7254 +               lv->owner = current;
7255 +       }
7256 +       lv->nestcnt++;
7257 +}
7258 +
7259 +#define local_lock(lvar)                                       \
7260 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7261 +
7262 +#define local_lock_on(lvar, cpu)                               \
7263 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7264 +
7265 +static inline int __local_trylock(struct local_irq_lock *lv)
7266 +{
7267 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7268 +               LL_WARN(lv->owner);
7269 +               LL_WARN(lv->nestcnt);
7270 +               lv->owner = current;
7271 +               lv->nestcnt = 1;
7272 +               return 1;
7273 +       }
7274 +       return 0;
7275 +}
7276 +
7277 +#define local_trylock(lvar)                                            \
7278 +       ({                                                              \
7279 +               int __locked;                                           \
7280 +               __locked = __local_trylock(&get_local_var(lvar));       \
7281 +               if (!__locked)                                          \
7282 +                       put_local_var(lvar);                            \
7283 +               __locked;                                               \
7284 +       })
7285 +
7286 +static inline void __local_unlock(struct local_irq_lock *lv)
7287 +{
7288 +       LL_WARN(lv->nestcnt == 0);
7289 +       LL_WARN(lv->owner != current);
7290 +       if (--lv->nestcnt)
7291 +               return;
7292 +
7293 +       lv->owner = NULL;
7294 +       spin_unlock_local(&lv->lock);
7295 +}
7296 +
7297 +#define local_unlock(lvar)                                     \
7298 +       do {                                                    \
7299 +               __local_unlock(this_cpu_ptr(&lvar));            \
7300 +               put_local_var(lvar);                            \
7301 +       } while (0)
7302 +
7303 +#define local_unlock_on(lvar, cpu)                       \
7304 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7305 +
7306 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7307 +{
7308 +       spin_lock_irqsave(&lv->lock, lv->flags);
7309 +       LL_WARN(lv->owner);
7310 +       LL_WARN(lv->nestcnt);
7311 +       lv->owner = current;
7312 +       lv->nestcnt = 1;
7313 +}
7314 +
7315 +#define local_lock_irq(lvar)                                           \
7316 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7317 +
7318 +#define local_lock_irq_on(lvar, cpu)                                   \
7319 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7320 +
7321 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7322 +{
7323 +       LL_WARN(!lv->nestcnt);
7324 +       LL_WARN(lv->owner != current);
7325 +       lv->owner = NULL;
7326 +       lv->nestcnt = 0;
7327 +       spin_unlock_irq(&lv->lock);
7328 +}
7329 +
7330 +#define local_unlock_irq(lvar)                                         \
7331 +       do {                                                            \
7332 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7333 +               put_local_var(lvar);                                    \
7334 +       } while (0)
7335 +
7336 +#define local_unlock_irq_on(lvar, cpu)                                 \
7337 +       do {                                                            \
7338 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7339 +       } while (0)
7340 +
7341 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7342 +{
7343 +       if (lv->owner != current) {
7344 +               __local_lock_irq(lv);
7345 +               return 0;
7346 +       } else {
7347 +               lv->nestcnt++;
7348 +               return 1;
7349 +       }
7350 +}
7351 +
7352 +#define local_lock_irqsave(lvar, _flags)                               \
7353 +       do {                                                            \
7354 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7355 +                       put_local_var(lvar);                            \
7356 +               _flags = __this_cpu_read(lvar.flags);                   \
7357 +       } while (0)
7358 +
7359 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7360 +       do {                                                            \
7361 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7362 +               _flags = per_cpu(lvar, cpu).flags;                      \
7363 +       } while (0)
7364 +
7365 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7366 +                                           unsigned long flags)
7367 +{
7368 +       LL_WARN(!lv->nestcnt);
7369 +       LL_WARN(lv->owner != current);
7370 +       if (--lv->nestcnt)
7371 +               return 0;
7372 +
7373 +       lv->owner = NULL;
7374 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7375 +       return 1;
7376 +}
7377 +
7378 +#define local_unlock_irqrestore(lvar, flags)                           \
7379 +       do {                                                            \
7380 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7381 +                       put_local_var(lvar);                            \
7382 +       } while (0)
7383 +
7384 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7385 +       do {                                                            \
7386 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7387 +       } while (0)
7388 +
7389 +#define local_spin_trylock_irq(lvar, lock)                             \
7390 +       ({                                                              \
7391 +               int __locked;                                           \
7392 +               local_lock_irq(lvar);                                   \
7393 +               __locked = spin_trylock(lock);                          \
7394 +               if (!__locked)                                          \
7395 +                       local_unlock_irq(lvar);                         \
7396 +               __locked;                                               \
7397 +       })
7398 +
7399 +#define local_spin_lock_irq(lvar, lock)                                        \
7400 +       do {                                                            \
7401 +               local_lock_irq(lvar);                                   \
7402 +               spin_lock(lock);                                        \
7403 +       } while (0)
7404 +
7405 +#define local_spin_unlock_irq(lvar, lock)                              \
7406 +       do {                                                            \
7407 +               spin_unlock(lock);                                      \
7408 +               local_unlock_irq(lvar);                                 \
7409 +       } while (0)
7410 +
7411 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7412 +       do {                                                            \
7413 +               local_lock_irqsave(lvar, flags);                        \
7414 +               spin_lock(lock);                                        \
7415 +       } while (0)
7416 +
7417 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7418 +       do {                                                            \
7419 +               spin_unlock(lock);                                      \
7420 +               local_unlock_irqrestore(lvar, flags);                   \
7421 +       } while (0)
7422 +
7423 +#define get_locked_var(lvar, var)                                      \
7424 +       (*({                                                            \
7425 +               local_lock(lvar);                                       \
7426 +               this_cpu_ptr(&var);                                     \
7427 +       }))
7428 +
7429 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7430 +
7431 +#define local_lock_cpu(lvar)                                           \
7432 +       ({                                                              \
7433 +               local_lock(lvar);                                       \
7434 +               smp_processor_id();                                     \
7435 +       })
7436 +
7437 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7438 +
7439 +#else /* PREEMPT_RT_BASE */
7440 +
7441 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7442 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7443 +
7444 +static inline void local_irq_lock_init(int lvar) { }
7445 +
7446 +#define local_lock(lvar)                       preempt_disable()
7447 +#define local_unlock(lvar)                     preempt_enable()
7448 +#define local_lock_irq(lvar)                   local_irq_disable()
7449 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7450 +#define local_unlock_irq(lvar)                 local_irq_enable()
7451 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7452 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7453 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7454 +
7455 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7456 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7457 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7458 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7459 +       spin_lock_irqsave(lock, flags)
7460 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7461 +       spin_unlock_irqrestore(lock, flags)
7462 +
7463 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7464 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7465 +
7466 +#define local_lock_cpu(lvar)                   get_cpu()
7467 +#define local_unlock_cpu(lvar)                 put_cpu()
7468 +
7469 +#endif
7470 +
7471 +#endif
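
A usage sketch for the local locks defined above, protecting per-CPU data the
way later hunks of this patch do (compare the xt_write_lock change further
down): on !RT this collapses to local_irq_save()/restore(), on RT it is a
per-CPU spinlock plus migrate_disable(), so the section stays preemptible.
my_pcpu_lists and my_queue_item are hypothetical.

#include <linux/list.h>
#include <linux/locallock.h>
#include <linux/percpu.h>

/* Each per-CPU list head is assumed to be initialized at boot. */
static DEFINE_PER_CPU(struct list_head, my_pcpu_lists);
static DEFINE_LOCAL_IRQ_LOCK(my_pcpu_lock);

static void my_queue_item(struct list_head *item)
{
        struct list_head *head;
        unsigned long flags;

        /* !RT: interrupts off.  RT: per-CPU lock + migrate_disable(),
         * preemptible but CPU-local and serialized. */
        local_lock_irqsave(my_pcpu_lock, flags);
        head = this_cpu_ptr(&my_pcpu_lists);
        list_add_tail(item, head);
        local_unlock_irqrestore(my_pcpu_lock, flags);
}
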
7472 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7473 index 08d947fc4c59..705fb564a605 100644
7474 --- a/include/linux/mm_types.h
7475 +++ b/include/linux/mm_types.h
7476 @@ -11,6 +11,7 @@
7477  #include <linux/completion.h>
7478  #include <linux/cpumask.h>
7479  #include <linux/uprobes.h>
7480 +#include <linux/rcupdate.h>
7481  #include <linux/page-flags-layout.h>
7482  #include <linux/workqueue.h>
7483  #include <asm/page.h>
7484 @@ -509,6 +510,9 @@ struct mm_struct {
7485         bool tlb_flush_pending;
7486  #endif
7487         struct uprobes_state uprobes_state;
7488 +#ifdef CONFIG_PREEMPT_RT_BASE
7489 +       struct rcu_head delayed_drop;
7490 +#endif
7491  #ifdef CONFIG_X86_INTEL_MPX
7492         /* address of the bounds directory */
7493         void __user *bd_addr;
7494 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
7495 index 2cb7531e7d7a..b3fdfc820216 100644
7496 --- a/include/linux/mutex.h
7497 +++ b/include/linux/mutex.h
7498 @@ -19,6 +19,17 @@
7499  #include <asm/processor.h>
7500  #include <linux/osq_lock.h>
7501  
7502 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7503 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7504 +       , .dep_map = { .name = #lockname }
7505 +#else
7506 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7507 +#endif
7508 +
7509 +#ifdef CONFIG_PREEMPT_RT_FULL
7510 +# include <linux/mutex_rt.h>
7511 +#else
7512 +
7513  /*
7514   * Simple, straightforward mutexes with strict semantics:
7515   *
7516 @@ -99,13 +110,6 @@ do {                                                        \
7517  static inline void mutex_destroy(struct mutex *lock) {}
7518  #endif
7519  
7520 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7521 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7522 -               , .dep_map = { .name = #lockname }
7523 -#else
7524 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7525 -#endif
7526 -
7527  #define __MUTEX_INITIALIZER(lockname) \
7528                 { .count = ATOMIC_INIT(1) \
7529                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7530 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
7531  extern int mutex_trylock(struct mutex *lock);
7532  extern void mutex_unlock(struct mutex *lock);
7533  
7534 +#endif /* !PREEMPT_RT_FULL */
7535 +
7536  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7537  
7538  #endif /* __LINUX_MUTEX_H */
7539 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
7540 new file mode 100644
7541 index 000000000000..c38a44b14da5
7542 --- /dev/null
7543 +++ b/include/linux/mutex_rt.h
7544 @@ -0,0 +1,84 @@
7545 +#ifndef __LINUX_MUTEX_RT_H
7546 +#define __LINUX_MUTEX_RT_H
7547 +
7548 +#ifndef __LINUX_MUTEX_H
7549 +#error "Please include mutex.h"
7550 +#endif
7551 +
7552 +#include <linux/rtmutex.h>
7553 +
7554 +/* FIXME: Just for __lockfunc */
7555 +#include <linux/spinlock.h>
7556 +
7557 +struct mutex {
7558 +       struct rt_mutex         lock;
7559 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7560 +       struct lockdep_map      dep_map;
7561 +#endif
7562 +};
7563 +
7564 +#define __MUTEX_INITIALIZER(mutexname)                                 \
7565 +       {                                                               \
7566 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
7567 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
7568 +       }
7569 +
7570 +#define DEFINE_MUTEX(mutexname)                                                \
7571 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7572 +
7573 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7574 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7575 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7576 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7577 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7578 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7579 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7580 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7581 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7582 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7583 +
7584 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
7585 +#define mutex_lock(l)                  _mutex_lock(l)
7586 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
7587 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
7588 +#define mutex_trylock(l)               _mutex_trylock(l)
7589 +#define mutex_unlock(l)                        _mutex_unlock(l)
7590 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
7591 +
7592 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7593 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
7594 +# define mutex_lock_interruptible_nested(l, s) \
7595 +                                       _mutex_lock_interruptible_nested(l, s)
7596 +# define mutex_lock_killable_nested(l, s) \
7597 +                                       _mutex_lock_killable_nested(l, s)
7598 +
7599 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
7600 +do {                                                                   \
7601 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
7602 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
7603 +} while (0)
7604 +
7605 +#else
7606 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
7607 +# define mutex_lock_interruptible_nested(l, s) \
7608 +                                       _mutex_lock_interruptible(l)
7609 +# define mutex_lock_killable_nested(l, s) \
7610 +                                       _mutex_lock_killable(l)
7611 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7612 +#endif
7613 +
7614 +# define mutex_init(mutex)                             \
7615 +do {                                                   \
7616 +       static struct lock_class_key __key;             \
7617 +                                                       \
7618 +       rt_mutex_init(&(mutex)->lock);                  \
7619 +       __mutex_do_init((mutex), #mutex, &__key);       \
7620 +} while (0)
7621 +
7622 +# define __mutex_init(mutex, name, key)                        \
7623 +do {                                                   \
7624 +       rt_mutex_init(&(mutex)->lock);                  \
7625 +       __mutex_do_init((mutex), name, key);            \
7626 +} while (0)
7627 +
7628 +#endif
7629 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
7630 index d83590ef74a1..0ae3b6cf430c 100644
7631 --- a/include/linux/netdevice.h
7632 +++ b/include/linux/netdevice.h
7633 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
7634  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7635  
7636  void __napi_schedule(struct napi_struct *n);
7637 +
7638 +/*
7639 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7640 + * run as threads, and they can also be preempted (without PREEMPT_RT,
7641 + * interrupt threads cannot be preempted). This means that a call to
7642 + * __napi_schedule_irqoff() from an interrupt handler can itself be
7643 + * preempted, which can corrupt the napi->poll_list.
7644 + */
7645 +#ifdef CONFIG_PREEMPT_RT_FULL
7646 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7647 +#else
7648  void __napi_schedule_irqoff(struct napi_struct *n);
7649 +#endif
7650  
7651  static inline bool napi_disable_pending(struct napi_struct *n)
7652  {
7653 @@ -2461,14 +2473,53 @@ void netdev_freemem(struct net_device *dev);
7654  void synchronize_net(void);
7655  int init_dummy_netdev(struct net_device *dev);
7656  
7657 -DECLARE_PER_CPU(int, xmit_recursion);
7658  #define XMIT_RECURSION_LIMIT   10
7659 +#ifdef CONFIG_PREEMPT_RT_FULL
7660 +static inline int dev_recursion_level(void)
7661 +{
7662 +       return current->xmit_recursion;
7663 +}
7664 +
7665 +static inline int xmit_rec_read(void)
7666 +{
7667 +       return current->xmit_recursion;
7668 +}
7669 +
7670 +static inline void xmit_rec_inc(void)
7671 +{
7672 +       current->xmit_recursion++;
7673 +}
7674 +
7675 +static inline void xmit_rec_dec(void)
7676 +{
7677 +       current->xmit_recursion--;
7678 +}
7679 +
7680 +#else
7681 +
7682 +DECLARE_PER_CPU(int, xmit_recursion);
7683  
7684  static inline int dev_recursion_level(void)
7685  {
7686         return this_cpu_read(xmit_recursion);
7687  }
7688  
7689 +static inline int xmit_rec_read(void)
7690 +{
7691 +       return __this_cpu_read(xmit_recursion);
7692 +}
7693 +
7694 +static inline void xmit_rec_inc(void)
7695 +{
7696 +       __this_cpu_inc(xmit_recursion);
7697 +}
7698 +
7699 +static inline void xmit_rec_dec(void)
7700 +{
7701 +       __this_cpu_dec(xmit_recursion);
7702 +}
7703 +#endif
7704 +
7705  struct net_device *dev_get_by_index(struct net *net, int ifindex);
7706  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7707  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7708 @@ -2851,6 +2902,7 @@ struct softnet_data {
7709         unsigned int            dropped;
7710         struct sk_buff_head     input_pkt_queue;
7711         struct napi_struct      backlog;
7712 +       struct sk_buff_head     tofree_queue;
7713  
7714  };
7715  
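
A sketch of how the xmit_rec_*() helpers above are meant to be used, loosely
following the recursion guard on the transmit path in net/core/dev.c: on RT
the counter lives in task_struct because the transmitting task can be
preempted and migrated mid-transmit, on !RT it stays per-CPU. my_xmit_direct
is hypothetical.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int my_xmit_direct(struct sk_buff *skb, struct net_device *dev)
{
        int ret = NET_XMIT_SUCCESS;

        if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
                /* The transmit path re-entered itself too often
                 * (e.g. via a stacked or tunnel device): drop instead
                 * of recursing further. */
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        xmit_rec_inc();
        /* ... hand the skb to the driver, e.g. via dev_hard_start_xmit() ... */
        xmit_rec_dec();

        return ret;
}
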
7716 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
7717 index 2ad1a2b289b5..b4d10155af54 100644
7718 --- a/include/linux/netfilter/x_tables.h
7719 +++ b/include/linux/netfilter/x_tables.h
7720 @@ -4,6 +4,7 @@
7721  
7722  #include <linux/netdevice.h>
7723  #include <linux/static_key.h>
7724 +#include <linux/locallock.h>
7725  #include <uapi/linux/netfilter/x_tables.h>
7726  
7727  /* Test a struct->invflags and a boolean for inequality */
7728 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
7729   */
7730  DECLARE_PER_CPU(seqcount_t, xt_recseq);
7731  
7732 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7733 +
7734  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7735   *
7736   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7737 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
7738  {
7739         unsigned int addend;
7740  
7741 +       /* RT protection */
7742 +       local_lock(xt_write_lock);
7743 +
7744         /*
7745          * Low order bit of sequence is set if we already
7746          * called xt_write_recseq_begin().
7747 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
7748         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7749         smp_wmb();
7750         __this_cpu_add(xt_recseq.sequence, addend);
7751 +       local_unlock(xt_write_lock);
7752  }
7753  
7754  /*
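
A sketch of the xt_write_recseq pattern that the local_lock above hooks into
(roughly what the iptables packet walkers do): on RT the section becomes a
CPU-local, rtmutex-backed critical section instead of relying only on
disabled bottom halves. my_walk_ruleset is hypothetical.

#include <linux/netfilter/x_tables.h>

static void my_walk_ruleset(void)
{
        unsigned int addend;

        local_bh_disable();
        /* On RT this also takes xt_write_lock via local_lock(). */
        addend = xt_write_recseq_begin();

        /* ... traverse the rules and update per-CPU counters ... */

        xt_write_recseq_end(addend);    /* releases xt_write_lock on RT */
        local_bh_enable();
}
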
7755 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
7756 index 810124b33327..d54ca43d571f 100644
7757 --- a/include/linux/nfs_fs.h
7758 +++ b/include/linux/nfs_fs.h
7759 @@ -165,7 +165,11 @@ struct nfs_inode {
7760  
7761         /* Readers: in-flight sillydelete RPC calls */
7762         /* Writers: rmdir */
7763 +#ifdef CONFIG_PREEMPT_RT_BASE
7764 +       struct semaphore        rmdir_sem;
7765 +#else
7766         struct rw_semaphore     rmdir_sem;
7767 +#endif
7768  
7769  #if IS_ENABLED(CONFIG_NFS_V4)
7770         struct nfs4_cached_acl  *nfs4_acl;
7771 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
7772 index beb1e10f446e..ebaf2e7bfe29 100644
7773 --- a/include/linux/nfs_xdr.h
7774 +++ b/include/linux/nfs_xdr.h
7775 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
7776         struct nfs_removeargs args;
7777         struct nfs_removeres res;
7778         struct dentry *dentry;
7779 -       wait_queue_head_t wq;
7780 +       struct swait_queue_head wq;
7781         struct rpc_cred *cred;
7782         struct nfs_fattr dir_attr;
7783         long timeout;
7784 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
7785 index 4149868de4e6..babe5b9bcb91 100644
7786 --- a/include/linux/notifier.h
7787 +++ b/include/linux/notifier.h
7788 @@ -6,7 +6,7 @@
7789   *
7790   *                             Alan Cox <Alan.Cox@linux.org>
7791   */
7792 -
7793 +
7794  #ifndef _LINUX_NOTIFIER_H
7795  #define _LINUX_NOTIFIER_H
7796  #include <linux/errno.h>
7797 @@ -42,9 +42,7 @@
7798   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
7799   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
7800   * SRCU notifier chains should be used when the chain will be called very
7801 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
7802 - * chains are slightly more difficult to use because they require special
7803 - * runtime initialization.
7804 + * often but notifier_blocks will seldom be removed.
7805   */
7806  
7807  struct notifier_block;
7808 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
7809                 (name)->head = NULL;            \
7810         } while (0)
7811  
7812 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
7813 +/* srcu_notifier_heads must be cleaned up dynamically */
7814  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7815  #define srcu_cleanup_notifier_head(name)       \
7816                 cleanup_srcu_struct(&(name)->srcu);
7817 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7818                 .head = NULL }
7819  #define RAW_NOTIFIER_INIT(name)        {                               \
7820                 .head = NULL }
7821 -/* srcu_notifier_heads cannot be initialized statically */
7822 +
7823 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
7824 +       {                                                       \
7825 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
7826 +               .head = NULL,                                   \
7827 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
7828 +       }
7829  
7830  #define ATOMIC_NOTIFIER_HEAD(name)                             \
7831         struct atomic_notifier_head name =                      \
7832 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7833         struct raw_notifier_head name =                         \
7834                 RAW_NOTIFIER_INIT(name)
7835  
7836 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
7837 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
7838 +                       name##_head_srcu_array);                \
7839 +       mod struct srcu_notifier_head name =                    \
7840 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
7841 +
7842 +#define SRCU_NOTIFIER_HEAD(name)                               \
7843 +       _SRCU_NOTIFIER_HEAD(name, )
7844 +
7845 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
7846 +       _SRCU_NOTIFIER_HEAD(name, static)
7847 +
7848  #ifdef __KERNEL__
7849  
7850  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
7851 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
7852  
7853  /*
7854   *     Declared notifiers so far. I can imagine quite a few more chains
7855 - *     over time (eg laptop power reset chains, reboot chain (to clean 
7856 + *     over time (eg laptop power reset chains, reboot chain (to clean
7857   *     device units up), device [un]mount chain, module load/unload chain,
7858 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
7859 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
7860   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
7861   */
7862 -
7863 +
7864  /* CPU notfiers are defined in include/linux/cpu.h. */
7865  
7866  /* netdevice notifiers are defined in include/linux/netdevice.h */
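
With the SRCU_NOTIFIER_INIT/SRCU_NOTIFIER_HEAD helpers added above, an SRCU
notifier chain can now be defined statically; previously a runtime
srcu_init_notifier_head() call was mandatory. A hypothetical sketch:

#include <linux/notifier.h>

/* Statically defined SRCU chain: no srcu_init_notifier_head() needed. */
SRCU_NOTIFIER_HEAD_STATIC(my_chain);

static int my_register(struct notifier_block *nb)
{
        return srcu_notifier_chain_register(&my_chain, nb);
}

static int my_notify(unsigned long event, void *data)
{
        return srcu_notifier_call_chain(&my_chain, event, data);
}
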
7867 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
7868 index 5b2e6159b744..ea940f451606 100644
7869 --- a/include/linux/percpu-rwsem.h
7870 +++ b/include/linux/percpu-rwsem.h
7871 @@ -4,7 +4,7 @@
7872  #include <linux/atomic.h>
7873  #include <linux/rwsem.h>
7874  #include <linux/percpu.h>
7875 -#include <linux/wait.h>
7876 +#include <linux/swait.h>
7877  #include <linux/rcu_sync.h>
7878  #include <linux/lockdep.h>
7879  
7880 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
7881         struct rcu_sync         rss;
7882         unsigned int __percpu   *read_count;
7883         struct rw_semaphore     rw_sem;
7884 -       wait_queue_head_t       writer;
7885 +       struct swait_queue_head writer;
7886         int                     readers_block;
7887  };
7888  
7889 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
7890         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
7891         .read_count = &__percpu_rwsem_rc_##name,                        \
7892         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
7893 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
7894 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
7895  }
7896  
7897  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
7898  extern void __percpu_up_read(struct percpu_rw_semaphore *);
7899  
7900 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
7901 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7902  {
7903         might_sleep();
7904  
7905 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
7906         __this_cpu_inc(*sem->read_count);
7907         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
7908                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
7909 -       barrier();
7910         /*
7911 -        * The barrier() prevents the compiler from
7912 +        * The preempt_enable() prevents the compiler from
7913          * bleeding the critical section out.
7914          */
7915 -}
7916 -
7917 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7918 -{
7919 -       percpu_down_read_preempt_disable(sem);
7920         preempt_enable();
7921  }
7922  
7923 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
7924         return ret;
7925  }
7926  
7927 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
7928 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7929  {
7930 -       /*
7931 -        * The barrier() prevents the compiler from
7932 -        * bleeding the critical section out.
7933 -        */
7934 -       barrier();
7935 +       preempt_disable();
7936         /*
7937          * Same as in percpu_down_read().
7938          */
7939 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
7940         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
7941  }
7942  
7943 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7944 -{
7945 -       preempt_disable();
7946 -       percpu_up_read_preempt_enable(sem);
7947 -}
7948 -
7949  extern void percpu_down_write(struct percpu_rw_semaphore *);
7950  extern void percpu_up_write(struct percpu_rw_semaphore *);
7951  
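
A sketch of the percpu-rwsem API after this change, assuming the usual
DEFINE_STATIC_PERCPU_RWSEM() helper: the preempt_disable-coupled down/up
variants are gone, so the read side is preemptible again and the writer now
sleeps on a swait queue. my_rwsem and its callers are hypothetical.

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_rwsem);

static void my_reader(void)
{
        percpu_down_read(&my_rwsem);    /* may sleep; no preempt_disable() */
        /* ... read-side critical section ... */
        percpu_up_read(&my_rwsem);
}

static void my_writer(void)
{
        percpu_down_write(&my_rwsem);   /* excludes all readers */
        /* ... exclusive section ... */
        percpu_up_write(&my_rwsem);
}
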
7952 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
7953 index 56939d3f6e53..1c7e33fc83e4 100644
7954 --- a/include/linux/percpu.h
7955 +++ b/include/linux/percpu.h
7956 @@ -18,6 +18,35 @@
7957  #define PERCPU_MODULE_RESERVE          0
7958  #endif
7959  
7960 +#ifdef CONFIG_PREEMPT_RT_FULL
7961 +
7962 +#define get_local_var(var) (*({        \
7963 +       migrate_disable();      \
7964 +       this_cpu_ptr(&var);     }))
7965 +
7966 +#define put_local_var(var) do {        \
7967 +       (void)&(var);           \
7968 +       migrate_enable();       \
7969 +} while (0)
7970 +
7971 +# define get_local_ptr(var) ({ \
7972 +       migrate_disable();      \
7973 +       this_cpu_ptr(var);      })
7974 +
7975 +# define put_local_ptr(var) do {       \
7976 +       (void)(var);                    \
7977 +       migrate_enable();               \
7978 +} while (0)
7979 +
7980 +#else
7981 +
7982 +#define get_local_var(var)     get_cpu_var(var)
7983 +#define put_local_var(var)     put_cpu_var(var)
7984 +#define get_local_ptr(var)     get_cpu_ptr(var)
7985 +#define put_local_ptr(var)     put_cpu_ptr(var)
7986 +
7987 +#endif
7988 +
7989  /* minimum unit size, also is the maximum supported allocation size */
7990  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
7991  
7992 diff --git a/include/linux/pid.h b/include/linux/pid.h
7993 index 23705a53abba..2cc64b779f03 100644
7994 --- a/include/linux/pid.h
7995 +++ b/include/linux/pid.h
7996 @@ -2,6 +2,7 @@
7997  #define _LINUX_PID_H
7998  
7999  #include <linux/rcupdate.h>
8000 +#include <linux/atomic.h>
8001  
8002  enum pid_type
8003  {
8004 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8005 index 75e4e30677f1..1cfb1cb72354 100644
8006 --- a/include/linux/preempt.h
8007 +++ b/include/linux/preempt.h
8008 @@ -50,7 +50,11 @@
8009  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8010  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8011  
8012 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8013 +#ifndef CONFIG_PREEMPT_RT_FULL
8014 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8015 +#else
8016 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8017 +#endif
8018  
8019  /* We use the MSB mostly because its available */
8020  #define PREEMPT_NEED_RESCHED   0x80000000
8021 @@ -59,9 +63,15 @@
8022  #include <asm/preempt.h>
8023  
8024  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8025 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8026  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8027                                  | NMI_MASK))
8028 +#ifndef CONFIG_PREEMPT_RT_FULL
8029 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8030 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8031 +#else
8032 +# define softirq_count()       (0UL)
8033 +extern int in_serving_softirq(void);
8034 +#endif
8035  
8036  /*
8037   * Are we doing bottom half or hardware interrupt processing?
8038 @@ -72,7 +82,6 @@
8039  #define in_irq()               (hardirq_count())
8040  #define in_softirq()           (softirq_count())
8041  #define in_interrupt()         (irq_count())
8042 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8043  
8044  /*
8045   * Are we in NMI context?
8046 @@ -91,7 +100,11 @@
8047  /*
8048   * The preempt_count offset after spin_lock()
8049   */
8050 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8051  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8052 +#else
8053 +#define PREEMPT_LOCK_OFFSET    0
8054 +#endif
8055  
8056  /*
8057   * The preempt_count offset needed for things like:
8058 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
8059  #define preempt_count_inc() preempt_count_add(1)
8060  #define preempt_count_dec() preempt_count_sub(1)
8061  
8062 +#ifdef CONFIG_PREEMPT_LAZY
8063 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8064 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8065 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8066 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8067 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8068 +#else
8069 +#define add_preempt_lazy_count(val)    do { } while (0)
8070 +#define sub_preempt_lazy_count(val)    do { } while (0)
8071 +#define inc_preempt_lazy_count()       do { } while (0)
8072 +#define dec_preempt_lazy_count()       do { } while (0)
8073 +#define preempt_lazy_count()           (0)
8074 +#endif
8075 +
8076  #ifdef CONFIG_PREEMPT_COUNT
8077  
8078  #define preempt_disable() \
8079 @@ -148,13 +175,25 @@ do { \
8080         barrier(); \
8081  } while (0)
8082  
8083 +#define preempt_lazy_disable() \
8084 +do { \
8085 +       inc_preempt_lazy_count(); \
8086 +       barrier(); \
8087 +} while (0)
8088 +
8089  #define sched_preempt_enable_no_resched() \
8090  do { \
8091         barrier(); \
8092         preempt_count_dec(); \
8093  } while (0)
8094  
8095 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8096 +#ifdef CONFIG_PREEMPT_RT_BASE
8097 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8098 +# define preempt_check_resched_rt() preempt_check_resched()
8099 +#else
8100 +# define preempt_enable_no_resched() preempt_enable()
8101 +# define preempt_check_resched_rt() barrier();
8102 +#endif
8103  
8104  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8105  
8106 @@ -179,6 +218,13 @@ do { \
8107                 __preempt_schedule(); \
8108  } while (0)
8109  
8110 +#define preempt_lazy_enable() \
8111 +do { \
8112 +       dec_preempt_lazy_count(); \
8113 +       barrier(); \
8114 +       preempt_check_resched(); \
8115 +} while (0)
8116 +
8117  #else /* !CONFIG_PREEMPT */
8118  #define preempt_enable() \
8119  do { \
8120 @@ -224,6 +270,7 @@ do { \
8121  #define preempt_disable_notrace()              barrier()
8122  #define preempt_enable_no_resched_notrace()    barrier()
8123  #define preempt_enable_notrace()               barrier()
8124 +#define preempt_check_resched_rt()             barrier()
8125  #define preemptible()                          0
8126  
8127  #endif /* CONFIG_PREEMPT_COUNT */
8128 @@ -244,10 +291,31 @@ do { \
8129  } while (0)
8130  #define preempt_fold_need_resched() \
8131  do { \
8132 -       if (tif_need_resched()) \
8133 +       if (tif_need_resched_now()) \
8134                 set_preempt_need_resched(); \
8135  } while (0)
8136  
8137 +#ifdef CONFIG_PREEMPT_RT_FULL
8138 +# define preempt_disable_rt()          preempt_disable()
8139 +# define preempt_enable_rt()           preempt_enable()
8140 +# define preempt_disable_nort()                barrier()
8141 +# define preempt_enable_nort()         barrier()
8142 +# ifdef CONFIG_SMP
8143 +   extern void migrate_disable(void);
8144 +   extern void migrate_enable(void);
8145 +# else /* CONFIG_SMP */
8146 +#  define migrate_disable()            barrier()
8147 +#  define migrate_enable()             barrier()
8148 +# endif /* CONFIG_SMP */
8149 +#else
8150 +# define preempt_disable_rt()          barrier()
8151 +# define preempt_enable_rt()           barrier()
8152 +# define preempt_disable_nort()                preempt_disable()
8153 +# define preempt_enable_nort()         preempt_enable()
8154 +# define migrate_disable()             preempt_disable()
8155 +# define migrate_enable()              preempt_enable()
8156 +#endif
8157 +
8158  #ifdef CONFIG_PREEMPT_NOTIFIERS
8159  
8160  struct preempt_notifier;
8161 diff --git a/include/linux/printk.h b/include/linux/printk.h
8162 index eac1af8502bb..37e647af0b0b 100644
8163 --- a/include/linux/printk.h
8164 +++ b/include/linux/printk.h
8165 @@ -126,9 +126,11 @@ struct va_format {
8166  #ifdef CONFIG_EARLY_PRINTK
8167  extern asmlinkage __printf(1, 2)
8168  void early_printk(const char *fmt, ...);
8169 +extern void printk_kill(void);
8170  #else
8171  static inline __printf(1, 2) __cold
8172  void early_printk(const char *s, ...) { }
8173 +static inline void printk_kill(void) { }
8174  #endif
8175  
8176  #ifdef CONFIG_PRINTK_NMI
8177 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8178 index af3581b8a451..f87f87dec84c 100644
8179 --- a/include/linux/radix-tree.h
8180 +++ b/include/linux/radix-tree.h
8181 @@ -289,9 +289,19 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
8182  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8183                         void ***results, unsigned long *indices,
8184                         unsigned long first_index, unsigned int max_items);
8185 +#ifdef CONFIG_PREEMPT_RT_FULL
8186 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
8187 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
8188 +static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
8189 +{
8190 +       return 0;
8191 +};
8192 +
8193 +#else
8194  int radix_tree_preload(gfp_t gfp_mask);
8195  int radix_tree_maybe_preload(gfp_t gfp_mask);
8196  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8197 +#endif
8198  void radix_tree_init(void);
8199  void *radix_tree_tag_set(struct radix_tree_root *root,
8200                         unsigned long index, unsigned int tag);
8201 @@ -316,7 +326,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8202  
8203  static inline void radix_tree_preload_end(void)
8204  {
8205 -       preempt_enable();
8206 +       preempt_enable_nort();
8207  }
8208  
8209  /**
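
A sketch of the usual preload-then-insert pattern with the stubs above: on
RT, radix_tree_preload() becomes a no-op because the per-CPU preload pool
cannot be protected by preempt_disable(), and radix_tree_preload_end() maps
to preempt_enable_nort(). my_tree and my_insert are hypothetical.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(my_tree, GFP_KERNEL);
static DEFINE_SPINLOCK(my_tree_lock);

static int my_insert(unsigned long index, void *item)
{
        int err;

        /* !RT: preallocate nodes and return with preemption disabled.
         * RT:  a no-op; nodes are allocated inside radix_tree_insert(),
         *      which is fine because the spinlock sleeps on RT. */
        err = radix_tree_preload(GFP_KERNEL);
        if (err)
                return err;

        spin_lock(&my_tree_lock);
        err = radix_tree_insert(&my_tree, index, item);
        spin_unlock(&my_tree_lock);

        radix_tree_preload_end();       /* !RT: preempt_enable() */
        return err;
}
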
8210 diff --git a/include/linux/random.h b/include/linux/random.h
8211 index 7bd2403e4fef..b2df7148a42b 100644
8212 --- a/include/linux/random.h
8213 +++ b/include/linux/random.h
8214 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8215  
8216  extern void add_input_randomness(unsigned int type, unsigned int code,
8217                                  unsigned int value) __latent_entropy;
8218 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8219 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8220  
8221  extern void get_random_bytes(void *buf, int nbytes);
8222  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8223 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8224 index e585018498d5..25c64474fc27 100644
8225 --- a/include/linux/rbtree.h
8226 +++ b/include/linux/rbtree.h
8227 @@ -31,7 +31,7 @@
8228  
8229  #include <linux/kernel.h>
8230  #include <linux/stddef.h>
8231 -#include <linux/rcupdate.h>
8232 +#include <linux/rcu_assign_pointer.h>
8233  
8234  struct rb_node {
8235         unsigned long  __rb_parent_color;
8236 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8237 index d076183e49be..36bfb4dd57ae 100644
8238 --- a/include/linux/rbtree_augmented.h
8239 +++ b/include/linux/rbtree_augmented.h
8240 @@ -26,6 +26,7 @@
8241  
8242  #include <linux/compiler.h>
8243  #include <linux/rbtree.h>
8244 +#include <linux/rcupdate.h>
8245  
8246  /*
8247   * Please note - only struct rb_augment_callbacks and the prototypes for
8248 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8249 new file mode 100644
8250 index 000000000000..7066962a4379
8251 --- /dev/null
8252 +++ b/include/linux/rcu_assign_pointer.h
8253 @@ -0,0 +1,54 @@
8254 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8255 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8256 +#include <linux/compiler.h>
8257 +#include <asm/barrier.h>
8258 +
8259 +/**
8260 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8261 + * @v: The value to statically initialize with.
8262 + */
8263 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8264 +
8265 +/**
8266 + * rcu_assign_pointer() - assign to RCU-protected pointer
8267 + * @p: pointer to assign to
8268 + * @v: value to assign (publish)
8269 + *
8270 + * Assigns the specified value to the specified RCU-protected
8271 + * pointer, ensuring that any concurrent RCU readers will see
8272 + * any prior initialization.
8273 + *
8274 + * Inserts memory barriers on architectures that require them
8275 + * (which is most of them), and also prevents the compiler from
8276 + * reordering the code that initializes the structure after the pointer
8277 + * assignment.  More importantly, this call documents which pointers
8278 + * will be dereferenced by RCU read-side code.
8279 + *
8280 + * In some special cases, you may use RCU_INIT_POINTER() instead
8281 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8282 + * to the fact that it does not constrain either the CPU or the compiler.
8283 + * That said, using RCU_INIT_POINTER() when you should have used
8284 + * rcu_assign_pointer() is a very bad thing that results in
8285 + * impossible-to-diagnose memory corruption.  So please be careful.
8286 + * See the RCU_INIT_POINTER() comment header for details.
8287 + *
8288 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8289 + * once, appearances notwithstanding.  One of the "extra" evaluations
8290 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8291 + * neither of which actually execute the argument.  As with most cpp
8292 + * macros, this execute-arguments-only-once property is important, so
8293 + * please be careful when making changes to rcu_assign_pointer() and the
8294 + * other macros that it invokes.
8295 + */
8296 +#define rcu_assign_pointer(p, v)                                             \
8297 +({                                                                           \
8298 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8299 +                                                                             \
8300 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8301 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8302 +       else                                                                  \
8303 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8304 +       _r_a_p__v;                                                            \
8305 +})
8306 +
8307 +#endif
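
Editor's note: the new rcu_assign_pointer.h above only moves RCU_INITIALIZER() and rcu_assign_pointer() out of rcupdate.h, so rbtree.h can use them without pulling in the whole RCU header. A short publish/read sketch of the macro being moved (struct foo and gp are made-up names, not from the patch):

#include <linux/rcupdate.h>

struct foo {
	int a;
};

static struct foo __rcu *gp;		/* hypothetical global pointer */

static void publish(struct foo *newp)
{
	newp->a = 1;			/* initialise first ...            */
	rcu_assign_pointer(gp, newp);	/* ... then publish with release
					 * semantics (smp_store_release()) */
}

static int read_a(void)
{
	struct foo *p;
	int val = 0;

	rcu_read_lock();
	p = rcu_dereference(gp);	/* pairs with rcu_assign_pointer() */
	if (p)
		val = p->a;
	rcu_read_unlock();
	return val;
}
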
8308 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8309 index 321f9ed552a9..a52a110bf815 100644
8310 --- a/include/linux/rcupdate.h
8311 +++ b/include/linux/rcupdate.h
8312 @@ -46,6 +46,7 @@
8313  #include <linux/compiler.h>
8314  #include <linux/ktime.h>
8315  #include <linux/irqflags.h>
8316 +#include <linux/rcu_assign_pointer.h>
8317  
8318  #include <asm/barrier.h>
8319  
8320 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8321  
8322  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8323  
8324 +#ifdef CONFIG_PREEMPT_RT_FULL
8325 +#define call_rcu_bh    call_rcu
8326 +#else
8327  /**
8328   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8329   * @head: structure to be used for queueing the RCU updates.
8330 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8331   */
8332  void call_rcu_bh(struct rcu_head *head,
8333                  rcu_callback_t func);
8334 +#endif
8335  
8336  /**
8337   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8338 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8339   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8340   */
8341  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8342 +#ifndef CONFIG_PREEMPT_RT_FULL
8343 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8344 +#else
8345 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8346 +#endif
8347  
8348  #else /* #ifdef CONFIG_PREEMPT_RCU */
8349  
8350 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8351         return 0;
8352  }
8353  
8354 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8355 +
8356  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8357  
8358  /* Internal to kernel */
8359 @@ -501,7 +513,14 @@ extern struct lockdep_map rcu_callback_map;
8360  int debug_lockdep_rcu_enabled(void);
8361  
8362  int rcu_read_lock_held(void);
8363 +#ifdef CONFIG_PREEMPT_RT_FULL
8364 +static inline int rcu_read_lock_bh_held(void)
8365 +{
8366 +       return rcu_read_lock_held();
8367 +}
8368 +#else
8369  int rcu_read_lock_bh_held(void);
8370 +#endif
8371  
8372  /**
8373   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8374 @@ -622,54 +641,6 @@ static inline void rcu_preempt_sleep_check(void)
8375  })
8376  
8377  /**
8378 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8379 - * @v: The value to statically initialize with.
8380 - */
8381 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8382 -
8383 -/**
8384 - * rcu_assign_pointer() - assign to RCU-protected pointer
8385 - * @p: pointer to assign to
8386 - * @v: value to assign (publish)
8387 - *
8388 - * Assigns the specified value to the specified RCU-protected
8389 - * pointer, ensuring that any concurrent RCU readers will see
8390 - * any prior initialization.
8391 - *
8392 - * Inserts memory barriers on architectures that require them
8393 - * (which is most of them), and also prevents the compiler from
8394 - * reordering the code that initializes the structure after the pointer
8395 - * assignment.  More importantly, this call documents which pointers
8396 - * will be dereferenced by RCU read-side code.
8397 - *
8398 - * In some special cases, you may use RCU_INIT_POINTER() instead
8399 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8400 - * to the fact that it does not constrain either the CPU or the compiler.
8401 - * That said, using RCU_INIT_POINTER() when you should have used
8402 - * rcu_assign_pointer() is a very bad thing that results in
8403 - * impossible-to-diagnose memory corruption.  So please be careful.
8404 - * See the RCU_INIT_POINTER() comment header for details.
8405 - *
8406 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8407 - * once, appearances notwithstanding.  One of the "extra" evaluations
8408 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8409 - * neither of which actually execute the argument.  As with most cpp
8410 - * macros, this execute-arguments-only-once property is important, so
8411 - * please be careful when making changes to rcu_assign_pointer() and the
8412 - * other macros that it invokes.
8413 - */
8414 -#define rcu_assign_pointer(p, v)                                             \
8415 -({                                                                           \
8416 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8417 -                                                                             \
8418 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8419 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8420 -       else                                                                  \
8421 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8422 -       _r_a_p__v;                                                            \
8423 -})
8424 -
8425 -/**
8426   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8427   * @p: The pointer to read
8428   *
8429 @@ -947,10 +918,14 @@ static inline void rcu_read_unlock(void)
8430  static inline void rcu_read_lock_bh(void)
8431  {
8432         local_bh_disable();
8433 +#ifdef CONFIG_PREEMPT_RT_FULL
8434 +       rcu_read_lock();
8435 +#else
8436         __acquire(RCU_BH);
8437         rcu_lock_acquire(&rcu_bh_lock_map);
8438         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8439                          "rcu_read_lock_bh() used illegally while idle");
8440 +#endif
8441  }
8442  
8443  /*
8444 @@ -960,10 +935,14 @@ static inline void rcu_read_lock_bh(void)
8445   */
8446  static inline void rcu_read_unlock_bh(void)
8447  {
8448 +#ifdef CONFIG_PREEMPT_RT_FULL
8449 +       rcu_read_unlock();
8450 +#else
8451         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8452                          "rcu_read_unlock_bh() used illegally while idle");
8453         rcu_lock_release(&rcu_bh_lock_map);
8454         __release(RCU_BH);
8455 +#endif
8456         local_bh_enable();
8457  }
8458  
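
Editor's note: with the rcupdate.h hunks above, CONFIG_PREEMPT_RT_FULL aliases call_rcu_bh to call_rcu and turns rcu_read_lock_bh()/rcu_read_unlock_bh() into a plain RCU read-side section wrapped in local_bh_disable()/local_bh_enable(). Existing RCU-bh readers compile unchanged; a sketch of what such a reader now does on RT (bh_val is a placeholder):

#include <linux/rcupdate.h>

static int __rcu *bh_val;	/* hypothetical pointer, for illustration */

static int bh_reader(void)
{
	int *p, ret = 0;

	rcu_read_lock_bh();		/* RT: local_bh_disable() + rcu_read_lock() */
	p = rcu_dereference_bh(bh_val);
	if (p)
		ret = *p;
	rcu_read_unlock_bh();		/* RT: rcu_read_unlock() + local_bh_enable() */
	return ret;
}
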
8459 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
8460 index 63a4e4cf40a5..08ab12df2863 100644
8461 --- a/include/linux/rcutree.h
8462 +++ b/include/linux/rcutree.h
8463 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
8464         rcu_note_context_switch();
8465  }
8466  
8467 +#ifdef CONFIG_PREEMPT_RT_FULL
8468 +# define synchronize_rcu_bh    synchronize_rcu
8469 +#else
8470  void synchronize_rcu_bh(void);
8471 +#endif
8472  void synchronize_sched_expedited(void);
8473  void synchronize_rcu_expedited(void);
8474  
8475 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
8476  }
8477  
8478  void rcu_barrier(void);
8479 +#ifdef CONFIG_PREEMPT_RT_FULL
8480 +# define rcu_barrier_bh                rcu_barrier
8481 +#else
8482  void rcu_barrier_bh(void);
8483 +#endif
8484  void rcu_barrier_sched(void);
8485  unsigned long get_state_synchronize_rcu(void);
8486  void cond_synchronize_rcu(unsigned long oldstate);
8487 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
8488  extern unsigned long rcutorture_testseq;
8489  extern unsigned long rcutorture_vernum;
8490  unsigned long rcu_batches_started(void);
8491 -unsigned long rcu_batches_started_bh(void);
8492  unsigned long rcu_batches_started_sched(void);
8493  unsigned long rcu_batches_completed(void);
8494 -unsigned long rcu_batches_completed_bh(void);
8495  unsigned long rcu_batches_completed_sched(void);
8496  unsigned long rcu_exp_batches_completed(void);
8497  unsigned long rcu_exp_batches_completed_sched(void);
8498  void show_rcu_gp_kthreads(void);
8499  
8500  void rcu_force_quiescent_state(void);
8501 -void rcu_bh_force_quiescent_state(void);
8502  void rcu_sched_force_quiescent_state(void);
8503  
8504  void rcu_idle_enter(void);
8505 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
8506  
8507  bool rcu_is_watching(void);
8508  
8509 +#ifndef CONFIG_PREEMPT_RT_FULL
8510 +void rcu_bh_force_quiescent_state(void);
8511 +unsigned long rcu_batches_started_bh(void);
8512 +unsigned long rcu_batches_completed_bh(void);
8513 +#else
8514 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
8515 +# define rcu_batches_completed_bh      rcu_batches_completed
8516 +# define rcu_batches_started_bh                rcu_batches_completed
8517 +#endif
8518 +
8519  void rcu_all_qs(void);
8520  
8521  /* RCUtree hotplug events */
8522 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
8523 index 1abba5ce2a2f..30211c627511 100644
8524 --- a/include/linux/rtmutex.h
8525 +++ b/include/linux/rtmutex.h
8526 @@ -13,11 +13,15 @@
8527  #define __LINUX_RT_MUTEX_H
8528  
8529  #include <linux/linkage.h>
8530 +#include <linux/spinlock_types_raw.h>
8531  #include <linux/rbtree.h>
8532 -#include <linux/spinlock_types.h>
8533  
8534  extern int max_lock_depth; /* for sysctl */
8535  
8536 +#ifdef CONFIG_DEBUG_MUTEXES
8537 +#include <linux/debug_locks.h>
8538 +#endif
8539 +
8540  /**
8541   * The rt_mutex structure
8542   *
8543 @@ -31,8 +35,8 @@ struct rt_mutex {
8544         struct rb_root          waiters;
8545         struct rb_node          *waiters_leftmost;
8546         struct task_struct      *owner;
8547 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8548         int                     save_state;
8549 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8550         const char              *name, *file;
8551         int                     line;
8552         void                    *magic;
8553 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
8554  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
8555  #endif
8556  
8557 +# define rt_mutex_init(mutex)                                  \
8558 +       do {                                                    \
8559 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
8560 +               __rt_mutex_init(mutex, #mutex);                 \
8561 +       } while (0)
8562 +
8563  #ifdef CONFIG_DEBUG_RT_MUTEXES
8564  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8565         , .name = #mutexname, .file = __FILE__, .line = __LINE__
8566 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
8567   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8568  #else
8569  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8570 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
8571  # define rt_mutex_debug_task_free(t)                   do { } while (0)
8572  #endif
8573  
8574 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8575 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8576 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8577 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8578         , .waiters = RB_ROOT \
8579         , .owner = NULL \
8580 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8581 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8582 +
8583 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8584 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8585 +
8586 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8587 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
8588 +       , .save_state = 1 }
8589  
8590  #define DEFINE_RT_MUTEX(mutexname) \
8591         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8592 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
8593  
8594  extern void rt_mutex_lock(struct rt_mutex *lock);
8595  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8596 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8597  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8598                                struct hrtimer_sleeper *timeout);
8599  
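
Editor's note: the rtmutex.h hunk makes rt_mutex_init() always initialise the wait_lock, moves save_state out of the debug-only part of struct rt_mutex, adds a _SAVE_STATE initializer for sleeping-spinlock use, and declares rt_mutex_lock_killable(). A minimal usage sketch of the declared API (example_lock is a placeholder name, not from the patch):

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(example_lock);		/* uses __RT_MUTEX_INITIALIZER() */

static int do_work_killable(void)
{
	int ret;

	ret = rt_mutex_lock_killable(&example_lock);	/* added by this patch */
	if (ret)
		return ret;		/* fatal signal arrived while blocked */

	/* ... critical section ... */

	rt_mutex_unlock(&example_lock);
	return 0;
}
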
8600 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
8601 new file mode 100644
8602 index 000000000000..49ed2d45d3be
8603 --- /dev/null
8604 +++ b/include/linux/rwlock_rt.h
8605 @@ -0,0 +1,99 @@
8606 +#ifndef __LINUX_RWLOCK_RT_H
8607 +#define __LINUX_RWLOCK_RT_H
8608 +
8609 +#ifndef __LINUX_SPINLOCK_H
8610 +#error Do not include directly. Use spinlock.h
8611 +#endif
8612 +
8613 +#define rwlock_init(rwl)                               \
8614 +do {                                                   \
8615 +       static struct lock_class_key __key;             \
8616 +                                                       \
8617 +       rt_mutex_init(&(rwl)->lock);                    \
8618 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
8619 +} while (0)
8620 +
8621 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8622 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8623 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8624 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8625 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8626 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8627 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8628 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8629 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8630 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8631 +
8632 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
8633 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
8634 +
8635 +#define write_trylock_irqsave(lock, flags)     \
8636 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8637 +
8638 +#define read_lock_irqsave(lock, flags)                 \
8639 +       do {                                            \
8640 +               typecheck(unsigned long, flags);        \
8641 +               flags = rt_read_lock_irqsave(lock);     \
8642 +       } while (0)
8643 +
8644 +#define write_lock_irqsave(lock, flags)                        \
8645 +       do {                                            \
8646 +               typecheck(unsigned long, flags);        \
8647 +               flags = rt_write_lock_irqsave(lock);    \
8648 +       } while (0)
8649 +
8650 +#define read_lock(lock)                rt_read_lock(lock)
8651 +
8652 +#define read_lock_bh(lock)                             \
8653 +       do {                                            \
8654 +               local_bh_disable();                     \
8655 +               rt_read_lock(lock);                     \
8656 +       } while (0)
8657 +
8658 +#define read_lock_irq(lock)    read_lock(lock)
8659 +
8660 +#define write_lock(lock)       rt_write_lock(lock)
8661 +
8662 +#define write_lock_bh(lock)                            \
8663 +       do {                                            \
8664 +               local_bh_disable();                     \
8665 +               rt_write_lock(lock);                    \
8666 +       } while (0)
8667 +
8668 +#define write_lock_irq(lock)   write_lock(lock)
8669 +
8670 +#define read_unlock(lock)      rt_read_unlock(lock)
8671 +
8672 +#define read_unlock_bh(lock)                           \
8673 +       do {                                            \
8674 +               rt_read_unlock(lock);                   \
8675 +               local_bh_enable();                      \
8676 +       } while (0)
8677 +
8678 +#define read_unlock_irq(lock)  read_unlock(lock)
8679 +
8680 +#define write_unlock(lock)     rt_write_unlock(lock)
8681 +
8682 +#define write_unlock_bh(lock)                          \
8683 +       do {                                            \
8684 +               rt_write_unlock(lock);                  \
8685 +               local_bh_enable();                      \
8686 +       } while (0)
8687 +
8688 +#define write_unlock_irq(lock) write_unlock(lock)
8689 +
8690 +#define read_unlock_irqrestore(lock, flags)            \
8691 +       do {                                            \
8692 +               typecheck(unsigned long, flags);        \
8693 +               (void) flags;                           \
8694 +               rt_read_unlock(lock);                   \
8695 +       } while (0)
8696 +
8697 +#define write_unlock_irqrestore(lock, flags) \
8698 +       do {                                            \
8699 +               typecheck(unsigned long, flags);        \
8700 +               (void) flags;                           \
8701 +               rt_write_unlock(lock);                  \
8702 +       } while (0)
8703 +
8704 +#endif
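
Editor's note: the new rwlock_rt.h maps the rwlock API onto rt_mutex: readers and writers sleep, only a single reader (with owner recursion) is allowed, and the _irq/_irqsave variants no longer disable interrupts. Callers keep the stock API, roughly as in this sketch (names are placeholders, not from the patch):

static DEFINE_RWLOCK(example_rwlock);	/* placeholder lock, for illustration */
static int example_value;

static int example_read(void)
{
	int v;

	read_lock(&example_rwlock);	/* RT: rt_read_lock(), a sleeping lock */
	v = example_value;
	read_unlock(&example_rwlock);
	return v;
}

static void example_write(int v)
{
	unsigned long flags;

	/* RT: does not actually disable interrupts */
	write_lock_irqsave(&example_rwlock, flags);
	example_value = v;
	write_unlock_irqrestore(&example_rwlock, flags);
}
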
8705 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
8706 index cc0072e93e36..5317cd957292 100644
8707 --- a/include/linux/rwlock_types.h
8708 +++ b/include/linux/rwlock_types.h
8709 @@ -1,6 +1,10 @@
8710  #ifndef __LINUX_RWLOCK_TYPES_H
8711  #define __LINUX_RWLOCK_TYPES_H
8712  
8713 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8714 +# error "Do not include directly, include spinlock_types.h"
8715 +#endif
8716 +
8717  /*
8718   * include/linux/rwlock_types.h - generic rwlock type definitions
8719   *                               and initializers
8720 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
8721 new file mode 100644
8722 index 000000000000..51b28d775fe1
8723 --- /dev/null
8724 +++ b/include/linux/rwlock_types_rt.h
8725 @@ -0,0 +1,33 @@
8726 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8727 +#define __LINUX_RWLOCK_TYPES_RT_H
8728 +
8729 +#ifndef __LINUX_SPINLOCK_TYPES_H
8730 +#error "Do not include directly. Include spinlock_types.h instead"
8731 +#endif
8732 +
8733 +/*
8734 + * rwlocks - rtmutex which allows single reader recursion
8735 + */
8736 +typedef struct {
8737 +       struct rt_mutex         lock;
8738 +       int                     read_depth;
8739 +       unsigned int            break_lock;
8740 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8741 +       struct lockdep_map      dep_map;
8742 +#endif
8743 +} rwlock_t;
8744 +
8745 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8746 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
8747 +#else
8748 +# define RW_DEP_MAP_INIT(lockname)
8749 +#endif
8750 +
8751 +#define __RW_LOCK_UNLOCKED(name) \
8752 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8753 +         RW_DEP_MAP_INIT(name) }
8754 +
8755 +#define DEFINE_RWLOCK(name) \
8756 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
8757 +
8758 +#endif
8759 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
8760 index dd1d14250340..8e1f44ff1f2f 100644
8761 --- a/include/linux/rwsem.h
8762 +++ b/include/linux/rwsem.h
8763 @@ -19,6 +19,10 @@
8764  #include <linux/osq_lock.h>
8765  #endif
8766  
8767 +#ifdef CONFIG_PREEMPT_RT_FULL
8768 +#include <linux/rwsem_rt.h>
8769 +#else /* PREEMPT_RT_FULL */
8770 +
8771  struct rw_semaphore;
8772  
8773  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
8774 @@ -184,4 +188,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
8775  # define up_read_non_owner(sem)                        up_read(sem)
8776  #endif
8777  
8778 +#endif /* !PREEMPT_RT_FULL */
8779 +
8780  #endif /* _LINUX_RWSEM_H */
8781 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
8782 new file mode 100644
8783 index 000000000000..e26bd95a57c3
8784 --- /dev/null
8785 +++ b/include/linux/rwsem_rt.h
8786 @@ -0,0 +1,167 @@
8787 +#ifndef _LINUX_RWSEM_RT_H
8788 +#define _LINUX_RWSEM_RT_H
8789 +
8790 +#ifndef _LINUX_RWSEM_H
8791 +#error "Include rwsem.h"
8792 +#endif
8793 +
8794 +/*
8795 + * RW-semaphores are a spinlock plus a reader-depth count.
8796 + *
8797 + * Note that the semantics are different from the usual
8798 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
8799 + * multiple readers to hold the lock at once, we only allow
8800 + * a read-lock owner to read-lock recursively. This is
8801 + * better for latency, makes the implementation inherently
8802 + * fair and makes it simpler as well.
8803 + */
8804 +
8805 +#include <linux/rtmutex.h>
8806 +
8807 +struct rw_semaphore {
8808 +       struct rt_mutex         lock;
8809 +       int                     read_depth;
8810 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8811 +       struct lockdep_map      dep_map;
8812 +#endif
8813 +};
8814 +
8815 +#define __RWSEM_INITIALIZER(name) \
8816 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
8817 +         RW_DEP_MAP_INIT(name) }
8818 +
8819 +#define DECLARE_RWSEM(lockname) \
8820 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
8821 +
8822 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
8823 +                                    struct lock_class_key *key);
8824 +
8825 +#define __rt_init_rwsem(sem, name, key)                        \
8826 +       do {                                            \
8827 +               rt_mutex_init(&(sem)->lock);            \
8828 +               __rt_rwsem_init((sem), (name), (key));\
8829 +       } while (0)
8830 +
8831 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
8832 +
8833 +# define rt_init_rwsem(sem)                            \
8834 +do {                                                   \
8835 +       static struct lock_class_key __key;             \
8836 +                                                       \
8837 +       __rt_init_rwsem((sem), #sem, &__key);           \
8838 +} while (0)
8839 +
8840 +extern void rt_down_write(struct rw_semaphore *rwsem);
8841 +extern int  rt_down_write_killable(struct rw_semaphore *rwsem);
8842 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
8843 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
8844 +extern int  rt_down_write_killable_nested(struct rw_semaphore *rwsem,
8845 +                                         int subclass);
8846 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
8847 +                                     struct lockdep_map *nest);
8848 +extern void rt__down_read(struct rw_semaphore *rwsem);
8849 +extern void rt_down_read(struct rw_semaphore *rwsem);
8850 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
8851 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
8852 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
8853 +extern void __rt_up_read(struct rw_semaphore *rwsem);
8854 +extern void rt_up_read(struct rw_semaphore *rwsem);
8855 +extern void rt_up_write(struct rw_semaphore *rwsem);
8856 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
8857 +
8858 +#define init_rwsem(sem)                rt_init_rwsem(sem)
8859 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
8860 +
8861 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
8862 +{
8863 +       /* rt_mutex_has_waiters() */
8864 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
8865 +}
8866 +
8867 +static inline void __down_read(struct rw_semaphore *sem)
8868 +{
8869 +       rt__down_read(sem);
8870 +}
8871 +
8872 +static inline void down_read(struct rw_semaphore *sem)
8873 +{
8874 +       rt_down_read(sem);
8875 +}
8876 +
8877 +static inline int __down_read_trylock(struct rw_semaphore *sem)
8878 +{
8879 +       return rt__down_read_trylock(sem);
8880 +}
8881 +
8882 +static inline int down_read_trylock(struct rw_semaphore *sem)
8883 +{
8884 +       return rt_down_read_trylock(sem);
8885 +}
8886 +
8887 +static inline void down_write(struct rw_semaphore *sem)
8888 +{
8889 +       rt_down_write(sem);
8890 +}
8891 +
8892 +static inline int down_write_killable(struct rw_semaphore *sem)
8893 +{
8894 +       return rt_down_write_killable(sem);
8895 +}
8896 +
8897 +static inline int down_write_trylock(struct rw_semaphore *sem)
8898 +{
8899 +       return rt_down_write_trylock(sem);
8900 +}
8901 +
8902 +static inline void __up_read(struct rw_semaphore *sem)
8903 +{
8904 +       __rt_up_read(sem);
8905 +}
8906 +
8907 +static inline void up_read(struct rw_semaphore *sem)
8908 +{
8909 +       rt_up_read(sem);
8910 +}
8911 +
8912 +static inline void up_write(struct rw_semaphore *sem)
8913 +{
8914 +       rt_up_write(sem);
8915 +}
8916 +
8917 +static inline void downgrade_write(struct rw_semaphore *sem)
8918 +{
8919 +       rt_downgrade_write(sem);
8920 +}
8921 +
8922 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
8923 +{
8924 +       return rt_down_read_nested(sem, subclass);
8925 +}
8926 +
8927 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
8928 +{
8929 +       rt_down_write_nested(sem, subclass);
8930 +}
8931 +
8932 +static inline int down_write_killable_nested(struct rw_semaphore *sem,
8933 +                                            int subclass)
8934 +{
8935 +       return rt_down_write_killable_nested(sem, subclass);
8936 +}
8937 +
8938 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8939 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8940 +               struct rw_semaphore *nest_lock)
8941 +{
8942 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
8943 +}
8944 +
8945 +#else
8946 +
8947 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8948 +               struct rw_semaphore *nest_lock)
8949 +{
8950 +       rt_down_write_nested_lock(sem, NULL);
8951 +}
8952 +#endif
8953 +#endif
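
Editor's note: as the comment at the top of rwsem_rt.h states, RT rw-semaphores are an rt_mutex plus a reader-depth count: only one reader at a time, with read-lock recursion allowed for the owner. The caller-visible API is unchanged; a sketch with placeholder names (not from the patch):

static DECLARE_RWSEM(example_sem);	/* placeholder, for illustration */
static int example_data;

static int example_lookup(void)
{
	int v;

	down_read(&example_sem);	/* RT: rt_down_read(), one reader at a time */
	v = example_data;
	up_read(&example_sem);
	return v;
}

static void example_update(int v)
{
	down_write(&example_sem);	/* RT: rt_down_write() on the underlying rt_mutex */
	example_data = v;
	up_write(&example_sem);
}
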
8954 diff --git a/include/linux/sched.h b/include/linux/sched.h
8955 index 75d9a57e212e..8cb7df0f56e3 100644
8956 --- a/include/linux/sched.h
8957 +++ b/include/linux/sched.h
8958 @@ -26,6 +26,7 @@ struct sched_param {
8959  #include <linux/nodemask.h>
8960  #include <linux/mm_types.h>
8961  #include <linux/preempt.h>
8962 +#include <asm/kmap_types.h>
8963  
8964  #include <asm/page.h>
8965  #include <asm/ptrace.h>
8966 @@ -243,10 +244,7 @@ extern char ___assert_task_state[1 - 2*!!(
8967                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
8968                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
8969  
8970 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
8971  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
8972 -#define task_is_stopped_or_traced(task)        \
8973 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
8974  #define task_contributes_to_load(task) \
8975                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
8976                                  (task->flags & PF_FROZEN) == 0 && \
8977 @@ -312,6 +310,11 @@ extern char ___assert_task_state[1 - 2*!!(
8978  
8979  #endif
8980  
8981 +#define __set_current_state_no_track(state_value)      \
8982 +       do { current->state = (state_value); } while (0)
8983 +#define set_current_state_no_track(state_value)                \
8984 +       set_mb(current->state, (state_value))
8985 +
8986  /* Task command name length */
8987  #define TASK_COMM_LEN 16
8988  
8989 @@ -1013,8 +1016,18 @@ struct wake_q_head {
8990         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
8991  
8992  extern void wake_q_add(struct wake_q_head *head,
8993 -                      struct task_struct *task);
8994 -extern void wake_up_q(struct wake_q_head *head);
8995 +                             struct task_struct *task);
8996 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
8997 +
8998 +static inline void wake_up_q(struct wake_q_head *head)
8999 +{
9000 +       __wake_up_q(head, false);
9001 +}
9002 +
9003 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9004 +{
9005 +       __wake_up_q(head, true);
9006 +}
9007  
9008  /*
9009   * sched-domains (multiprocessor balancing) declarations:
9010 @@ -1481,6 +1494,7 @@ struct task_struct {
9011         struct thread_info thread_info;
9012  #endif
9013         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9014 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9015         void *stack;
9016         atomic_t usage;
9017         unsigned int flags;     /* per process flags, defined below */
9018 @@ -1520,6 +1534,12 @@ struct task_struct {
9019  #endif
9020  
9021         unsigned int policy;
9022 +#ifdef CONFIG_PREEMPT_RT_FULL
9023 +       int migrate_disable;
9024 +# ifdef CONFIG_SCHED_DEBUG
9025 +       int migrate_disable_atomic;
9026 +# endif
9027 +#endif
9028         int nr_cpus_allowed;
9029         cpumask_t cpus_allowed;
9030  
9031 @@ -1654,6 +1674,9 @@ struct task_struct {
9032  
9033         struct task_cputime cputime_expires;
9034         struct list_head cpu_timers[3];
9035 +#ifdef CONFIG_PREEMPT_RT_BASE
9036 +       struct task_struct *posix_timer_list;
9037 +#endif
9038  
9039  /* process credentials */
9040         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9041 @@ -1685,10 +1708,15 @@ struct task_struct {
9042  /* signal handlers */
9043         struct signal_struct *signal;
9044         struct sighand_struct *sighand;
9045 +       struct sigqueue *sigqueue_cache;
9046  
9047         sigset_t blocked, real_blocked;
9048         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9049         struct sigpending pending;
9050 +#ifdef CONFIG_PREEMPT_RT_FULL
9051 +       /* TODO: move me into ->restart_block ? */
9052 +       struct siginfo forced_info;
9053 +#endif
9054  
9055         unsigned long sas_ss_sp;
9056         size_t sas_ss_size;
9057 @@ -1917,6 +1945,12 @@ struct task_struct {
9058         /* bitmask and counter of trace recursion */
9059         unsigned long trace_recursion;
9060  #endif /* CONFIG_TRACING */
9061 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9062 +       u64 preempt_timestamp_hist;
9063 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9064 +       long timer_offset;
9065 +#endif
9066 +#endif
9067  #ifdef CONFIG_KCOV
9068         /* Coverage collection mode enabled for this task (0 if disabled). */
9069         enum kcov_mode kcov_mode;
9070 @@ -1942,9 +1976,23 @@ struct task_struct {
9071         unsigned int    sequential_io;
9072         unsigned int    sequential_io_avg;
9073  #endif
9074 +#ifdef CONFIG_PREEMPT_RT_BASE
9075 +       struct rcu_head put_rcu;
9076 +       int softirq_nestcnt;
9077 +       unsigned int softirqs_raised;
9078 +#endif
9079 +#ifdef CONFIG_PREEMPT_RT_FULL
9080 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9081 +       int kmap_idx;
9082 +       pte_t kmap_pte[KM_TYPE_NR];
9083 +# endif
9084 +#endif
9085  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9086         unsigned long   task_state_change;
9087  #endif
9088 +#ifdef CONFIG_PREEMPT_RT_FULL
9089 +       int xmit_recursion;
9090 +#endif
9091         int pagefault_disabled;
9092  #ifdef CONFIG_MMU
9093         struct task_struct *oom_reaper_list;
9094 @@ -1984,14 +2032,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9095  }
9096  #endif
9097  
9098 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9099 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9100 -
9101 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9102 -{
9103 -       return p->nr_cpus_allowed;
9104 -}
9105 -
9106  #define TNF_MIGRATED   0x01
9107  #define TNF_NO_GROUP   0x02
9108  #define TNF_SHARED     0x04
9109 @@ -2207,6 +2247,15 @@ extern struct pid *cad_pid;
9110  extern void free_task(struct task_struct *tsk);
9111  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9112  
9113 +#ifdef CONFIG_PREEMPT_RT_BASE
9114 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9115 +
9116 +static inline void put_task_struct(struct task_struct *t)
9117 +{
9118 +       if (atomic_dec_and_test(&t->usage))
9119 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9120 +}
9121 +#else
9122  extern void __put_task_struct(struct task_struct *t);
9123  
9124  static inline void put_task_struct(struct task_struct *t)
9125 @@ -2214,6 +2263,7 @@ static inline void put_task_struct(struct task_struct *t)
9126         if (atomic_dec_and_test(&t->usage))
9127                 __put_task_struct(t);
9128  }
9129 +#endif
9130  
9131  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9132  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9133 @@ -2255,6 +2305,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9134  /*
9135   * Per process flags
9136   */
9137 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9138  #define PF_EXITING     0x00000004      /* getting shut down */
9139  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9140  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9141 @@ -2423,6 +2474,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9142  
9143  extern int set_cpus_allowed_ptr(struct task_struct *p,
9144                                 const struct cpumask *new_mask);
9145 +int migrate_me(void);
9146 +void tell_sched_cpu_down_begin(int cpu);
9147 +void tell_sched_cpu_down_done(int cpu);
9148 +
9149  #else
9150  static inline void do_set_cpus_allowed(struct task_struct *p,
9151                                       const struct cpumask *new_mask)
9152 @@ -2435,6 +2490,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9153                 return -EINVAL;
9154         return 0;
9155  }
9156 +static inline int migrate_me(void) { return 0; }
9157 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9158 +static inline void tell_sched_cpu_down_done(int cpu) { }
9159  #endif
9160  
9161  #ifdef CONFIG_NO_HZ_COMMON
9162 @@ -2673,6 +2731,7 @@ extern void xtime_update(unsigned long ticks);
9163  
9164  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9165  extern int wake_up_process(struct task_struct *tsk);
9166 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9167  extern void wake_up_new_task(struct task_struct *tsk);
9168  #ifdef CONFIG_SMP
9169   extern void kick_process(struct task_struct *tsk);
9170 @@ -2881,6 +2940,17 @@ static inline void mmdrop(struct mm_struct *mm)
9171                 __mmdrop(mm);
9172  }
9173  
9174 +#ifdef CONFIG_PREEMPT_RT_BASE
9175 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9176 +static inline void mmdrop_delayed(struct mm_struct *mm)
9177 +{
9178 +       if (atomic_dec_and_test(&mm->mm_count))
9179 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9180 +}
9181 +#else
9182 +# define mmdrop_delayed(mm)    mmdrop(mm)
9183 +#endif
9184 +
9185  static inline void mmdrop_async_fn(struct work_struct *work)
9186  {
9187         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9188 @@ -3273,6 +3343,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9189         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9190  }
9191  
9192 +#ifdef CONFIG_PREEMPT_LAZY
9193 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9194 +{
9195 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9196 +}
9197 +
9198 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9199 +{
9200 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9201 +}
9202 +
9203 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9204 +{
9205 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9206 +}
9207 +
9208 +static inline int need_resched_lazy(void)
9209 +{
9210 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9211 +}
9212 +
9213 +static inline int need_resched_now(void)
9214 +{
9215 +       return test_thread_flag(TIF_NEED_RESCHED);
9216 +}
9217 +
9218 +#else
9219 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9220 +static inline int need_resched_lazy(void) { return 0; }
9221 +
9222 +static inline int need_resched_now(void)
9223 +{
9224 +       return test_thread_flag(TIF_NEED_RESCHED);
9225 +}
9226 +
9227 +#endif
9228 +
9229  static inline int restart_syscall(void)
9230  {
9231         set_tsk_thread_flag(current, TIF_SIGPENDING);
9232 @@ -3304,6 +3411,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9233         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9234  }
9235  
9236 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9237 +{
9238 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9239 +               return true;
9240 +#ifdef CONFIG_PREEMPT_RT_FULL
9241 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9242 +               return true;
9243 +#endif
9244 +       return false;
9245 +}
9246 +
9247 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9248 +{
9249 +       bool traced_stopped;
9250 +
9251 +#ifdef CONFIG_PREEMPT_RT_FULL
9252 +       unsigned long flags;
9253 +
9254 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9255 +       traced_stopped = __task_is_stopped_or_traced(task);
9256 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9257 +#else
9258 +       traced_stopped = __task_is_stopped_or_traced(task);
9259 +#endif
9260 +       return traced_stopped;
9261 +}
9262 +
9263 +static inline bool task_is_traced(struct task_struct *task)
9264 +{
9265 +       bool traced = false;
9266 +
9267 +       if (task->state & __TASK_TRACED)
9268 +               return true;
9269 +#ifdef CONFIG_PREEMPT_RT_FULL
9270 +       /* in case the task is sleeping on tasklist_lock */
9271 +       raw_spin_lock_irq(&task->pi_lock);
9272 +       if (task->state & __TASK_TRACED)
9273 +               traced = true;
9274 +       else if (task->saved_state & __TASK_TRACED)
9275 +               traced = true;
9276 +       raw_spin_unlock_irq(&task->pi_lock);
9277 +#endif
9278 +       return traced;
9279 +}
9280 +
9281  /*
9282   * cond_resched() and cond_resched_lock(): latency reduction via
9283   * explicit rescheduling in places that are safe. The return
9284 @@ -3329,12 +3481,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9285         __cond_resched_lock(lock);                              \
9286  })
9287  
9288 +#ifndef CONFIG_PREEMPT_RT_FULL
9289  extern int __cond_resched_softirq(void);
9290  
9291  #define cond_resched_softirq() ({                                      \
9292         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9293         __cond_resched_softirq();                                       \
9294  })
9295 +#else
9296 +# define cond_resched_softirq()                cond_resched()
9297 +#endif
9298  
9299  static inline void cond_resched_rcu(void)
9300  {
9301 @@ -3509,6 +3665,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9302  
9303  #endif /* CONFIG_SMP */
9304  
9305 +static inline int __migrate_disabled(struct task_struct *p)
9306 +{
9307 +#ifdef CONFIG_PREEMPT_RT_FULL
9308 +       return p->migrate_disable;
9309 +#else
9310 +       return 0;
9311 +#endif
9312 +}
9313 +
9314 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9315 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9316 +{
9317 +       if (__migrate_disabled(p))
9318 +               return cpumask_of(task_cpu(p));
9319 +
9320 +       return &p->cpus_allowed;
9321 +}
9322 +
9323 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9324 +{
9325 +       if (__migrate_disabled(p))
9326 +               return 1;
9327 +       return p->nr_cpus_allowed;
9328 +}
9329 +
9330  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9331  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9332  
9333 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9334 index ead97654c4e9..3d7223ffdd3b 100644
9335 --- a/include/linux/seqlock.h
9336 +++ b/include/linux/seqlock.h
9337 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9338         return __read_seqcount_retry(s, start);
9339  }
9340  
9341 -
9342 -
9343 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9344 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9345  {
9346         s->sequence++;
9347         smp_wmb();
9348  }
9349  
9350 -static inline void raw_write_seqcount_end(seqcount_t *s)
9351 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9352 +{
9353 +       preempt_disable_rt();
9354 +       __raw_write_seqcount_begin(s);
9355 +}
9356 +
9357 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9358  {
9359         smp_wmb();
9360         s->sequence++;
9361  }
9362  
9363 +static inline void raw_write_seqcount_end(seqcount_t *s)
9364 +{
9365 +       __raw_write_seqcount_end(s);
9366 +       preempt_enable_rt();
9367 +}
9368 +
9369  /**
9370   * raw_write_seqcount_barrier - do a seq write barrier
9371   * @s: pointer to seqcount_t
9372 @@ -428,10 +438,32 @@ typedef struct {
9373  /*
9374   * Read side functions for starting and finalizing a read side section.
9375   */
9376 +#ifndef CONFIG_PREEMPT_RT_FULL
9377  static inline unsigned read_seqbegin(const seqlock_t *sl)
9378  {
9379         return read_seqcount_begin(&sl->seqcount);
9380  }
9381 +#else
9382 +/*
9383 + * Starvation safe read side for RT
9384 + */
9385 +static inline unsigned read_seqbegin(seqlock_t *sl)
9386 +{
9387 +       unsigned ret;
9388 +
9389 +repeat:
9390 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9391 +       if (unlikely(ret & 1)) {
9392 +               /*
9393 +                * Take the lock and let the writer proceed (i.e. possibly
9394 +                * boost it), otherwise we could loop here forever.
9395 +                */
9396 +               spin_unlock_wait(&sl->lock);
9397 +               goto repeat;
9398 +       }
9399 +       return ret;
9400 +}
9401 +#endif
9402  
9403  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9404  {
9405 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9406  static inline void write_seqlock(seqlock_t *sl)
9407  {
9408         spin_lock(&sl->lock);
9409 -       write_seqcount_begin(&sl->seqcount);
9410 +       __raw_write_seqcount_begin(&sl->seqcount);
9411 +}
9412 +
9413 +static inline int try_write_seqlock(seqlock_t *sl)
9414 +{
9415 +       if (spin_trylock(&sl->lock)) {
9416 +               __raw_write_seqcount_begin(&sl->seqcount);
9417 +               return 1;
9418 +       }
9419 +       return 0;
9420  }
9421  
9422  static inline void write_sequnlock(seqlock_t *sl)
9423  {
9424 -       write_seqcount_end(&sl->seqcount);
9425 +       __raw_write_seqcount_end(&sl->seqcount);
9426         spin_unlock(&sl->lock);
9427  }
9428  
9429  static inline void write_seqlock_bh(seqlock_t *sl)
9430  {
9431         spin_lock_bh(&sl->lock);
9432 -       write_seqcount_begin(&sl->seqcount);
9433 +       __raw_write_seqcount_begin(&sl->seqcount);
9434  }
9435  
9436  static inline void write_sequnlock_bh(seqlock_t *sl)
9437  {
9438 -       write_seqcount_end(&sl->seqcount);
9439 +       __raw_write_seqcount_end(&sl->seqcount);
9440         spin_unlock_bh(&sl->lock);
9441  }
9442  
9443  static inline void write_seqlock_irq(seqlock_t *sl)
9444  {
9445         spin_lock_irq(&sl->lock);
9446 -       write_seqcount_begin(&sl->seqcount);
9447 +       __raw_write_seqcount_begin(&sl->seqcount);
9448  }
9449  
9450  static inline void write_sequnlock_irq(seqlock_t *sl)
9451  {
9452 -       write_seqcount_end(&sl->seqcount);
9453 +       __raw_write_seqcount_end(&sl->seqcount);
9454         spin_unlock_irq(&sl->lock);
9455  }
9456  
9457 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9458         unsigned long flags;
9459  
9460         spin_lock_irqsave(&sl->lock, flags);
9461 -       write_seqcount_begin(&sl->seqcount);
9462 +       __raw_write_seqcount_begin(&sl->seqcount);
9463         return flags;
9464  }
9465  
9466 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9467  static inline void
9468  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9469  {
9470 -       write_seqcount_end(&sl->seqcount);
9471 +       __raw_write_seqcount_end(&sl->seqcount);
9472         spin_unlock_irqrestore(&sl->lock, flags);
9473  }
9474  
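
Editor's note: the seqlock.h hunks split out __raw_write_seqcount_begin()/end(), make the seqlock_t write side use them under the spinlock, and give RT a read_seqbegin() that waits on the lock via spin_unlock_wait() so a preempted writer can be boosted instead of being starved by a spinning reader. The usual writer/reader pattern is unchanged for callers; a sketch (names are placeholders, not from the patch):

static DEFINE_SEQLOCK(example_seqlock);	/* placeholder, for illustration */
static u64 example_stamp;

static void update_stamp(u64 now)
{
	write_seqlock(&example_seqlock);   /* spin_lock() + __raw_write_seqcount_begin() */
	example_stamp = now;
	write_sequnlock(&example_seqlock); /* __raw_write_seqcount_end() + spin_unlock() */
}

static u64 read_stamp(void)
{
	unsigned int seq;
	u64 v;

	do {
		/* RT: waits on sl->lock if a write is in progress */
		seq = read_seqbegin(&example_seqlock);
		v = example_stamp;
	} while (read_seqretry(&example_seqlock, seq));

	return v;
}
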
9475 diff --git a/include/linux/signal.h b/include/linux/signal.h
9476 index b63f63eaa39c..295540fdfc72 100644
9477 --- a/include/linux/signal.h
9478 +++ b/include/linux/signal.h
9479 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
9480  }
9481  
9482  extern void flush_sigqueue(struct sigpending *queue);
9483 +extern void flush_task_sigqueue(struct task_struct *tsk);
9484  
9485  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9486  static inline int valid_signal(unsigned long sig)
9487 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
9488 index 32810f279f8e..0db6e31161f6 100644
9489 --- a/include/linux/skbuff.h
9490 +++ b/include/linux/skbuff.h
9491 @@ -284,6 +284,7 @@ struct sk_buff_head {
9492  
9493         __u32           qlen;
9494         spinlock_t      lock;
9495 +       raw_spinlock_t  raw_lock;
9496  };
9497  
9498  struct sk_buff;
9499 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
9500         __skb_queue_head_init(list);
9501  }
9502  
9503 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9504 +{
9505 +       raw_spin_lock_init(&list->raw_lock);
9506 +       __skb_queue_head_init(list);
9507 +}
9508 +
9509  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9510                 struct lock_class_key *class)
9511  {
9512 diff --git a/include/linux/smp.h b/include/linux/smp.h
9513 index 8e0cb7a0f836..b16ca967ad80 100644
9514 --- a/include/linux/smp.h
9515 +++ b/include/linux/smp.h
9516 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
9517  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
9518  #define put_cpu()              preempt_enable()
9519  
9520 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
9521 +#define put_cpu_light()                migrate_enable()
9522 +
9523  /*
9524   * Callback to arch code if there's nosmp or maxcpus=0 on the
9525   * boot command line:
9526 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
9527 index 47dd0cebd204..02928fa5499d 100644
9528 --- a/include/linux/spinlock.h
9529 +++ b/include/linux/spinlock.h
9530 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9531  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
9532  
9533  /* Include rwlock functions */
9534 -#include <linux/rwlock.h>
9535 +#ifdef CONFIG_PREEMPT_RT_FULL
9536 +# include <linux/rwlock_rt.h>
9537 +#else
9538 +# include <linux/rwlock.h>
9539 +#endif
9540  
9541  /*
9542   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9543 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9544  # include <linux/spinlock_api_up.h>
9545  #endif
9546  
9547 +#ifdef CONFIG_PREEMPT_RT_FULL
9548 +# include <linux/spinlock_rt.h>
9549 +#else /* PREEMPT_RT_FULL */
9550 +
9551  /*
9552   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9553   */
9554 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
9555         raw_spin_unlock(&lock->rlock);
9556  }
9557  
9558 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
9559 +{
9560 +       raw_spin_unlock(&lock->rlock);
9561 +       return 0;
9562 +}
9563 +
9564  static __always_inline void spin_unlock_bh(spinlock_t *lock)
9565  {
9566         raw_spin_unlock_bh(&lock->rlock);
9567 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
9568  #define atomic_dec_and_lock(atomic, lock) \
9569                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9570  
9571 +#endif /* !PREEMPT_RT_FULL */
9572 +
9573  #endif /* __LINUX_SPINLOCK_H */
9574 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
9575 index 5344268e6e62..043263f30e81 100644
9576 --- a/include/linux/spinlock_api_smp.h
9577 +++ b/include/linux/spinlock_api_smp.h
9578 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
9579         return 0;
9580  }
9581  
9582 -#include <linux/rwlock_api_smp.h>
9583 +#ifndef CONFIG_PREEMPT_RT_FULL
9584 +# include <linux/rwlock_api_smp.h>
9585 +#endif
9586  
9587  #endif /* __LINUX_SPINLOCK_API_SMP_H */
9588 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
9589 new file mode 100644
9590 index 000000000000..3534cff3dd08
9591 --- /dev/null
9592 +++ b/include/linux/spinlock_rt.h
9593 @@ -0,0 +1,164 @@
9594 +#ifndef __LINUX_SPINLOCK_RT_H
9595 +#define __LINUX_SPINLOCK_RT_H
9596 +
9597 +#ifndef __LINUX_SPINLOCK_H
9598 +#error Do not include directly. Use spinlock.h
9599 +#endif
9600 +
9601 +#include <linux/bug.h>
9602 +
9603 +extern void
9604 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9605 +
9606 +#define spin_lock_init(slock)                          \
9607 +do {                                                   \
9608 +       static struct lock_class_key __key;             \
9609 +                                                       \
9610 +       rt_mutex_init(&(slock)->lock);                  \
9611 +       __rt_spin_lock_init(slock, #slock, &__key);     \
9612 +} while (0)
9613 +
9614 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9615 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9616 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9617 +
9618 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9619 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9620 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9621 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9622 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
9623 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9624 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9625 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9626 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9627 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9628 +
9629 +/*
9630 + * lockdep-less calls, for derived types like rwlock:
9631 + * (for trylock they can use rt_mutex_trylock() directly.
9632 + */
9633 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9634 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9635 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9636 +
9637 +#define spin_lock(lock)                        rt_spin_lock(lock)
9638 +
9639 +#define spin_lock_bh(lock)                     \
9640 +       do {                                    \
9641 +               local_bh_disable();             \
9642 +               rt_spin_lock(lock);             \
9643 +       } while (0)
9644 +
9645 +#define spin_lock_irq(lock)            spin_lock(lock)
9646 +
9647 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
9648 +
9649 +#define spin_trylock(lock)                     \
9650 +({                                             \
9651 +       int __locked;                           \
9652 +       __locked = spin_do_trylock(lock);       \
9653 +       __locked;                               \
9654 +})
9655 +
9656 +#ifdef CONFIG_LOCKDEP
9657 +# define spin_lock_nested(lock, subclass)              \
9658 +       do {                                            \
9659 +               rt_spin_lock_nested(lock, subclass);    \
9660 +       } while (0)
9661 +
9662 +#define spin_lock_bh_nested(lock, subclass)            \
9663 +       do {                                            \
9664 +               local_bh_disable();                     \
9665 +               rt_spin_lock_nested(lock, subclass);    \
9666 +       } while (0)
9667 +
9668 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9669 +       do {                                             \
9670 +               typecheck(unsigned long, flags);         \
9671 +               flags = 0;                               \
9672 +               rt_spin_lock_nested(lock, subclass);     \
9673 +       } while (0)
9674 +#else
9675 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
9676 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
9677 +
9678 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9679 +       do {                                             \
9680 +               typecheck(unsigned long, flags);         \
9681 +               flags = 0;                               \
9682 +               spin_lock(lock);                         \
9683 +       } while (0)
9684 +#endif
9685 +
9686 +#define spin_lock_irqsave(lock, flags)                  \
9687 +       do {                                             \
9688 +               typecheck(unsigned long, flags);         \
9689 +               flags = 0;                               \
9690 +               spin_lock(lock);                         \
9691 +       } while (0)
9692 +
9693 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9694 +{
9695 +       unsigned long flags = 0;
9696 +#ifdef CONFIG_TRACE_IRQFLAGS
9697 +       flags = rt_spin_lock_trace_flags(lock);
9698 +#else
9699 +       spin_lock(lock); /* lock_local */
9700 +#endif
9701 +       return flags;
9702 +}
9703 +
9704 +/* FIXME: we need rt_spin_lock_nest_lock */
9705 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9706 +
9707 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
9708 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
9709 +
9710 +#define spin_unlock_bh(lock)                           \
9711 +       do {                                            \
9712 +               rt_spin_unlock(lock);                   \
9713 +               local_bh_enable();                      \
9714 +       } while (0)
9715 +
9716 +#define spin_unlock_irq(lock)          spin_unlock(lock)
9717 +
9718 +#define spin_unlock_irqrestore(lock, flags)            \
9719 +       do {                                            \
9720 +               typecheck(unsigned long, flags);        \
9721 +               (void) flags;                           \
9722 +               spin_unlock(lock);                      \
9723 +       } while (0)
9724 +
9725 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
9726 +#define spin_trylock_irq(lock) spin_trylock(lock)
9727 +
9728 +#define spin_trylock_irqsave(lock, flags)      \
9729 +       rt_spin_trylock_irqsave(lock, &(flags))
9730 +
9731 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
9732 +
9733 +#ifdef CONFIG_GENERIC_LOCKBREAK
9734 +# define spin_is_contended(lock)       ((lock)->break_lock)
9735 +#else
9736 +# define spin_is_contended(lock)       (((void)(lock), 0))
9737 +#endif
9738 +
9739 +static inline int spin_can_lock(spinlock_t *lock)
9740 +{
9741 +       return !rt_mutex_is_locked(&lock->lock);
9742 +}
9743 +
9744 +static inline int spin_is_locked(spinlock_t *lock)
9745 +{
9746 +       return rt_mutex_is_locked(&lock->lock);
9747 +}
9748 +
9749 +static inline void assert_spin_locked(spinlock_t *lock)
9750 +{
9751 +       BUG_ON(!spin_is_locked(lock));
9752 +}
9753 +
9754 +#define atomic_dec_and_lock(atomic, lock) \
9755 +       atomic_dec_and_spin_lock(atomic, lock)
9756 +
9757 +#endif
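
The RT spinlock API above keeps the familiar spin_lock_*() names but maps them onto a sleeping rt_mutex, so spin_lock_irqsave() no longer disables interrupts: it only typechecks and zeroes the flags argument before taking the lock. A minimal usage sketch, assuming an RT-patched tree; my_dev_lock, my_dev_count and my_dev_inc are illustrative names, not part of the patch:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_dev_lock);	/* rt_mutex based when PREEMPT_RT_FULL is set */
static int my_dev_count;

static void my_dev_inc(void)
{
	unsigned long flags;

	/* On RT this is rt_spin_lock(): flags is forced to 0 and IRQs
	 * stay enabled; on !RT it is the usual irqsave spinlock. */
	spin_lock_irqsave(&my_dev_lock, flags);
	my_dev_count++;
	spin_unlock_irqrestore(&my_dev_lock, flags);
}
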
9758 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
9759 index 73548eb13a5d..10bac715ea96 100644
9760 --- a/include/linux/spinlock_types.h
9761 +++ b/include/linux/spinlock_types.h
9762 @@ -9,80 +9,15 @@
9763   * Released under the General Public License (GPL).
9764   */
9765  
9766 -#if defined(CONFIG_SMP)
9767 -# include <asm/spinlock_types.h>
9768 +#include <linux/spinlock_types_raw.h>
9769 +
9770 +#ifndef CONFIG_PREEMPT_RT_FULL
9771 +# include <linux/spinlock_types_nort.h>
9772 +# include <linux/rwlock_types.h>
9773  #else
9774 -# include <linux/spinlock_types_up.h>
9775 +# include <linux/rtmutex.h>
9776 +# include <linux/spinlock_types_rt.h>
9777 +# include <linux/rwlock_types_rt.h>
9778  #endif
9779  
9780 -#include <linux/lockdep.h>
9781 -
9782 -typedef struct raw_spinlock {
9783 -       arch_spinlock_t raw_lock;
9784 -#ifdef CONFIG_GENERIC_LOCKBREAK
9785 -       unsigned int break_lock;
9786 -#endif
9787 -#ifdef CONFIG_DEBUG_SPINLOCK
9788 -       unsigned int magic, owner_cpu;
9789 -       void *owner;
9790 -#endif
9791 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9792 -       struct lockdep_map dep_map;
9793 -#endif
9794 -} raw_spinlock_t;
9795 -
9796 -#define SPINLOCK_MAGIC         0xdead4ead
9797 -
9798 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9799 -
9800 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9801 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
9802 -#else
9803 -# define SPIN_DEP_MAP_INIT(lockname)
9804 -#endif
9805 -
9806 -#ifdef CONFIG_DEBUG_SPINLOCK
9807 -# define SPIN_DEBUG_INIT(lockname)             \
9808 -       .magic = SPINLOCK_MAGIC,                \
9809 -       .owner_cpu = -1,                        \
9810 -       .owner = SPINLOCK_OWNER_INIT,
9811 -#else
9812 -# define SPIN_DEBUG_INIT(lockname)
9813 -#endif
9814 -
9815 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
9816 -       {                                       \
9817 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
9818 -       SPIN_DEBUG_INIT(lockname)               \
9819 -       SPIN_DEP_MAP_INIT(lockname) }
9820 -
9821 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
9822 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9823 -
9824 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9825 -
9826 -typedef struct spinlock {
9827 -       union {
9828 -               struct raw_spinlock rlock;
9829 -
9830 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9831 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9832 -               struct {
9833 -                       u8 __padding[LOCK_PADSIZE];
9834 -                       struct lockdep_map dep_map;
9835 -               };
9836 -#endif
9837 -       };
9838 -} spinlock_t;
9839 -
9840 -#define __SPIN_LOCK_INITIALIZER(lockname) \
9841 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9842 -
9843 -#define __SPIN_LOCK_UNLOCKED(lockname) \
9844 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9845 -
9846 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9847 -
9848 -#include <linux/rwlock_types.h>
9849 -
9850  #endif /* __LINUX_SPINLOCK_TYPES_H */
9851 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
9852 new file mode 100644
9853 index 000000000000..f1dac1fb1d6a
9854 --- /dev/null
9855 +++ b/include/linux/spinlock_types_nort.h
9856 @@ -0,0 +1,33 @@
9857 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
9858 +#define __LINUX_SPINLOCK_TYPES_NORT_H
9859 +
9860 +#ifndef __LINUX_SPINLOCK_TYPES_H
9861 +#error "Do not include directly. Include spinlock_types.h instead"
9862 +#endif
9863 +
9864 +/*
9865 + * The non-RT version maps spinlocks to raw_spinlocks
9866 + */
9867 +typedef struct spinlock {
9868 +       union {
9869 +               struct raw_spinlock rlock;
9870 +
9871 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9872 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9873 +               struct {
9874 +                       u8 __padding[LOCK_PADSIZE];
9875 +                       struct lockdep_map dep_map;
9876 +               };
9877 +#endif
9878 +       };
9879 +} spinlock_t;
9880 +
9881 +#define __SPIN_LOCK_INITIALIZER(lockname) \
9882 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9883 +
9884 +#define __SPIN_LOCK_UNLOCKED(lockname) \
9885 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9886 +
9887 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9888 +
9889 +#endif
9890 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
9891 new file mode 100644
9892 index 000000000000..edffc4d53fc9
9893 --- /dev/null
9894 +++ b/include/linux/spinlock_types_raw.h
9895 @@ -0,0 +1,56 @@
9896 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
9897 +#define __LINUX_SPINLOCK_TYPES_RAW_H
9898 +
9899 +#if defined(CONFIG_SMP)
9900 +# include <asm/spinlock_types.h>
9901 +#else
9902 +# include <linux/spinlock_types_up.h>
9903 +#endif
9904 +
9905 +#include <linux/lockdep.h>
9906 +
9907 +typedef struct raw_spinlock {
9908 +       arch_spinlock_t raw_lock;
9909 +#ifdef CONFIG_GENERIC_LOCKBREAK
9910 +       unsigned int break_lock;
9911 +#endif
9912 +#ifdef CONFIG_DEBUG_SPINLOCK
9913 +       unsigned int magic, owner_cpu;
9914 +       void *owner;
9915 +#endif
9916 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9917 +       struct lockdep_map dep_map;
9918 +#endif
9919 +} raw_spinlock_t;
9920 +
9921 +#define SPINLOCK_MAGIC         0xdead4ead
9922 +
9923 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9924 +
9925 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9926 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
9927 +#else
9928 +# define SPIN_DEP_MAP_INIT(lockname)
9929 +#endif
9930 +
9931 +#ifdef CONFIG_DEBUG_SPINLOCK
9932 +# define SPIN_DEBUG_INIT(lockname)             \
9933 +       .magic = SPINLOCK_MAGIC,                \
9934 +       .owner_cpu = -1,                        \
9935 +       .owner = SPINLOCK_OWNER_INIT,
9936 +#else
9937 +# define SPIN_DEBUG_INIT(lockname)
9938 +#endif
9939 +
9940 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
9941 +       {                                       \
9942 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
9943 +       SPIN_DEBUG_INIT(lockname)               \
9944 +       SPIN_DEP_MAP_INIT(lockname) }
9945 +
9946 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
9947 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9948 +
9949 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9950 +
9951 +#endif
9952 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
9953 new file mode 100644
9954 index 000000000000..3e3d8c5f7a9a
9955 --- /dev/null
9956 +++ b/include/linux/spinlock_types_rt.h
9957 @@ -0,0 +1,48 @@
9958 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
9959 +#define __LINUX_SPINLOCK_TYPES_RT_H
9960 +
9961 +#ifndef __LINUX_SPINLOCK_TYPES_H
9962 +#error "Do not include directly. Include spinlock_types.h instead"
9963 +#endif
9964 +
9965 +#include <linux/cache.h>
9966 +
9967 +/*
9968 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
9969 + */
9970 +typedef struct spinlock {
9971 +       struct rt_mutex         lock;
9972 +       unsigned int            break_lock;
9973 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9974 +       struct lockdep_map      dep_map;
9975 +#endif
9976 +} spinlock_t;
9977 +
9978 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9979 +# define __RT_SPIN_INITIALIZER(name) \
9980 +       { \
9981 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9982 +       .save_state = 1, \
9983 +       .file = __FILE__, \
9984 +       .line = __LINE__ , \
9985 +       }
9986 +#else
9987 +# define __RT_SPIN_INITIALIZER(name) \
9988 +       {                                                               \
9989 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
9990 +       .save_state = 1, \
9991 +       }
9992 +#endif
9993 +
9994 +/*
9995 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
9996 +*/
9997 +
9998 +#define __SPIN_LOCK_UNLOCKED(name)                     \
9999 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10000 +         SPIN_DEP_MAP_INIT(name) }
10001 +
10002 +#define DEFINE_SPINLOCK(name) \
10003 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10004 +
10005 +#endif
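
spinlock_types.h is split so that raw_spinlock_t (spinlock_types_raw.h) always stays a spinning arch lock, while spinlock_t becomes the rt_mutex based type above when PREEMPT_RT_FULL is selected. A short sketch of the resulting type choice, assuming an RT tree; the lock names and my_hw_poke are illustrative:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_hw_lock);	/* always spins, never sleeps */
static DEFINE_SPINLOCK(my_data_lock);	/* may sleep on PREEMPT_RT_FULL */

static void my_hw_poke(void)
{
	/* raw_spinlock_t is the right choice for code that must run
	 * with interrupts hard-disabled even on an RT kernel. */
	raw_spin_lock(&my_hw_lock);
	/* ... touch hardware registers ... */
	raw_spin_unlock(&my_hw_lock);
}
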
10006 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10007 index dc8eb63c6568..e793d3a257da 100644
10008 --- a/include/linux/srcu.h
10009 +++ b/include/linux/srcu.h
10010 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10011  
10012  void process_srcu(struct work_struct *work);
10013  
10014 -#define __SRCU_STRUCT_INIT(name)                                       \
10015 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10016         {                                                               \
10017                 .completed = -300,                                      \
10018 -               .per_cpu_ref = &name##_srcu_array,                      \
10019 +               .per_cpu_ref = &pcpu_name,                              \
10020                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10021                 .running = false,                                       \
10022                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10023 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10024   */
10025  #define __DEFINE_SRCU(name, is_static)                                 \
10026         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10027 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10028 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10029  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10030  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
10031  
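
The srcu.h change only reworks the initializer so that __DEFINE_SRCU() passes the per-CPU array name (name##_srcu_array) to __SRCU_STRUCT_INIT() explicitly; users of DEFINE_SRCU()/DEFINE_STATIC_SRCU() are unaffected. A sketch of the unchanged caller side; my_srcu and my_reader are illustrative names:

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);	/* still expands the per-CPU array for us */

static void my_reader(void)
{
	int idx = srcu_read_lock(&my_srcu);
	/* ... dereference data protected by my_srcu ... */
	srcu_read_unlock(&my_srcu, idx);
}
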
10032 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10033 index d9718378a8be..e81e6dc7dcb1 100644
10034 --- a/include/linux/suspend.h
10035 +++ b/include/linux/suspend.h
10036 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10037         void (*end)(void);
10038  };
10039  
10040 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10041 +extern bool pm_in_action;
10042 +#else
10043 +# define pm_in_action false
10044 +#endif
10045 +
10046  #ifdef CONFIG_SUSPEND
10047  /**
10048   * suspend_set_ops - set platform dependent suspend operations
10049 diff --git a/include/linux/swait.h b/include/linux/swait.h
10050 index c1f9c62a8a50..83f004a72320 100644
10051 --- a/include/linux/swait.h
10052 +++ b/include/linux/swait.h
10053 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10054  extern void swake_up(struct swait_queue_head *q);
10055  extern void swake_up_all(struct swait_queue_head *q);
10056  extern void swake_up_locked(struct swait_queue_head *q);
10057 +extern void swake_up_all_locked(struct swait_queue_head *q);
10058  
10059  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10060  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10061 diff --git a/include/linux/swap.h b/include/linux/swap.h
10062 index a56523cefb9b..c59a9f0d8ca1 100644
10063 --- a/include/linux/swap.h
10064 +++ b/include/linux/swap.h
10065 @@ -11,6 +11,7 @@
10066  #include <linux/fs.h>
10067  #include <linux/atomic.h>
10068  #include <linux/page-flags.h>
10069 +#include <linux/locallock.h>
10070  #include <asm/page.h>
10071  
10072  struct notifier_block;
10073 @@ -246,7 +247,8 @@ struct swap_info_struct {
10074  void *workingset_eviction(struct address_space *mapping, struct page *page);
10075  bool workingset_refault(void *shadow);
10076  void workingset_activation(struct page *page);
10077 -extern struct list_lru workingset_shadow_nodes;
10078 +extern struct list_lru __workingset_shadow_nodes;
10079 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10080  
10081  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10082  {
10083 @@ -291,6 +293,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10084  
10085  
10086  /* linux/mm/swap.c */
10087 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10088  extern void lru_cache_add(struct page *);
10089  extern void lru_cache_add_anon(struct page *page);
10090  extern void lru_cache_add_file(struct page *page);
10091 diff --git a/include/linux/swork.h b/include/linux/swork.h
10092 new file mode 100644
10093 index 000000000000..f175fa9a6016
10094 --- /dev/null
10095 +++ b/include/linux/swork.h
10096 @@ -0,0 +1,24 @@
10097 +#ifndef _LINUX_SWORK_H
10098 +#define _LINUX_SWORK_H
10099 +
10100 +#include <linux/list.h>
10101 +
10102 +struct swork_event {
10103 +       struct list_head item;
10104 +       unsigned long flags;
10105 +       void (*func)(struct swork_event *);
10106 +};
10107 +
10108 +static inline void INIT_SWORK(struct swork_event *event,
10109 +                             void (*func)(struct swork_event *))
10110 +{
10111 +       event->flags = 0;
10112 +       event->func = func;
10113 +}
10114 +
10115 +bool swork_queue(struct swork_event *sev);
10116 +
10117 +int swork_get(void);
10118 +void swork_put(void);
10119 +
10120 +#endif /* _LINUX_SWORK_H */
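
swork ("simple work") is a minimal deferral mechanism backed by a single kernel thread; the cgroup hunk later in this patch converts css_release() to it. A usage sketch against the API declared above; my_event, my_handler and my_init are illustrative names:

#include <linux/swork.h>

static struct swork_event my_event;

static void my_handler(struct swork_event *sev)
{
	/* runs in the swork kernel thread, in preemptible context */
}

static int my_init(void)
{
	int ret;

	ret = swork_get();		/* bring up the worker, balanced by swork_put() */
	if (ret)
		return ret;

	INIT_SWORK(&my_event, my_handler);
	swork_queue(&my_event);		/* hand the event to the worker thread */
	return 0;
}
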
10121 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10122 index 2873baf5372a..eb1a108f17ca 100644
10123 --- a/include/linux/thread_info.h
10124 +++ b/include/linux/thread_info.h
10125 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10126  #define test_thread_flag(flag) \
10127         test_ti_thread_flag(current_thread_info(), flag)
10128  
10129 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10130 +#ifdef CONFIG_PREEMPT_LAZY
10131 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10132 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10133 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10134 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
10135 +
10136 +#else
10137 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10138 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10139 +#define tif_need_resched_lazy()        0
10140 +#endif
10141  
10142  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10143  static inline int arch_within_stack_frames(const void * const stack,
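
With PREEMPT_LAZY, tif_need_resched() also reports the new TIF_NEED_RESCHED_LAZY flag, and the *_now()/*_lazy() helpers let callers distinguish the two. A rough sketch of how a return-to-user check could use them; this is illustrative only, not the patch's actual exit path:

#include <linux/thread_info.h>
#include <linux/sched.h>

static void my_exit_to_user_resched(void)
{
	if (tif_need_resched_now())
		schedule();	/* urgent request: RT task or !PREEMPT_LAZY */
	else if (tif_need_resched_lazy())
		schedule();	/* lazy request: still honoured before user return */
}
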
10144 diff --git a/include/linux/timer.h b/include/linux/timer.h
10145 index 51d601f192d4..83cea629efe1 100644
10146 --- a/include/linux/timer.h
10147 +++ b/include/linux/timer.h
10148 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10149  
10150  extern int try_to_del_timer_sync(struct timer_list *timer);
10151  
10152 -#ifdef CONFIG_SMP
10153 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10154    extern int del_timer_sync(struct timer_list *timer);
10155  #else
10156  # define del_timer_sync(t)             del_timer(t)
10157 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10158 index be007610ceb0..15154b13a53b 100644
10159 --- a/include/linux/trace_events.h
10160 +++ b/include/linux/trace_events.h
10161 @@ -56,6 +56,9 @@ struct trace_entry {
10162         unsigned char           flags;
10163         unsigned char           preempt_count;
10164         int                     pid;
10165 +       unsigned short          migrate_disable;
10166 +       unsigned short          padding;
10167 +       unsigned char           preempt_lazy_count;
10168  };
10169  
10170  #define TRACE_EVENT_TYPE_MAX                                           \
10171 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10172 index f30c187ed785..83bf0f798426 100644
10173 --- a/include/linux/uaccess.h
10174 +++ b/include/linux/uaccess.h
10175 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10176   */
10177  static inline void pagefault_disable(void)
10178  {
10179 +       migrate_disable();
10180         pagefault_disabled_inc();
10181         /*
10182          * make sure to have issued the store before a pagefault
10183 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10184          */
10185         barrier();
10186         pagefault_disabled_dec();
10187 +       migrate_enable();
10188  }
10189  
10190  /*
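
pagefault_disable() now also disables migration, so code between the pair stays on one CPU even though it may be preempted on RT. A sketch of the usual inatomic access pattern; my_peek_user is an illustrative name:

#include <linux/uaccess.h>

static unsigned long my_peek_user(const unsigned long __user *uaddr)
{
	unsigned long val = 0;

	pagefault_disable();			/* now also migrate_disable() */
	if (__copy_from_user_inatomic(&val, uaddr, sizeof(val)))
		val = 0;			/* access would have faulted */
	pagefault_enable();			/* now also migrate_enable() */

	return val;
}
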
10191 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10192 index 4a29c75b146e..0a294e950df8 100644
10193 --- a/include/linux/uprobes.h
10194 +++ b/include/linux/uprobes.h
10195 @@ -27,6 +27,7 @@
10196  #include <linux/errno.h>
10197  #include <linux/rbtree.h>
10198  #include <linux/types.h>
10199 +#include <linux/wait.h>
10200  
10201  struct vm_area_struct;
10202  struct mm_struct;
10203 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10204 index 613771909b6e..e28c5a43229d 100644
10205 --- a/include/linux/vmstat.h
10206 +++ b/include/linux/vmstat.h
10207 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10208   */
10209  static inline void __count_vm_event(enum vm_event_item item)
10210  {
10211 +       preempt_disable_rt();
10212         raw_cpu_inc(vm_event_states.event[item]);
10213 +       preempt_enable_rt();
10214  }
10215  
10216  static inline void count_vm_event(enum vm_event_item item)
10217 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10218  
10219  static inline void __count_vm_events(enum vm_event_item item, long delta)
10220  {
10221 +       preempt_disable_rt();
10222         raw_cpu_add(vm_event_states.event[item], delta);
10223 +       preempt_enable_rt();
10224  }
10225  
10226  static inline void count_vm_events(enum vm_event_item item, long delta)
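
preempt_disable_rt()/preempt_enable_rt(), provided elsewhere in this patch, compile away on !RT and become a real preempt_disable()/preempt_enable() pair on RT, so the raw_cpu_* updates above stay CPU-local even when the caller is preemptible. Callers are unchanged; an illustrative sketch:

#include <linux/vmstat.h>

static void my_account_fault(void)
{
	/* safe from preemptible context on RT as well */
	count_vm_event(PGFAULT);
}
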
10227 diff --git a/include/linux/wait.h b/include/linux/wait.h
10228 index 2408e8d5c05c..db50d6609195 100644
10229 --- a/include/linux/wait.h
10230 +++ b/include/linux/wait.h
10231 @@ -8,6 +8,7 @@
10232  #include <linux/spinlock.h>
10233  #include <asm/current.h>
10234  #include <uapi/linux/wait.h>
10235 +#include <linux/atomic.h>
10236  
10237  typedef struct __wait_queue wait_queue_t;
10238  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10239 diff --git a/include/net/dst.h b/include/net/dst.h
10240 index 6835d224d47b..55a5a9698f14 100644
10241 --- a/include/net/dst.h
10242 +++ b/include/net/dst.h
10243 @@ -446,7 +446,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10244  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10245                                    struct sk_buff *skb)
10246  {
10247 -       const struct hh_cache *hh;
10248 +       struct hh_cache *hh;
10249  
10250         if (dst->pending_confirm) {
10251                 unsigned long now = jiffies;
10252 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10253 index 231e121cc7d9..d125222b979d 100644
10254 --- a/include/net/gen_stats.h
10255 +++ b/include/net/gen_stats.h
10256 @@ -5,6 +5,7 @@
10257  #include <linux/socket.h>
10258  #include <linux/rtnetlink.h>
10259  #include <linux/pkt_sched.h>
10260 +#include <net/net_seq_lock.h>
10261  
10262  struct gnet_stats_basic_cpu {
10263         struct gnet_stats_basic_packed bstats;
10264 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10265                                  spinlock_t *lock, struct gnet_dump *d,
10266                                  int padattr);
10267  
10268 -int gnet_stats_copy_basic(const seqcount_t *running,
10269 +int gnet_stats_copy_basic(net_seqlock_t *running,
10270                           struct gnet_dump *d,
10271                           struct gnet_stats_basic_cpu __percpu *cpu,
10272                           struct gnet_stats_basic_packed *b);
10273 -void __gnet_stats_copy_basic(const seqcount_t *running,
10274 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10275                              struct gnet_stats_basic_packed *bstats,
10276                              struct gnet_stats_basic_cpu __percpu *cpu,
10277                              struct gnet_stats_basic_packed *b);
10278 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10279                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10280                       struct gnet_stats_rate_est64 *rate_est,
10281                       spinlock_t *stats_lock,
10282 -                     seqcount_t *running, struct nlattr *opt);
10283 +                     net_seqlock_t *running, struct nlattr *opt);
10284  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10285                         struct gnet_stats_rate_est64 *rate_est);
10286  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10287                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10288                           struct gnet_stats_rate_est64 *rate_est,
10289                           spinlock_t *stats_lock,
10290 -                         seqcount_t *running, struct nlattr *opt);
10291 +                         net_seqlock_t *running, struct nlattr *opt);
10292  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10293                           const struct gnet_stats_rate_est64 *rate_est);
10294  #endif
10295 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10296 index 8b683841e574..bf656008f6e7 100644
10297 --- a/include/net/neighbour.h
10298 +++ b/include/net/neighbour.h
10299 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10300  }
10301  #endif
10302  
10303 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10304 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10305  {
10306         unsigned int seq;
10307         int hh_len;
10308 @@ -501,7 +501,7 @@ struct neighbour_cb {
10309  
10310  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10311  
10312 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10313 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10314                                      const struct net_device *dev)
10315  {
10316         unsigned int seq;
10317 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10318 new file mode 100644
10319 index 000000000000..a7034298a82a
10320 --- /dev/null
10321 +++ b/include/net/net_seq_lock.h
10322 @@ -0,0 +1,15 @@
10323 +#ifndef __NET_NET_SEQ_LOCK_H__
10324 +#define __NET_NET_SEQ_LOCK_H__
10325 +
10326 +#ifdef CONFIG_PREEMPT_RT_BASE
10327 +# define net_seqlock_t                 seqlock_t
10328 +# define net_seq_begin(__r)            read_seqbegin(__r)
10329 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10330 +
10331 +#else
10332 +# define net_seqlock_t                 seqcount_t
10333 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10334 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10335 +#endif
10336 +
10337 +#endif
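
net_seqlock_t is a plain seqcount_t on !RT and a full seqlock_t on RT, where a writer that can be preempted must also exclude other writers. The reader side is identical either way thanks to the wrappers above; a sketch, with struct my_stats and my_read_bytes as illustrative names:

#include <linux/types.h>
#include <net/net_seq_lock.h>

struct my_stats {
	net_seqlock_t	seq;
	u64		bytes;
};

static u64 my_read_bytes(struct my_stats *s)
{
	unsigned int start;
	u64 bytes;

	do {
		start = net_seq_begin(&s->seq);		/* snapshot the sequence */
		bytes = s->bytes;
	} while (net_seq_retry(&s->seq, start));	/* retry if a writer ran */

	return bytes;
}
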
10338 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10339 index 7adf4386ac8f..d3fd5c357268 100644
10340 --- a/include/net/netns/ipv4.h
10341 +++ b/include/net/netns/ipv4.h
10342 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10343  
10344         int sysctl_icmp_echo_ignore_all;
10345         int sysctl_icmp_echo_ignore_broadcasts;
10346 +       int sysctl_icmp_echo_sysrq;
10347         int sysctl_icmp_ignore_bogus_error_responses;
10348         int sysctl_icmp_ratelimit;
10349         int sysctl_icmp_ratemask;
10350 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10351 index e6aa0a249672..b57736f2a8a3 100644
10352 --- a/include/net/sch_generic.h
10353 +++ b/include/net/sch_generic.h
10354 @@ -10,6 +10,7 @@
10355  #include <linux/dynamic_queue_limits.h>
10356  #include <net/gen_stats.h>
10357  #include <net/rtnetlink.h>
10358 +#include <net/net_seq_lock.h>
10359  
10360  struct Qdisc_ops;
10361  struct qdisc_walker;
10362 @@ -86,7 +87,7 @@ struct Qdisc {
10363         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10364         struct qdisc_skb_head   q;
10365         struct gnet_stats_basic_packed bstats;
10366 -       seqcount_t              running;
10367 +       net_seqlock_t           running;
10368         struct gnet_stats_queue qstats;
10369         unsigned long           state;
10370         struct Qdisc            *next_sched;
10371 @@ -98,13 +99,22 @@ struct Qdisc {
10372         spinlock_t              busylock ____cacheline_aligned_in_smp;
10373  };
10374  
10375 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10376 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10377  {
10378 +#ifdef CONFIG_PREEMPT_RT_BASE
10379 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10380 +#else
10381         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10382 +#endif
10383  }
10384  
10385  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10386  {
10387 +#ifdef CONFIG_PREEMPT_RT_BASE
10388 +       if (try_write_seqlock(&qdisc->running))
10389 +               return true;
10390 +       return false;
10391 +#else
10392         if (qdisc_is_running(qdisc))
10393                 return false;
10394         /* Variant of write_seqcount_begin() telling lockdep a trylock
10395 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10396         raw_write_seqcount_begin(&qdisc->running);
10397         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10398         return true;
10399 +#endif
10400  }
10401  
10402  static inline void qdisc_run_end(struct Qdisc *qdisc)
10403  {
10404 +#ifdef CONFIG_PREEMPT_RT_BASE
10405 +       write_sequnlock(&qdisc->running);
10406 +#else
10407         write_seqcount_end(&qdisc->running);
10408 +#endif
10409  }
10410  
10411  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10412 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10413         return qdisc_lock(root);
10414  }
10415  
10416 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10417 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10418  {
10419         struct Qdisc *root = qdisc_root_sleeping(qdisc);
10420  
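
On RT the Qdisc::running seqcount becomes a seqlock, so qdisc_run_begin() can genuinely exclude a preempted writer via try_write_seqlock() instead of only flagging that one is active. The caller pattern stays the trylock style already used in net/core/dev.c, roughly as in this sketch (my_try_run is an illustrative name):

#include <net/sch_generic.h>

static void my_try_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q)) {	/* false if another context is running it */
		/* ... dequeue and transmit packets ... */
		qdisc_run_end(q);
	}
}
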
10421 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10422 new file mode 100644
10423 index 000000000000..f7710de1b1f3
10424 --- /dev/null
10425 +++ b/include/trace/events/hist.h
10426 @@ -0,0 +1,73 @@
10427 +#undef TRACE_SYSTEM
10428 +#define TRACE_SYSTEM hist
10429 +
10430 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10431 +#define _TRACE_HIST_H
10432 +
10433 +#include "latency_hist.h"
10434 +#include <linux/tracepoint.h>
10435 +
10436 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10437 +#define trace_preemptirqsoff_hist(a, b)
10438 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10439 +#else
10440 +TRACE_EVENT(preemptirqsoff_hist,
10441 +
10442 +       TP_PROTO(int reason, int starthist),
10443 +
10444 +       TP_ARGS(reason, starthist),
10445 +
10446 +       TP_STRUCT__entry(
10447 +               __field(int,    reason)
10448 +               __field(int,    starthist)
10449 +       ),
10450 +
10451 +       TP_fast_assign(
10452 +               __entry->reason         = reason;
10453 +               __entry->starthist      = starthist;
10454 +       ),
10455 +
10456 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10457 +                 __entry->starthist ? "start" : "stop")
10458 +);
10459 +#endif
10460 +
10461 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10462 +#define trace_hrtimer_interrupt(a, b, c, d)
10463 +#else
10464 +TRACE_EVENT(hrtimer_interrupt,
10465 +
10466 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10467 +               struct task_struct *task),
10468 +
10469 +       TP_ARGS(cpu, offset, curr, task),
10470 +
10471 +       TP_STRUCT__entry(
10472 +               __field(int,            cpu)
10473 +               __field(long long,      offset)
10474 +               __array(char,           ccomm,  TASK_COMM_LEN)
10475 +               __field(int,            cprio)
10476 +               __array(char,           tcomm,  TASK_COMM_LEN)
10477 +               __field(int,            tprio)
10478 +       ),
10479 +
10480 +       TP_fast_assign(
10481 +               __entry->cpu    = cpu;
10482 +               __entry->offset = offset;
10483 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10484 +               __entry->cprio  = curr->prio;
10485 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10486 +                       task != NULL ? TASK_COMM_LEN : 7);
10487 +               __entry->tprio  = task != NULL ? task->prio : -1;
10488 +       ),
10489 +
10490 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10491 +               __entry->cpu, __entry->offset, __entry->ccomm,
10492 +               __entry->cprio, __entry->tcomm, __entry->tprio)
10493 +);
10494 +#endif
10495 +
10496 +#endif /* _TRACE_HIST_H */
10497 +
10498 +/* This part must be outside protection */
10499 +#include <trace/define_trace.h>
10500 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
10501 new file mode 100644
10502 index 000000000000..d3f2fbd560b1
10503 --- /dev/null
10504 +++ b/include/trace/events/latency_hist.h
10505 @@ -0,0 +1,29 @@
10506 +#ifndef _LATENCY_HIST_H
10507 +#define _LATENCY_HIST_H
10508 +
10509 +enum hist_action {
10510 +       IRQS_ON,
10511 +       PREEMPT_ON,
10512 +       TRACE_STOP,
10513 +       IRQS_OFF,
10514 +       PREEMPT_OFF,
10515 +       TRACE_START,
10516 +};
10517 +
10518 +static char *actions[] = {
10519 +       "IRQS_ON",
10520 +       "PREEMPT_ON",
10521 +       "TRACE_STOP",
10522 +       "IRQS_OFF",
10523 +       "PREEMPT_OFF",
10524 +       "TRACE_START",
10525 +};
10526 +
10527 +static inline char *getaction(int action)
10528 +{
10529 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10530 +               return actions[action];
10531 +       return "unknown";
10532 +}
10533 +
10534 +#endif /* _LATENCY_HIST_H */
10535 diff --git a/init/Kconfig b/init/Kconfig
10536 index 34407f15e6d3..2ce33a32e65d 100644
10537 --- a/init/Kconfig
10538 +++ b/init/Kconfig
10539 @@ -506,7 +506,7 @@ config TINY_RCU
10540  
10541  config RCU_EXPERT
10542         bool "Make expert-level adjustments to RCU configuration"
10543 -       default n
10544 +       default y if PREEMPT_RT_FULL
10545         help
10546           This option needs to be enabled if you wish to make
10547           expert-level adjustments to RCU configuration.  By default,
10548 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
10549  
10550  config RCU_FAST_NO_HZ
10551         bool "Accelerate last non-dyntick-idle CPU's grace periods"
10552 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10553 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10554         default n
10555         help
10556           This option permits CPUs to enter dynticks-idle state even if
10557 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
10558  config RCU_BOOST
10559         bool "Enable RCU priority boosting"
10560         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10561 -       default n
10562 +       default y if PREEMPT_RT_FULL
10563         help
10564           This option boosts the priority of preempted RCU readers that
10565           block the current preemptible RCU grace period for too long.
10566 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
10567  
10568  endchoice
10569  
10570 -config RCU_EXPEDITE_BOOT
10571 -       bool
10572 -       default n
10573 -       help
10574 -         This option enables expedited grace periods at boot time,
10575 -         as if rcu_expedite_gp() had been invoked early in boot.
10576 -         The corresponding rcu_unexpedite_gp() is invoked from
10577 -         rcu_end_inkernel_boot(), which is intended to be invoked
10578 -         at the end of the kernel-only boot sequence, just before
10579 -         init is exec'ed.
10580 -
10581 -         Accept the default if unsure.
10582 -
10583  endmenu # "RCU Subsystem"
10584  
10585  config BUILD_BIN2C
10586 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
10587  config RT_GROUP_SCHED
10588         bool "Group scheduling for SCHED_RR/FIFO"
10589         depends on CGROUP_SCHED
10590 +       depends on !PREEMPT_RT_FULL
10591         default n
10592         help
10593           This feature lets you explicitly allocate real CPU bandwidth
10594 @@ -1772,6 +1760,7 @@ choice
10595  
10596  config SLAB
10597         bool "SLAB"
10598 +       depends on !PREEMPT_RT_FULL
10599         select HAVE_HARDENED_USERCOPY_ALLOCATOR
10600         help
10601           The regular slab allocator that is established and known to work
10602 @@ -1792,6 +1781,7 @@ config SLUB
10603  config SLOB
10604         depends on EXPERT
10605         bool "SLOB (Simple Allocator)"
10606 +       depends on !PREEMPT_RT_FULL
10607         help
10608            SLOB replaces the stock allocator with a drastically simpler
10609            allocator. SLOB is generally more space efficient but
10610 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
10611  
10612  config SLUB_CPU_PARTIAL
10613         default y
10614 -       depends on SLUB && SMP
10615 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
10616         bool "SLUB per cpu partial cache"
10617         help
10618           Per cpu partial caches accellerate objects allocation and freeing
10619 diff --git a/init/Makefile b/init/Makefile
10620 index c4fb45525d08..821190dfaa75 100644
10621 --- a/init/Makefile
10622 +++ b/init/Makefile
10623 @@ -35,4 +35,4 @@ $(obj)/version.o: include/generated/compile.h
10624  include/generated/compile.h: FORCE
10625         @$($(quiet)chk_compile.h)
10626         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10627 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10628 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10629 diff --git a/init/main.c b/init/main.c
10630 index 2858be732f6d..3c97c3c91d88 100644
10631 --- a/init/main.c
10632 +++ b/init/main.c
10633 @@ -507,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
10634         setup_command_line(command_line);
10635         setup_nr_cpu_ids();
10636         setup_per_cpu_areas();
10637 +       softirq_early_init();
10638         boot_cpu_state_init();
10639         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10640  
10641 diff --git a/ipc/sem.c b/ipc/sem.c
10642 index 10b94bc59d4a..b8360eaacc7a 100644
10643 --- a/ipc/sem.c
10644 +++ b/ipc/sem.c
10645 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
10646  static void wake_up_sem_queue_prepare(struct list_head *pt,
10647                                 struct sem_queue *q, int error)
10648  {
10649 +#ifdef CONFIG_PREEMPT_RT_BASE
10650 +       struct task_struct *p = q->sleeper;
10651 +       get_task_struct(p);
10652 +       q->status = error;
10653 +       wake_up_process(p);
10654 +       put_task_struct(p);
10655 +#else
10656         if (list_empty(pt)) {
10657                 /*
10658                  * Hold preempt off so that we don't get preempted and have the
10659 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10660         q->pid = error;
10661  
10662         list_add_tail(&q->list, pt);
10663 +#endif
10664  }
10665  
10666  /**
10667 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10668   */
10669  static void wake_up_sem_queue_do(struct list_head *pt)
10670  {
10671 +#ifndef CONFIG_PREEMPT_RT_BASE
10672         struct sem_queue *q, *t;
10673         int did_something;
10674  
10675 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
10676         }
10677         if (did_something)
10678                 preempt_enable();
10679 +#endif
10680  }
10681  
10682  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10683 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
10684 index ebdb0043203a..b9e6aa7e5aa6 100644
10685 --- a/kernel/Kconfig.locks
10686 +++ b/kernel/Kconfig.locks
10687 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
10688  
10689  config MUTEX_SPIN_ON_OWNER
10690         def_bool y
10691 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
10692 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10693  
10694  config RWSEM_SPIN_ON_OWNER
10695         def_bool y
10696 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
10697 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10698  
10699  config LOCK_SPIN_ON_OWNER
10700         def_bool y
10701 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
10702 index 3f9c97419f02..11dbe26a8279 100644
10703 --- a/kernel/Kconfig.preempt
10704 +++ b/kernel/Kconfig.preempt
10705 @@ -1,3 +1,16 @@
10706 +config PREEMPT
10707 +       bool
10708 +       select PREEMPT_COUNT
10709 +
10710 +config PREEMPT_RT_BASE
10711 +       bool
10712 +       select PREEMPT
10713 +
10714 +config HAVE_PREEMPT_LAZY
10715 +       bool
10716 +
10717 +config PREEMPT_LAZY
10718 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
10719  
10720  choice
10721         prompt "Preemption Model"
10722 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
10723  
10724           Select this if you are building a kernel for a desktop system.
10725  
10726 -config PREEMPT
10727 +config PREEMPT__LL
10728         bool "Preemptible Kernel (Low-Latency Desktop)"
10729 -       select PREEMPT_COUNT
10730 +       select PREEMPT
10731         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
10732         help
10733           This option reduces the latency of the kernel by making
10734 @@ -52,6 +65,22 @@ config PREEMPT
10735           embedded system with latency requirements in the milliseconds
10736           range.
10737  
10738 +config PREEMPT_RTB
10739 +       bool "Preemptible Kernel (Basic RT)"
10740 +       select PREEMPT_RT_BASE
10741 +       help
10742 +         This option is basically the same as (Low-Latency Desktop) but
10743 +         enables changes which are preliminary for the full preemptible
10744 +         RT kernel.
10745 +
10746 +config PREEMPT_RT_FULL
10747 +       bool "Fully Preemptible Kernel (RT)"
10748 +       depends on IRQ_FORCED_THREADING
10749 +       select PREEMPT_RT_BASE
10750 +       select PREEMPT_RCU
10751 +       help
10752 +         All and everything: the kernel becomes fully preemptible (real-time).
10753 +
10754  endchoice
10755  
10756  config PREEMPT_COUNT
10757 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
10758 index 85bc9beb046d..3b8da75ba2e0 100644
10759 --- a/kernel/cgroup.c
10760 +++ b/kernel/cgroup.c
10761 @@ -5040,10 +5040,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
10762         queue_work(cgroup_destroy_wq, &css->destroy_work);
10763  }
10764  
10765 -static void css_release_work_fn(struct work_struct *work)
10766 +static void css_release_work_fn(struct swork_event *sev)
10767  {
10768         struct cgroup_subsys_state *css =
10769 -               container_of(work, struct cgroup_subsys_state, destroy_work);
10770 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
10771         struct cgroup_subsys *ss = css->ss;
10772         struct cgroup *cgrp = css->cgroup;
10773  
10774 @@ -5086,8 +5086,8 @@ static void css_release(struct percpu_ref *ref)
10775         struct cgroup_subsys_state *css =
10776                 container_of(ref, struct cgroup_subsys_state, refcnt);
10777  
10778 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
10779 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
10780 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10781 +       swork_queue(&css->destroy_swork);
10782  }
10783  
10784  static void init_and_link_css(struct cgroup_subsys_state *css,
10785 @@ -5742,6 +5742,7 @@ static int __init cgroup_wq_init(void)
10786          */
10787         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10788         BUG_ON(!cgroup_destroy_wq);
10789 +       BUG_ON(swork_get());
10790  
10791         /*
10792          * Used to destroy pidlists and separate to serve as flush domain.
10793 diff --git a/kernel/cpu.c b/kernel/cpu.c
10794 index 217fd2e7f435..69444f1bc924 100644
10795 --- a/kernel/cpu.c
10796 +++ b/kernel/cpu.c
10797 @@ -239,6 +239,289 @@ static struct {
10798  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
10799  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
10800  
10801 +/**
10802 + * hotplug_pcp - per cpu hotplug descriptor
10803 + * @unplug:    set when pin_current_cpu() needs to sync tasks
10804 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
10805 + * @refcount:  counter of tasks in pinned sections
10806 + * @grab_lock: set when the tasks entering pinned sections should wait
10807 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
10808 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
10809 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
10810 + *
10811 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
10812 + * is used as a flag and still exists after @sync_tsk has exited and
10813 + * @sync_tsk set to NULL.
10814 + */
10815 +struct hotplug_pcp {
10816 +       struct task_struct *unplug;
10817 +       struct task_struct *sync_tsk;
10818 +       int refcount;
10819 +       int grab_lock;
10820 +       struct completion synced;
10821 +       struct completion unplug_wait;
10822 +#ifdef CONFIG_PREEMPT_RT_FULL
10823 +       /*
10824 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
10825 +        * the task, otherwise the mutex will cause the task to fail
10826 +        * to sleep when required. (Because it's called from migrate_disable())
10827 +        *
10828 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
10829 +        * state.
10830 +        */
10831 +       spinlock_t lock;
10832 +#else
10833 +       struct mutex mutex;
10834 +#endif
10835 +       int mutex_init;
10836 +};
10837 +
10838 +#ifdef CONFIG_PREEMPT_RT_FULL
10839 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
10840 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
10841 +#else
10842 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
10843 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
10844 +#endif
10845 +
10846 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
10847 +
10848 +/**
10849 + * pin_current_cpu - Prevent the current cpu from being unplugged
10850 + *
10851 + * Lightweight version of get_online_cpus() to prevent cpu from being
10852 + * unplugged when code runs in a migration disabled region.
10853 + *
10854 + * Must be called with preemption disabled (preempt_count = 1)!
10855 + */
10856 +void pin_current_cpu(void)
10857 +{
10858 +       struct hotplug_pcp *hp;
10859 +       int force = 0;
10860 +
10861 +retry:
10862 +       hp = this_cpu_ptr(&hotplug_pcp);
10863 +
10864 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
10865 +           hp->unplug == current) {
10866 +               hp->refcount++;
10867 +               return;
10868 +       }
10869 +       if (hp->grab_lock) {
10870 +               preempt_enable();
10871 +               hotplug_lock(hp);
10872 +               hotplug_unlock(hp);
10873 +       } else {
10874 +               preempt_enable();
10875 +               /*
10876 +                * Try to push this task off of this CPU.
10877 +                */
10878 +               if (!migrate_me()) {
10879 +                       preempt_disable();
10880 +                       hp = this_cpu_ptr(&hotplug_pcp);
10881 +                       if (!hp->grab_lock) {
10882 +                               /*
10883 +                                * Just let it continue; it's already pinned
10884 +                                * or about to sleep.
10885 +                                */
10886 +                               force = 1;
10887 +                               goto retry;
10888 +                       }
10889 +                       preempt_enable();
10890 +               }
10891 +       }
10892 +       preempt_disable();
10893 +       goto retry;
10894 +}
10895 +
10896 +/**
10897 + * unpin_current_cpu - Allow unplug of current cpu
10898 + *
10899 + * Must be called with preemption or interrupts disabled!
10900 + */
10901 +void unpin_current_cpu(void)
10902 +{
10903 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
10904 +
10905 +       WARN_ON(hp->refcount <= 0);
10906 +
10907 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
10908 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
10909 +               wake_up_process(hp->unplug);
10910 +}
10911 +
10912 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
10913 +{
10914 +       set_current_state(TASK_UNINTERRUPTIBLE);
10915 +       while (hp->refcount) {
10916 +               schedule_preempt_disabled();
10917 +               set_current_state(TASK_UNINTERRUPTIBLE);
10918 +       }
10919 +}
10920 +
10921 +static int sync_unplug_thread(void *data)
10922 +{
10923 +       struct hotplug_pcp *hp = data;
10924 +
10925 +       wait_for_completion(&hp->unplug_wait);
10926 +       preempt_disable();
10927 +       hp->unplug = current;
10928 +       wait_for_pinned_cpus(hp);
10929 +
10930 +       /*
10931 +        * This thread will synchronize the cpu_down() with threads
10932 +        * that have pinned the CPU. When the pinned CPU count reaches
10933 +        * zero, we inform the cpu_down code to continue to the next step.
10934 +        */
10935 +       set_current_state(TASK_UNINTERRUPTIBLE);
10936 +       preempt_enable();
10937 +       complete(&hp->synced);
10938 +
10939 +       /*
10940 +        * If all succeeds, the next step will need tasks to wait till
10941 +        * the CPU is offline before continuing. To do this, the grab_lock
10942 +        * is set and tasks going into pin_current_cpu() will block on the
10943 +        * mutex. But we still need to wait for those that are already in
10944 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
10945 +        * will kick this thread out.
10946 +        */
10947 +       while (!hp->grab_lock && !kthread_should_stop()) {
10948 +               schedule();
10949 +               set_current_state(TASK_UNINTERRUPTIBLE);
10950 +       }
10951 +
10952 +       /* Make sure grab_lock is seen before we see a stale completion */
10953 +       smp_mb();
10954 +
10955 +       /*
10956 +        * Now just before cpu_down() enters stop machine, we need to make
10957 +        * sure all tasks that are in pinned CPU sections are out, and new
10958 +        * tasks will now grab the lock, keeping them from entering pinned
10959 +        * CPU sections.
10960 +        */
10961 +       if (!kthread_should_stop()) {
10962 +               preempt_disable();
10963 +               wait_for_pinned_cpus(hp);
10964 +               preempt_enable();
10965 +               complete(&hp->synced);
10966 +       }
10967 +
10968 +       set_current_state(TASK_UNINTERRUPTIBLE);
10969 +       while (!kthread_should_stop()) {
10970 +               schedule();
10971 +               set_current_state(TASK_UNINTERRUPTIBLE);
10972 +       }
10973 +       set_current_state(TASK_RUNNING);
10974 +
10975 +       /*
10976 +        * Force this thread off this CPU as it's going down and
10977 +        * we don't want any more work on this CPU.
10978 +        */
10979 +       current->flags &= ~PF_NO_SETAFFINITY;
10980 +       set_cpus_allowed_ptr(current, cpu_present_mask);
10981 +       migrate_me();
10982 +       return 0;
10983 +}
10984 +
10985 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
10986 +{
10987 +       wake_up_process(hp->sync_tsk);
10988 +       wait_for_completion(&hp->synced);
10989 +}
10990 +
10991 +static void __cpu_unplug_wait(unsigned int cpu)
10992 +{
10993 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10994 +
10995 +       complete(&hp->unplug_wait);
10996 +       wait_for_completion(&hp->synced);
10997 +}
10998 +
10999 +/*
11000 + * Start the sync_unplug_thread on the target cpu and wait for it to
11001 + * complete.
11002 + */
11003 +static int cpu_unplug_begin(unsigned int cpu)
11004 +{
11005 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11006 +       int err;
11007 +
11008 +       /* Protected by cpu_hotplug.lock */
11009 +       if (!hp->mutex_init) {
11010 +#ifdef CONFIG_PREEMPT_RT_FULL
11011 +               spin_lock_init(&hp->lock);
11012 +#else
11013 +               mutex_init(&hp->mutex);
11014 +#endif
11015 +               hp->mutex_init = 1;
11016 +       }
11017 +
11018 +       /* Inform the scheduler to migrate tasks off this CPU */
11019 +       tell_sched_cpu_down_begin(cpu);
11020 +
11021 +       init_completion(&hp->synced);
11022 +       init_completion(&hp->unplug_wait);
11023 +
11024 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11025 +       if (IS_ERR(hp->sync_tsk)) {
11026 +               err = PTR_ERR(hp->sync_tsk);
11027 +               hp->sync_tsk = NULL;
11028 +               return err;
11029 +       }
11030 +       kthread_bind(hp->sync_tsk, cpu);
11031 +
11032 +       /*
11033 +        * Wait for tasks to get out of the pinned sections,
11034 +        * it's still OK if new tasks enter. Some CPU notifiers will
11035 +        * wait for tasks that are going to enter these sections and
11036 +        * we must not have them block.
11037 +        */
11038 +       wake_up_process(hp->sync_tsk);
11039 +       return 0;
11040 +}
11041 +
11042 +static void cpu_unplug_sync(unsigned int cpu)
11043 +{
11044 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11045 +
11046 +       init_completion(&hp->synced);
11047 +       /* The completion needs to be initialized before setting grab_lock */
11048 +       smp_wmb();
11049 +
11050 +       /* Grab the mutex before setting grab_lock */
11051 +       hotplug_lock(hp);
11052 +       hp->grab_lock = 1;
11053 +
11054 +       /*
11055 +        * The CPU notifiers have been completed.
11056 +        * Wait for tasks to get out of pinned CPU sections and have new
11057 +        * tasks block until the CPU is completely down.
11058 +        */
11059 +       __cpu_unplug_sync(hp);
11060 +
11061 +       /* All done with the sync thread */
11062 +       kthread_stop(hp->sync_tsk);
11063 +       hp->sync_tsk = NULL;
11064 +}
11065 +
11066 +static void cpu_unplug_done(unsigned int cpu)
11067 +{
11068 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11069 +
11070 +       hp->unplug = NULL;
11071 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11072 +       smp_wmb();
11073 +
11074 +       if (hp->sync_tsk)
11075 +               kthread_stop(hp->sync_tsk);
11076 +
11077 +       if (hp->grab_lock) {
11078 +               hotplug_unlock(hp);
11079 +               /* protected by cpu_hotplug.lock */
11080 +               hp->grab_lock = 0;
11081 +       }
11082 +       tell_sched_cpu_down_done(cpu);
11083 +}
11084  
11085  void get_online_cpus(void)
11086  {
11087 @@ -789,10 +1072,14 @@ static int takedown_cpu(unsigned int cpu)
11088         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11089         int err;
11090  
11091 +       __cpu_unplug_wait(cpu);
11092         /* Park the smpboot threads */
11093         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11094         smpboot_park_threads(cpu);
11095  
11096 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11097 +       cpu_unplug_sync(cpu);
11098 +
11099         /*
11100          * Prevent irq alloc/free while the dying cpu reorganizes the
11101          * interrupt affinities.
11102 @@ -877,6 +1164,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11103         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11104         int prev_state, ret = 0;
11105         bool hasdied = false;
11106 +       int mycpu;
11107 +       cpumask_var_t cpumask;
11108 +       cpumask_var_t cpumask_org;
11109  
11110         if (num_online_cpus() == 1)
11111                 return -EBUSY;
11112 @@ -884,7 +1174,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11113         if (!cpu_present(cpu))
11114                 return -EINVAL;
11115  
11116 +       /* Move the downtaker off the unplug cpu */
11117 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11118 +               return -ENOMEM;
11119 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11120 +               free_cpumask_var(cpumask);
11121 +               return -ENOMEM;
11122 +       }
11123 +
11124 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11125 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11126 +       set_cpus_allowed_ptr(current, cpumask);
11127 +       free_cpumask_var(cpumask);
11128 +       migrate_disable();
11129 +       mycpu = smp_processor_id();
11130 +       if (mycpu == cpu) {
11131 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11132 +               migrate_enable();
11133 +               ret = -EBUSY;
11134 +               goto restore_cpus;
11135 +       }
11136 +
11137 +       migrate_enable();
11138         cpu_hotplug_begin();
11139 +       ret = cpu_unplug_begin(cpu);
11140 +       if (ret) {
11141 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11142 +               goto out_cancel;
11143 +       }
11144  
11145         cpuhp_tasks_frozen = tasks_frozen;
11146  
11147 @@ -923,10 +1240,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11148  
11149         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11150  out:
11151 +       cpu_unplug_done(cpu);
11152 +out_cancel:
11153         cpu_hotplug_done();
11154         /* This post dead nonsense must die */
11155         if (!ret && hasdied)
11156                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11157 +restore_cpus:
11158 +       set_cpus_allowed_ptr(current, cpumask_org);
11159 +       free_cpumask_var(cpumask_org);
11160         return ret;
11161  }
11162  
11163 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
11164 index fc1ef736253c..83c666537a7a 100644
11165 --- a/kernel/debug/kdb/kdb_io.c
11166 +++ b/kernel/debug/kdb/kdb_io.c
11167 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11168         int linecount;
11169         int colcount;
11170         int logging, saved_loglevel = 0;
11171 -       int saved_trap_printk;
11172         int got_printf_lock = 0;
11173         int retlen = 0;
11174         int fnd, len;
11175 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11176         unsigned long uninitialized_var(flags);
11177  
11178         preempt_disable();
11179 -       saved_trap_printk = kdb_trap_printk;
11180 -       kdb_trap_printk = 0;
11181  
11182         /* Serialize kdb_printf if multiple cpus try to write at once.
11183          * But if any cpu goes recursive in kdb, just print the output,
11184 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11185         } else {
11186                 __release(kdb_printf_lock);
11187         }
11188 -       kdb_trap_printk = saved_trap_printk;
11189         preempt_enable();
11190         return retlen;
11191  }
11192 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
11193         va_list ap;
11194         int r;
11195  
11196 +       kdb_trap_printk++;
11197         va_start(ap, fmt);
11198         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
11199         va_end(ap);
11200 +       kdb_trap_printk--;
11201  
11202         return r;
11203  }
11204 diff --git a/kernel/events/core.c b/kernel/events/core.c
11205 index 02c8421f8c01..3748cb7b2d6e 100644
11206 --- a/kernel/events/core.c
11207 +++ b/kernel/events/core.c
11208 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
11209         raw_spin_lock_init(&cpuctx->hrtimer_lock);
11210         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
11211         timer->function = perf_mux_hrtimer_handler;
11212 +       timer->irqsafe = 1;
11213  }
11214  
11215  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
11216 @@ -8335,6 +8336,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
11217  
11218         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11219         hwc->hrtimer.function = perf_swevent_hrtimer;
11220 +       hwc->hrtimer.irqsafe = 1;
11221  
11222         /*
11223          * Since hrtimers have a fixed rate, we can do a static freq->period
11224 diff --git a/kernel/exit.c b/kernel/exit.c
11225 index 3076f3089919..fb2ebcf3ca7c 100644
11226 --- a/kernel/exit.c
11227 +++ b/kernel/exit.c
11228 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
11229          * Do this under ->siglock, we can race with another thread
11230          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
11231          */
11232 -       flush_sigqueue(&tsk->pending);
11233 +       flush_task_sigqueue(tsk);
11234         tsk->sighand = NULL;
11235         spin_unlock(&sighand->siglock);
11236  
11237 diff --git a/kernel/fork.c b/kernel/fork.c
11238 index ba8a01564985..47784f8aed37 100644
11239 --- a/kernel/fork.c
11240 +++ b/kernel/fork.c
11241 @@ -76,6 +76,7 @@
11242  #include <linux/compiler.h>
11243  #include <linux/sysctl.h>
11244  #include <linux/kcov.h>
11245 +#include <linux/kprobes.h>
11246  
11247  #include <asm/pgtable.h>
11248  #include <asm/pgalloc.h>
11249 @@ -376,13 +377,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
11250         if (atomic_dec_and_test(&sig->sigcnt))
11251                 free_signal_struct(sig);
11252  }
11253 -
11254 +#ifdef CONFIG_PREEMPT_RT_BASE
11255 +static
11256 +#endif
11257  void __put_task_struct(struct task_struct *tsk)
11258  {
11259         WARN_ON(!tsk->exit_state);
11260         WARN_ON(atomic_read(&tsk->usage));
11261         WARN_ON(tsk == current);
11262  
11263 +       /*
11264 +        * Remove function-return probe instances associated with this
11265 +        * task and put them back on the free list.
11266 +        */
11267 +       kprobe_flush_task(tsk);
11268 +
11269 +       /* Task is done with its stack. */
11270 +       put_task_stack(tsk);
11271 +
11272         cgroup_free(tsk);
11273         task_numa_free(tsk);
11274         security_task_free(tsk);
11275 @@ -393,7 +405,18 @@ void __put_task_struct(struct task_struct *tsk)
11276         if (!profile_handoff_task(tsk))
11277                 free_task(tsk);
11278  }
11279 +#ifndef CONFIG_PREEMPT_RT_BASE
11280  EXPORT_SYMBOL_GPL(__put_task_struct);
11281 +#else
11282 +void __put_task_struct_cb(struct rcu_head *rhp)
11283 +{
11284 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
11285 +
11286 +       __put_task_struct(tsk);
11287 +
11288 +}
11289 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
11290 +#endif
11291  
11292  void __init __weak arch_task_cache_init(void) { }
11293  
11294 @@ -852,6 +875,19 @@ void __mmdrop(struct mm_struct *mm)
11295  }
11296  EXPORT_SYMBOL_GPL(__mmdrop);
11297  
11298 +#ifdef CONFIG_PREEMPT_RT_BASE
11299 +/*
11300 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
11301 + * want another facility to make this work.
11302 + */
11303 +void __mmdrop_delayed(struct rcu_head *rhp)
11304 +{
11305 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
11306 +
11307 +       __mmdrop(mm);
11308 +}
11309 +#endif
11310 +
11311  static inline void __mmput(struct mm_struct *mm)
11312  {
11313         VM_BUG_ON(atomic_read(&mm->mm_users));
11314 @@ -1426,6 +1462,9 @@ static void rt_mutex_init_task(struct task_struct *p)
11315   */
11316  static void posix_cpu_timers_init(struct task_struct *tsk)
11317  {
11318 +#ifdef CONFIG_PREEMPT_RT_BASE
11319 +       tsk->posix_timer_list = NULL;
11320 +#endif
11321         tsk->cputime_expires.prof_exp = 0;
11322         tsk->cputime_expires.virt_exp = 0;
11323         tsk->cputime_expires.sched_exp = 0;
11324 @@ -1552,6 +1591,7 @@ static __latent_entropy struct task_struct *copy_process(
11325         spin_lock_init(&p->alloc_lock);
11326  
11327         init_sigpending(&p->pending);
11328 +       p->sigqueue_cache = NULL;
11329  
11330         p->utime = p->stime = p->gtime = 0;
11331         p->utimescaled = p->stimescaled = 0;
11332 diff --git a/kernel/futex.c b/kernel/futex.c
11333 index 2c4be467fecd..064917c2d9a5 100644
11334 --- a/kernel/futex.c
11335 +++ b/kernel/futex.c
11336 @@ -904,7 +904,9 @@ void exit_pi_state_list(struct task_struct *curr)
11337                  * task still owns the PI-state:
11338                  */
11339                 if (head->next != next) {
11340 +                       raw_spin_unlock_irq(&curr->pi_lock);
11341                         spin_unlock(&hb->lock);
11342 +                       raw_spin_lock_irq(&curr->pi_lock);
11343                         continue;
11344                 }
11345  
11346 @@ -1299,6 +1301,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11347         struct futex_pi_state *pi_state = this->pi_state;
11348         u32 uninitialized_var(curval), newval;
11349         WAKE_Q(wake_q);
11350 +       WAKE_Q(wake_sleeper_q);
11351         bool deboost;
11352         int ret = 0;
11353  
11354 @@ -1365,7 +1368,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11355  
11356         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11357  
11358 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
11359 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
11360 +                                       &wake_sleeper_q);
11361  
11362         /*
11363          * First unlock HB so the waiter does not spin on it once he got woken
11364 @@ -1373,8 +1377,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11365          * deboost first (and lose our higher priority), then the task might get
11366          * scheduled away before the wake up can take place.
11367          */
11368 -       spin_unlock(&hb->lock);
11369 +       deboost |= spin_unlock_no_deboost(&hb->lock);
11370         wake_up_q(&wake_q);
11371 +       wake_up_q_sleeper(&wake_sleeper_q);
11372         if (deboost)
11373                 rt_mutex_adjust_prio(current);
11374  
11375 @@ -1924,6 +1929,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
11376                                 requeue_pi_wake_futex(this, &key2, hb2);
11377                                 drop_count++;
11378                                 continue;
11379 +                       } else if (ret == -EAGAIN) {
11380 +                               /*
11381 +                                * Waiter was woken by timeout or
11382 +                                * signal and has set pi_blocked_on to
11383 +                                * PI_WAKEUP_INPROGRESS before we
11384 +                                * tried to enqueue it on the rtmutex.
11385 +                                */
11386 +                               this->pi_state = NULL;
11387 +                               put_pi_state(pi_state);
11388 +                               continue;
11389                         } else if (ret) {
11390                                 /*
11391                                  * rt_mutex_start_proxy_lock() detected a
11392 @@ -2814,7 +2829,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11393         struct hrtimer_sleeper timeout, *to = NULL;
11394         struct rt_mutex_waiter rt_waiter;
11395         struct rt_mutex *pi_mutex = NULL;
11396 -       struct futex_hash_bucket *hb;
11397 +       struct futex_hash_bucket *hb, *hb2;
11398         union futex_key key2 = FUTEX_KEY_INIT;
11399         struct futex_q q = futex_q_init;
11400         int res, ret;
11401 @@ -2839,10 +2854,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11402          * The waiter is allocated on our stack, manipulated by the requeue
11403          * code while we sleep on uaddr.
11404          */
11405 -       debug_rt_mutex_init_waiter(&rt_waiter);
11406 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
11407 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
11408 -       rt_waiter.task = NULL;
11409 +       rt_mutex_init_waiter(&rt_waiter, false);
11410  
11411         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
11412         if (unlikely(ret != 0))
11413 @@ -2873,20 +2885,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11414         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
11415         futex_wait_queue_me(hb, &q, to);
11416  
11417 -       spin_lock(&hb->lock);
11418 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11419 -       spin_unlock(&hb->lock);
11420 -       if (ret)
11421 -               goto out_put_keys;
11422 +       /*
11423 +        * On RT we must avoid races with requeue and trying to block
11424 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
11425 +        * serializing access to pi_blocked_on with pi_lock.
11426 +        */
11427 +       raw_spin_lock_irq(&current->pi_lock);
11428 +       if (current->pi_blocked_on) {
11429 +               /*
11430 +                * We have been requeued or are in the process of
11431 +                * being requeued.
11432 +                */
11433 +               raw_spin_unlock_irq(&current->pi_lock);
11434 +       } else {
11435 +               /*
11436 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
11437 +                * prevents a concurrent requeue from moving us to the
11438 +                * uaddr2 rtmutex. After that we can safely acquire
11439 +                * (and possibly block on) hb->lock.
11440 +                */
11441 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
11442 +               raw_spin_unlock_irq(&current->pi_lock);
11443 +
11444 +               spin_lock(&hb->lock);
11445 +
11446 +               /*
11447 +                * Clean up pi_blocked_on. We might leak it otherwise
11448 +                * when we succeeded with the hb->lock in the fast
11449 +                * path.
11450 +                */
11451 +               raw_spin_lock_irq(&current->pi_lock);
11452 +               current->pi_blocked_on = NULL;
11453 +               raw_spin_unlock_irq(&current->pi_lock);
11454 +
11455 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11456 +               spin_unlock(&hb->lock);
11457 +               if (ret)
11458 +                       goto out_put_keys;
11459 +       }
11460  
11461         /*
11462 -        * In order for us to be here, we know our q.key == key2, and since
11463 -        * we took the hb->lock above, we also know that futex_requeue() has
11464 -        * completed and we no longer have to concern ourselves with a wakeup
11465 -        * race with the atomic proxy lock acquisition by the requeue code. The
11466 -        * futex_requeue dropped our key1 reference and incremented our key2
11467 -        * reference count.
11468 +        * In order to be here, we have either been requeued, are in
11469 +        * the process of being requeued, or requeue successfully
11470 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
11471 +        * non-null above, we may be racing with a requeue.  Do not
11472 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
11473 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
11474 +        * reference and incremented our key2 reference count.
11475          */
11476 +       hb2 = hash_futex(&key2);
11477  
11478         /* Check if the requeue code acquired the second futex for us. */
11479         if (!q.rt_waiter) {
11480 @@ -2895,14 +2942,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11481                  * did a lock-steal - fix up the PI-state in that case.
11482                  */
11483                 if (q.pi_state && (q.pi_state->owner != current)) {
11484 -                       spin_lock(q.lock_ptr);
11485 +                       spin_lock(&hb2->lock);
11486 +                       BUG_ON(&hb2->lock != q.lock_ptr);
11487                         ret = fixup_pi_state_owner(uaddr2, &q, current);
11488                         /*
11489                          * Drop the reference to the pi state which
11490                          * the requeue_pi() code acquired for us.
11491                          */
11492                         put_pi_state(q.pi_state);
11493 -                       spin_unlock(q.lock_ptr);
11494 +                       spin_unlock(&hb2->lock);
11495                 }
11496         } else {
11497                 /*
11498 @@ -2915,7 +2963,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11499                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
11500                 debug_rt_mutex_free_waiter(&rt_waiter);
11501  
11502 -               spin_lock(q.lock_ptr);
11503 +               spin_lock(&hb2->lock);
11504 +               BUG_ON(&hb2->lock != q.lock_ptr);
11505                 /*
11506                  * Fixup the pi_state owner and possibly acquire the lock if we
11507                  * haven't already.
11508 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
11509 index d3f24905852c..f87aa8fdcc51 100644
11510 --- a/kernel/irq/handle.c
11511 +++ b/kernel/irq/handle.c
11512 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
11513  {
11514         irqreturn_t retval;
11515         unsigned int flags = 0;
11516 +       struct pt_regs *regs = get_irq_regs();
11517 +       u64 ip = regs ? instruction_pointer(regs) : 0;
11518  
11519         retval = __handle_irq_event_percpu(desc, &flags);
11520  
11521 -       add_interrupt_randomness(desc->irq_data.irq, flags);
11522 +#ifdef CONFIG_PREEMPT_RT_FULL
11523 +       desc->random_ip = ip;
11524 +#else
11525 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
11526 +#endif
11527  
11528         if (!noirqdebug)
11529                 note_interrupt(desc, retval);
11530 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
11531 index 6b669593e7eb..e357bf6c59d5 100644
11532 --- a/kernel/irq/manage.c
11533 +++ b/kernel/irq/manage.c
11534 @@ -22,6 +22,7 @@
11535  #include "internals.h"
11536  
11537  #ifdef CONFIG_IRQ_FORCED_THREADING
11538 +# ifndef CONFIG_PREEMPT_RT_BASE
11539  __read_mostly bool force_irqthreads;
11540  
11541  static int __init setup_forced_irqthreads(char *arg)
11542 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
11543         return 0;
11544  }
11545  early_param("threadirqs", setup_forced_irqthreads);
11546 +# endif
11547  #endif
11548  
11549  static void __synchronize_hardirq(struct irq_desc *desc)
11550 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
11551  
11552         if (desc->affinity_notify) {
11553                 kref_get(&desc->affinity_notify->kref);
11554 +
11555 +#ifdef CONFIG_PREEMPT_RT_BASE
11556 +               swork_queue(&desc->affinity_notify->swork);
11557 +#else
11558                 schedule_work(&desc->affinity_notify->work);
11559 +#endif
11560         }
11561         irqd_set(data, IRQD_AFFINITY_SET);
11562  
11563 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
11564  }
11565  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
11566  
11567 -static void irq_affinity_notify(struct work_struct *work)
11568 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
11569  {
11570 -       struct irq_affinity_notify *notify =
11571 -               container_of(work, struct irq_affinity_notify, work);
11572         struct irq_desc *desc = irq_to_desc(notify->irq);
11573         cpumask_var_t cpumask;
11574         unsigned long flags;
11575 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
11576         kref_put(&notify->kref, notify->release);
11577  }
11578  
11579 +#ifdef CONFIG_PREEMPT_RT_BASE
11580 +static void init_helper_thread(void)
11581 +{
11582 +       static int init_sworker_once;
11583 +
11584 +       if (init_sworker_once)
11585 +               return;
11586 +       if (WARN_ON(swork_get()))
11587 +               return;
11588 +       init_sworker_once = 1;
11589 +}
11590 +
11591 +static void irq_affinity_notify(struct swork_event *swork)
11592 +{
11593 +       struct irq_affinity_notify *notify =
11594 +               container_of(swork, struct irq_affinity_notify, swork);
11595 +       _irq_affinity_notify(notify);
11596 +}
11597 +
11598 +#else
11599 +
11600 +static void irq_affinity_notify(struct work_struct *work)
11601 +{
11602 +       struct irq_affinity_notify *notify =
11603 +               container_of(work, struct irq_affinity_notify, work);
11604 +       _irq_affinity_notify(notify);
11605 +}
11606 +#endif
11607 +
11608  /**
11609   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
11610   *     @irq:           Interrupt for which to enable/disable notification
11611 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
11612         if (notify) {
11613                 notify->irq = irq;
11614                 kref_init(&notify->kref);
11615 +#ifdef CONFIG_PREEMPT_RT_BASE
11616 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
11617 +               init_helper_thread();
11618 +#else
11619                 INIT_WORK(&notify->work, irq_affinity_notify);
11620 +#endif
11621         }
11622  
11623         raw_spin_lock_irqsave(&desc->lock, flags);
11624 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
11625         local_bh_disable();
11626         ret = action->thread_fn(action->irq, action->dev_id);
11627         irq_finalize_oneshot(desc, action);
11628 -       local_bh_enable();
11629 +       /*
11630 +        * Interrupts which have real time requirements can be set up
11631 +        * to avoid softirq processing in the thread handler. This is
11632 +        * safe as these interrupts do not raise soft interrupts.
11633 +        */
11634 +       if (irq_settings_no_softirq_call(desc))
11635 +               _local_bh_enable();
11636 +       else
11637 +               local_bh_enable();
11638         return ret;
11639  }
11640  
11641 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
11642                 if (action_ret == IRQ_WAKE_THREAD)
11643                         irq_wake_secondary(desc, action);
11644  
11645 +#ifdef CONFIG_PREEMPT_RT_FULL
11646 +               migrate_disable();
11647 +               add_interrupt_randomness(action->irq, 0,
11648 +                                desc->random_ip ^ (unsigned long) action);
11649 +               migrate_enable();
11650 +#endif
11651                 wake_threads_waitq(desc);
11652         }
11653  
11654 @@ -1336,6 +1389,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
11655                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
11656                 }
11657  
11658 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
11659 +                       irq_settings_set_no_softirq_call(desc);
11660 +
11661                 /* Set default affinity mask once everything is setup */
11662                 setup_affinity(desc, mask);
11663  
11664 @@ -2061,7 +2117,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
11665   *     This call sets the internal irqchip state of an interrupt,
11666   *     depending on the value of @which.
11667   *
11668 - *     This function should be called with preemption disabled if the
11669 + *     This function should be called with migration disabled if the
11670   *     interrupt controller has per-cpu registers.
11671   */
11672  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
11673 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
11674 index 320579d89091..2df2d4445b1e 100644
11675 --- a/kernel/irq/settings.h
11676 +++ b/kernel/irq/settings.h
11677 @@ -16,6 +16,7 @@ enum {
11678         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
11679         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
11680         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
11681 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
11682         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
11683  };
11684  
11685 @@ -30,6 +31,7 @@ enum {
11686  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
11687  #define IRQ_IS_POLLED          GOT_YOU_MORON
11688  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
11689 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
11690  #undef IRQF_MODIFY_MASK
11691  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
11692  
11693 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
11694         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
11695  }
11696  
11697 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
11698 +{
11699 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
11700 +}
11701 +
11702 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
11703 +{
11704 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
11705 +}
11706 +
11707  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
11708  {
11709         return desc->status_use_accessors & _IRQ_PER_CPU;
11710 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
11711 index 5707f97a3e6a..73f38dc7a7fb 100644
11712 --- a/kernel/irq/spurious.c
11713 +++ b/kernel/irq/spurious.c
11714 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
11715  
11716  static int __init irqfixup_setup(char *str)
11717  {
11718 +#ifdef CONFIG_PREEMPT_RT_BASE
11719 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11720 +       return 1;
11721 +#endif
11722         irqfixup = 1;
11723         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
11724         printk(KERN_WARNING "This may impact system performance.\n");
11725 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
11726  
11727  static int __init irqpoll_setup(char *str)
11728  {
11729 +#ifdef CONFIG_PREEMPT_RT_BASE
11730 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11731 +       return 1;
11732 +#endif
11733         irqfixup = 2;
11734         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
11735                                 "enabled\n");
11736 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
11737 index bcf107ce0854..2899ba0d23d1 100644
11738 --- a/kernel/irq_work.c
11739 +++ b/kernel/irq_work.c
11740 @@ -17,6 +17,7 @@
11741  #include <linux/cpu.h>
11742  #include <linux/notifier.h>
11743  #include <linux/smp.h>
11744 +#include <linux/interrupt.h>
11745  #include <asm/processor.h>
11746  
11747  
11748 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
11749   */
11750  bool irq_work_queue_on(struct irq_work *work, int cpu)
11751  {
11752 +       struct llist_head *list;
11753 +
11754         /* All work should have been flushed before going offline */
11755         WARN_ON_ONCE(cpu_is_offline(cpu));
11756  
11757 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
11758         if (!irq_work_claim(work))
11759                 return false;
11760  
11761 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
11762 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
11763 +               list = &per_cpu(lazy_list, cpu);
11764 +       else
11765 +               list = &per_cpu(raised_list, cpu);
11766 +
11767 +       if (llist_add(&work->llnode, list))
11768                 arch_send_call_function_single_ipi(cpu);
11769  
11770         return true;
11771 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
11772  /* Enqueue the irq work @work on the current CPU */
11773  bool irq_work_queue(struct irq_work *work)
11774  {
11775 +       struct llist_head *list;
11776 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
11777 +
11778         /* Only queue if not already pending */
11779         if (!irq_work_claim(work))
11780                 return false;
11781 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
11782         /* Queue the entry and raise the IPI if needed. */
11783         preempt_disable();
11784  
11785 -       /* If the work is "lazy", handle it from next tick if any */
11786 -       if (work->flags & IRQ_WORK_LAZY) {
11787 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
11788 -                   tick_nohz_tick_stopped())
11789 -                       arch_irq_work_raise();
11790 -       } else {
11791 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
11792 +       lazy_work = work->flags & IRQ_WORK_LAZY;
11793 +
11794 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
11795 +               list = this_cpu_ptr(&lazy_list);
11796 +       else
11797 +               list = this_cpu_ptr(&raised_list);
11798 +
11799 +       if (llist_add(&work->llnode, list)) {
11800 +               if (!lazy_work || tick_nohz_tick_stopped())
11801                         arch_irq_work_raise();
11802         }
11803  
11804 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
11805         raised = this_cpu_ptr(&raised_list);
11806         lazy = this_cpu_ptr(&lazy_list);
11807  
11808 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
11809 -               if (llist_empty(lazy))
11810 -                       return false;
11811 +       if (llist_empty(raised) && llist_empty(lazy))
11812 +               return false;
11813  
11814         /* All work should have been flushed before going offline */
11815         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
11816 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
11817         struct irq_work *work;
11818         struct llist_node *llnode;
11819  
11820 -       BUG_ON(!irqs_disabled());
11821 +       BUG_ON_NONRT(!irqs_disabled());
11822  
11823         if (llist_empty(list))
11824                 return;
11825 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
11826  void irq_work_run(void)
11827  {
11828         irq_work_run_list(this_cpu_ptr(&raised_list));
11829 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
11830 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
11831 +               /*
11832 +                * NOTE: we raise softirq via IPI for safety,
11833 +                * and execute in irq_work_tick() to move the
11834 +                * overhead from hard to soft irq context.
11835 +                */
11836 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
11837 +                       raise_softirq(TIMER_SOFTIRQ);
11838 +       } else
11839 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
11840  }
11841  EXPORT_SYMBOL_GPL(irq_work_run);
11842  
11843 @@ -179,8 +200,17 @@ void irq_work_tick(void)
11844  
11845         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
11846                 irq_work_run_list(raised);
11847 +
11848 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
11849 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
11850 +}
11851 +
11852 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
11853 +void irq_work_tick_soft(void)
11854 +{
11855         irq_work_run_list(this_cpu_ptr(&lazy_list));
11856  }
11857 +#endif
11858  
11859  /*
11860   * Synchronize against the irq_work @entry, ensures the entry is not
11861 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
11862 index ee1bc1bb8feb..ddef07958840 100644
11863 --- a/kernel/ksysfs.c
11864 +++ b/kernel/ksysfs.c
11865 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
11866  
11867  #endif /* CONFIG_KEXEC_CORE */
11868  
11869 +#if defined(CONFIG_PREEMPT_RT_FULL)
11870 +static ssize_t  realtime_show(struct kobject *kobj,
11871 +                             struct kobj_attribute *attr, char *buf)
11872 +{
11873 +       return sprintf(buf, "%d\n", 1);
11874 +}
11875 +KERNEL_ATTR_RO(realtime);
11876 +#endif
11877 +
11878  /* whether file capabilities are enabled */
11879  static ssize_t fscaps_show(struct kobject *kobj,
11880                                   struct kobj_attribute *attr, char *buf)
11881 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
11882         &rcu_expedited_attr.attr,
11883         &rcu_normal_attr.attr,
11884  #endif
11885 +#ifdef CONFIG_PREEMPT_RT_FULL
11886 +       &realtime_attr.attr,
11887 +#endif
11888         NULL
11889  };
11890  
11891 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
11892 index 6f88e352cd4f..5e27fb1079e7 100644
11893 --- a/kernel/locking/Makefile
11894 +++ b/kernel/locking/Makefile
11895 @@ -2,7 +2,7 @@
11896  # and is generally not a function of system call inputs.
11897  KCOV_INSTRUMENT                := n
11898  
11899 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
11900 +obj-y += semaphore.o percpu-rwsem.o
11901  
11902  ifdef CONFIG_FUNCTION_TRACER
11903  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
11904 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
11905  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
11906  endif
11907  
11908 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11909 +obj-y += mutex.o
11910  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
11911 +obj-y += rwsem.o
11912 +endif
11913  obj-$(CONFIG_LOCKDEP) += lockdep.o
11914  ifeq ($(CONFIG_PROC_FS),y)
11915  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
11916 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
11917  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
11918  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
11919  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
11920 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11921  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
11922  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
11923 +endif
11924 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
11925  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
11926  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
11927 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
11928 index 4d7ffc0a0d00..9e52009c192e 100644
11929 --- a/kernel/locking/lockdep.c
11930 +++ b/kernel/locking/lockdep.c
11931 @@ -3689,6 +3689,7 @@ static void check_flags(unsigned long flags)
11932                 }
11933         }
11934  
11935 +#ifndef CONFIG_PREEMPT_RT_FULL
11936         /*
11937          * We dont accurately track softirq state in e.g.
11938          * hardirq contexts (such as on 4KSTACKS), so only
11939 @@ -3703,6 +3704,7 @@ static void check_flags(unsigned long flags)
11940                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
11941                 }
11942         }
11943 +#endif
11944  
11945         if (!debug_locks)
11946                 print_irqtrace_events(current);
11947 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
11948 index f8c5af52a131..788068773e61 100644
11949 --- a/kernel/locking/locktorture.c
11950 +++ b/kernel/locking/locktorture.c
11951 @@ -26,7 +26,6 @@
11952  #include <linux/kthread.h>
11953  #include <linux/sched/rt.h>
11954  #include <linux/spinlock.h>
11955 -#include <linux/rwlock.h>
11956  #include <linux/mutex.h>
11957  #include <linux/rwsem.h>
11958  #include <linux/smp.h>
11959 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
11960 index ce182599cf2e..2ad3a1e8344c 100644
11961 --- a/kernel/locking/percpu-rwsem.c
11962 +++ b/kernel/locking/percpu-rwsem.c
11963 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
11964         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
11965         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
11966         __init_rwsem(&sem->rw_sem, name, rwsem_key);
11967 -       init_waitqueue_head(&sem->writer);
11968 +       init_swait_queue_head(&sem->writer);
11969         sem->readers_block = 0;
11970         return 0;
11971  }
11972 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
11973         __this_cpu_dec(*sem->read_count);
11974  
11975         /* Prod writer to recheck readers_active */
11976 -       wake_up(&sem->writer);
11977 +       swake_up(&sem->writer);
11978  }
11979  EXPORT_SYMBOL_GPL(__percpu_up_read);
11980  
11981 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
11982          */
11983  
11984         /* Wait for all now active readers to complete. */
11985 -       wait_event(sem->writer, readers_active_check(sem));
11986 +       swait_event(sem->writer, readers_active_check(sem));
11987  }
11988  EXPORT_SYMBOL_GPL(percpu_down_write);
11989  
11990 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
11991 new file mode 100644
11992 index 000000000000..665754c00e1e
11993 --- /dev/null
11994 +++ b/kernel/locking/rt.c
11995 @@ -0,0 +1,498 @@
11996 +/*
11997 + * kernel/rt.c
11998 + *
11999 + * Real-Time Preemption Support
12000 + *
12001 + * started by Ingo Molnar:
12002 + *
12003 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
12004 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12005 + *
12006 + * historic credit for proving that Linux spinlocks can be implemented via
12007 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
12008 + * and others) who prototyped it on 2.4 and did lots of comparative
12009 + * research and analysis; TimeSys, for proving that you can implement a
12010 + * fully preemptible kernel via the use of IRQ threading and mutexes;
12011 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
12012 + * right one; and to MontaVista, who ported pmutexes to 2.6.
12013 + *
12014 + * This code is a from-scratch implementation and is not based on pmutexes,
12015 + * but the idea of converting spinlocks to mutexes is used here too.
12016 + *
12017 + * lock debugging, locking tree, deadlock detection:
12018 + *
12019 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
12020 + *  Released under the General Public License (GPL).
12021 + *
12022 + * Includes portions of the generic R/W semaphore implementation from:
12023 + *
12024 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
12025 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
12026 + *  - Derived also from comments by Linus
12027 + *
12028 + * Pending ownership of locks and ownership stealing:
12029 + *
12030 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
12031 + *
12032 + *   (also by Steven Rostedt)
12033 + *    - Converted single pi_lock to individual task locks.
12034 + *
12035 + * By Esben Nielsen:
12036 + *    Doing priority inheritance with help of the scheduler.
12037 + *
12038 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12039 + *  - major rework based on Esben Nielsen's initial patch
12040 + *  - replaced thread_info references by task_struct refs
12041 + *  - removed task->pending_owner dependency
12042 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
12043 + *    in the scheduler return path as discussed with Steven Rostedt
12044 + *
12045 + *  Copyright (C) 2006, Kihon Technologies Inc.
12046 + *    Steven Rostedt <rostedt@goodmis.org>
12047 + *  - debugged and patched Thomas Gleixner's rework.
12048 + *  - added back the cmpxchg to the rework.
12049 + *  - turned atomic require back on for SMP.
12050 + */
12051 +
12052 +#include <linux/spinlock.h>
12053 +#include <linux/rtmutex.h>
12054 +#include <linux/sched.h>
12055 +#include <linux/delay.h>
12056 +#include <linux/module.h>
12057 +#include <linux/kallsyms.h>
12058 +#include <linux/syscalls.h>
12059 +#include <linux/interrupt.h>
12060 +#include <linux/plist.h>
12061 +#include <linux/fs.h>
12062 +#include <linux/futex.h>
12063 +#include <linux/hrtimer.h>
12064 +
12065 +#include "rtmutex_common.h"
12066 +
12067 +/*
12068 + * struct mutex functions
12069 + */
12070 +void __mutex_do_init(struct mutex *mutex, const char *name,
12071 +                    struct lock_class_key *key)
12072 +{
12073 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12074 +       /*
12075 +        * Make sure we are not reinitializing a held lock:
12076 +        */
12077 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
12078 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
12079 +#endif
12080 +       mutex->lock.save_state = 0;
12081 +}
12082 +EXPORT_SYMBOL(__mutex_do_init);
12083 +
12084 +void __lockfunc _mutex_lock(struct mutex *lock)
12085 +{
12086 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12087 +       rt_mutex_lock(&lock->lock);
12088 +}
12089 +EXPORT_SYMBOL(_mutex_lock);
12090 +
12091 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
12092 +{
12093 +       int ret;
12094 +
12095 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12096 +       ret = rt_mutex_lock_interruptible(&lock->lock);
12097 +       if (ret)
12098 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12099 +       return ret;
12100 +}
12101 +EXPORT_SYMBOL(_mutex_lock_interruptible);
12102 +
12103 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
12104 +{
12105 +       int ret;
12106 +
12107 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12108 +       ret = rt_mutex_lock_killable(&lock->lock);
12109 +       if (ret)
12110 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12111 +       return ret;
12112 +}
12113 +EXPORT_SYMBOL(_mutex_lock_killable);
12114 +
12115 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12116 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
12117 +{
12118 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
12119 +       rt_mutex_lock(&lock->lock);
12120 +}
12121 +EXPORT_SYMBOL(_mutex_lock_nested);
12122 +
12123 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
12124 +{
12125 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
12126 +       rt_mutex_lock(&lock->lock);
12127 +}
12128 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
12129 +
12130 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
12131 +{
12132 +       int ret;
12133 +
12134 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
12135 +       ret = rt_mutex_lock_interruptible(&lock->lock);
12136 +       if (ret)
12137 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12138 +       return ret;
12139 +}
12140 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
12141 +
12142 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
12143 +{
12144 +       int ret;
12145 +
12146 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
12147 +       ret = rt_mutex_lock_killable(&lock->lock);
12148 +       if (ret)
12149 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12150 +       return ret;
12151 +}
12152 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
12153 +#endif
12154 +
12155 +int __lockfunc _mutex_trylock(struct mutex *lock)
12156 +{
12157 +       int ret = rt_mutex_trylock(&lock->lock);
12158 +
12159 +       if (ret)
12160 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12161 +
12162 +       return ret;
12163 +}
12164 +EXPORT_SYMBOL(_mutex_trylock);
12165 +
12166 +void __lockfunc _mutex_unlock(struct mutex *lock)
12167 +{
12168 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
12169 +       rt_mutex_unlock(&lock->lock);
12170 +}
12171 +EXPORT_SYMBOL(_mutex_unlock);
12172 +
12173 +/*
12174 + * rwlock_t functions
12175 + */
12176 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
12177 +{
12178 +       int ret;
12179 +
12180 +       migrate_disable();
12181 +       ret = rt_mutex_trylock(&rwlock->lock);
12182 +       if (ret)
12183 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
12184 +       else
12185 +               migrate_enable();
12186 +
12187 +       return ret;
12188 +}
12189 +EXPORT_SYMBOL(rt_write_trylock);
12190 +
12191 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
12192 +{
12193 +       int ret;
12194 +
12195 +       *flags = 0;
12196 +       ret = rt_write_trylock(rwlock);
12197 +       return ret;
12198 +}
12199 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
12200 +
12201 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
12202 +{
12203 +       struct rt_mutex *lock = &rwlock->lock;
12204 +       int ret = 1;
12205 +
12206 +       /*
12207 +        * recursive read locks succeed when current owns the lock,
12208 +        * but not when read_depth == 0 which means that the lock is
12209 +        * write locked.
12210 +        */
12211 +       if (rt_mutex_owner(lock) != current) {
12212 +               migrate_disable();
12213 +               ret = rt_mutex_trylock(lock);
12214 +               if (ret)
12215 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
12216 +               else
12217 +                       migrate_enable();
12218 +
12219 +       } else if (!rwlock->read_depth) {
12220 +               ret = 0;
12221 +       }
12222 +
12223 +       if (ret)
12224 +               rwlock->read_depth++;
12225 +
12226 +       return ret;
12227 +}
12228 +EXPORT_SYMBOL(rt_read_trylock);
12229 +
12230 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
12231 +{
12232 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
12233 +       __rt_spin_lock(&rwlock->lock);
12234 +}
12235 +EXPORT_SYMBOL(rt_write_lock);
12236 +
12237 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
12238 +{
12239 +       struct rt_mutex *lock = &rwlock->lock;
12240 +
12241 +
12242 +       /*
12243 +        * recursive read locks succeed when current owns the lock
12244 +        */
12245 +       if (rt_mutex_owner(lock) != current) {
12246 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
12247 +               __rt_spin_lock(lock);
12248 +       }
12249 +       rwlock->read_depth++;
12250 +}
12251 +
12252 +EXPORT_SYMBOL(rt_read_lock);
12253 +
12254 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
12255 +{
12256 +       /* NOTE: we always pass in '1' for nested, for simplicity */
12257 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
12258 +       __rt_spin_unlock(&rwlock->lock);
12259 +       migrate_enable();
12260 +}
12261 +EXPORT_SYMBOL(rt_write_unlock);
12262 +
12263 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
12264 +{
12265 +       /* Release the lock only when read_depth is down to 0 */
12266 +       if (--rwlock->read_depth == 0) {
12267 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
12268 +               __rt_spin_unlock(&rwlock->lock);
12269 +               migrate_enable();
12270 +       }
12271 +}
12272 +EXPORT_SYMBOL(rt_read_unlock);
12273 +
12274 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
12275 +{
12276 +       rt_write_lock(rwlock);
12277 +
12278 +       return 0;
12279 +}
12280 +EXPORT_SYMBOL(rt_write_lock_irqsave);
12281 +
12282 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
12283 +{
12284 +       rt_read_lock(rwlock);
12285 +
12286 +       return 0;
12287 +}
12288 +EXPORT_SYMBOL(rt_read_lock_irqsave);
12289 +
12290 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
12291 +{
12292 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12293 +       /*
12294 +        * Make sure we are not reinitializing a held lock:
12295 +        */
12296 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
12297 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
12298 +#endif
12299 +       rwlock->lock.save_state = 1;
12300 +       rwlock->read_depth = 0;
12301 +}
12302 +EXPORT_SYMBOL(__rt_rwlock_init);
12303 +
12304 +/*
12305 + * rw_semaphores
12306 + */
12307 +
12308 +void  rt_up_write(struct rw_semaphore *rwsem)
12309 +{
12310 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12311 +       rt_mutex_unlock(&rwsem->lock);
12312 +}
12313 +EXPORT_SYMBOL(rt_up_write);
12314 +
12315 +void __rt_up_read(struct rw_semaphore *rwsem)
12316 +{
12317 +       if (--rwsem->read_depth == 0)
12318 +               rt_mutex_unlock(&rwsem->lock);
12319 +}
12320 +
12321 +void  rt_up_read(struct rw_semaphore *rwsem)
12322 +{
12323 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12324 +       __rt_up_read(rwsem);
12325 +}
12326 +EXPORT_SYMBOL(rt_up_read);
12327 +
12328 +/*
12329 + * downgrade a write lock into a read lock
12330 + * - just wake up any readers at the front of the queue
12331 + */
12332 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
12333 +{
12334 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
12335 +       rwsem->read_depth = 1;
12336 +}
12337 +EXPORT_SYMBOL(rt_downgrade_write);
12338 +
12339 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
12340 +{
12341 +       int ret = rt_mutex_trylock(&rwsem->lock);
12342 +
12343 +       if (ret)
12344 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12345 +       return ret;
12346 +}
12347 +EXPORT_SYMBOL(rt_down_write_trylock);
12348 +
12349 +void  rt_down_write(struct rw_semaphore *rwsem)
12350 +{
12351 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12352 +       rt_mutex_lock(&rwsem->lock);
12353 +}
12354 +EXPORT_SYMBOL(rt_down_write);
12355 +
12356 +int rt_down_write_killable(struct rw_semaphore *rwsem)
12357 +{
12358 +       int ret;
12359 +
12360 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12361 +       ret = rt_mutex_lock_killable(&rwsem->lock);
12362 +       if (ret)
12363 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12364 +       return ret;
12365 +}
12366 +EXPORT_SYMBOL(rt_down_write_killable);
12367 +
12368 +int rt_down_write_killable_nested(struct rw_semaphore *rwsem, int subclass)
12369 +{
12370 +       int ret;
12371 +
12372 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12373 +       ret = rt_mutex_lock_killable(&rwsem->lock);
12374 +       if (ret)
12375 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12376 +       return ret;
12377 +}
12378 +EXPORT_SYMBOL(rt_down_write_killable_nested);
12379 +
12380 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
12381 +{
12382 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12383 +       rt_mutex_lock(&rwsem->lock);
12384 +}
12385 +EXPORT_SYMBOL(rt_down_write_nested);
12386 +
12387 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
12388 +                              struct lockdep_map *nest)
12389 +{
12390 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
12391 +       rt_mutex_lock(&rwsem->lock);
12392 +}
12393 +EXPORT_SYMBOL(rt_down_write_nested_lock);
12394 +
12395 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
12396 +{
12397 +       struct rt_mutex *lock = &rwsem->lock;
12398 +       int ret = 1;
12399 +
12400 +       /*
12401 +        * recursive read locks succeed when current owns the rwsem,
12402 +        * but not when read_depth == 0 which means that the rwsem is
12403 +        * write locked.
12404 +        */
12405 +       if (rt_mutex_owner(lock) != current)
12406 +               ret = rt_mutex_trylock(&rwsem->lock);
12407 +       else if (!rwsem->read_depth)
12408 +               ret = 0;
12409 +
12410 +       if (ret)
12411 +               rwsem->read_depth++;
12412 +       return ret;
12413 +
12414 +}
12415 +
12416 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
12417 +{
12418 +       int ret;
12419 +
12420 +       ret = rt__down_read_trylock(rwsem);
12421 +       if (ret)
12422 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12423 +
12424 +       return ret;
12425 +}
12426 +EXPORT_SYMBOL(rt_down_read_trylock);
12427 +
12428 +void rt__down_read(struct rw_semaphore *rwsem)
12429 +{
12430 +       struct rt_mutex *lock = &rwsem->lock;
12431 +
12432 +       if (rt_mutex_owner(lock) != current)
12433 +               rt_mutex_lock(&rwsem->lock);
12434 +       rwsem->read_depth++;
12435 +}
12436 +EXPORT_SYMBOL(rt__down_read);
12437 +
12438 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
12439 +{
12440 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
12441 +       rt__down_read(rwsem);
12442 +}
12443 +
12444 +void  rt_down_read(struct rw_semaphore *rwsem)
12445 +{
12446 +       __rt_down_read(rwsem, 0);
12447 +}
12448 +EXPORT_SYMBOL(rt_down_read);
12449 +
12450 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
12451 +{
12452 +       __rt_down_read(rwsem, subclass);
12453 +}
12454 +EXPORT_SYMBOL(rt_down_read_nested);
12455 +
12456 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
12457 +                             struct lock_class_key *key)
12458 +{
12459 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12460 +       /*
12461 +        * Make sure we are not reinitializing a held lock:
12462 +        */
12463 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
12464 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
12465 +#endif
12466 +       rwsem->read_depth = 0;
12467 +       rwsem->lock.save_state = 0;
12468 +}
12469 +EXPORT_SYMBOL(__rt_rwsem_init);
12470 +
12471 +/**
12472 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
12473 + * @cnt: the atomic which we are to dec
12474 + * @lock: the mutex to return holding if we dec to 0
12475 + *
12476 + * return true and hold lock if we dec to 0, return false otherwise
12477 + */
12478 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
12479 +{
12480 +       /* dec if we can't possibly hit 0 */
12481 +       if (atomic_add_unless(cnt, -1, 1))
12482 +               return 0;
12483 +       /* we might hit 0, so take the lock */
12484 +       mutex_lock(lock);
12485 +       if (!atomic_dec_and_test(cnt)) {
12486 +               /* when we actually did the dec, we didn't hit 0 */
12487 +               mutex_unlock(lock);
12488 +               return 0;
12489 +       }
12490 +       /* we hit 0, and we hold the lock */
12491 +       return 1;
12492 +}
12493 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
12494 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
12495 index 2c49d76f96c3..4f1a7663c34d 100644
12496 --- a/kernel/locking/rtmutex.c
12497 +++ b/kernel/locking/rtmutex.c
12498 @@ -7,6 +7,11 @@
12499   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12500   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
12501   *  Copyright (C) 2006 Esben Nielsen
12502 + *  Adaptive Spinlocks:
12503 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
12504 + *                                  and Peter Morreale,
12505 + * Adaptive Spinlocks simplification:
12506 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
12507   *
12508   *  See Documentation/locking/rt-mutex-design.txt for details.
12509   */
12510 @@ -16,6 +21,7 @@
12511  #include <linux/sched/rt.h>
12512  #include <linux/sched/deadline.h>
12513  #include <linux/timer.h>
12514 +#include <linux/ww_mutex.h>
12515  
12516  #include "rtmutex_common.h"
12517  
12518 @@ -133,6 +139,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
12519                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
12520  }
12521  
12522 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
12523 +{
12524 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
12525 +               waiter != PI_REQUEUE_INPROGRESS;
12526 +}
12527 +
12528  /*
12529   * We can speed up the acquire/release, if there's no debugging state to be
12530   * set up.
12531 @@ -414,6 +426,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
12532         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
12533  }
12534  
12535 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
12536 +{
12537 +       if (waiter->savestate)
12538 +               wake_up_lock_sleeper(waiter->task);
12539 +       else
12540 +               wake_up_process(waiter->task);
12541 +}
12542 +
12543  /*
12544   * Max number of times we'll walk the boosting chain:
12545   */
12546 @@ -421,7 +441,8 @@ int max_lock_depth = 1024;
12547  
12548  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
12549  {
12550 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
12551 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
12552 +               p->pi_blocked_on->lock : NULL;
12553  }
12554  
12555  /*
12556 @@ -557,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
12557          * reached or the state of the chain has changed while we
12558          * dropped the locks.
12559          */
12560 -       if (!waiter)
12561 +       if (!rt_mutex_real_waiter(waiter))
12562                 goto out_unlock_pi;
12563  
12564         /*
12565 @@ -719,13 +740,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
12566          * follow here. This is the end of the chain we are walking.
12567          */
12568         if (!rt_mutex_owner(lock)) {
12569 +               struct rt_mutex_waiter *lock_top_waiter;
12570 +
12571                 /*
12572                  * If the requeue [7] above changed the top waiter,
12573                  * then we need to wake the new top waiter up to try
12574                  * to get the lock.
12575                  */
12576 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
12577 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
12578 +               lock_top_waiter = rt_mutex_top_waiter(lock);
12579 +               if (prerequeue_top_waiter != lock_top_waiter)
12580 +                       rt_mutex_wake_waiter(lock_top_waiter);
12581                 raw_spin_unlock_irq(&lock->wait_lock);
12582                 return 0;
12583         }
12584 @@ -818,6 +842,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
12585         return ret;
12586  }
12587  
12588 +
12589 +#define STEAL_NORMAL  0
12590 +#define STEAL_LATERAL 1
12591 +
12592 +/*
12593 + * Note that RT tasks are excluded from lateral-steals to prevent the
12594 + * introduction of an unbounded latency.
12595 + */
12596 +static inline int lock_is_stealable(struct task_struct *task,
12597 +                                   struct task_struct *pendowner, int mode)
12598 +{
12599 +    if (mode == STEAL_NORMAL || rt_task(task)) {
12600 +           if (task->prio >= pendowner->prio)
12601 +                   return 0;
12602 +    } else if (task->prio > pendowner->prio)
12603 +           return 0;
12604 +    return 1;
12605 +}
12606 +
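/*
 * Worked example of the steal rules above (illustrative only, not part of
 * the patch), for two SCHED_OTHER tasks; kernel priority values assumed
 * (120 = nice 0, lower value = higher priority).  With STEAL_NORMAL the
 * stealer must be strictly higher priority than the pending owner; with
 * STEAL_LATERAL equal priority is enough, which is what the spinlock
 * slowpath uses.  An RT stealer is always held to the strict rule.
 *
 *   task->prio   pendowner->prio   STEAL_NORMAL   STEAL_LATERAL
 *      120             120              no              yes
 *      110             120              yes             yes
 *      130             120              no              no
 */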
12607  /*
12608   * Try to take an rt-mutex
12609   *
12610 @@ -828,8 +871,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
12611   * @waiter: The waiter that is queued to the lock's wait tree if the
12612   *         callsite called task_blocked_on_lock(), otherwise NULL
12613   */
12614 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12615 -                               struct rt_mutex_waiter *waiter)
12616 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
12617 +                                 struct task_struct *task,
12618 +                                 struct rt_mutex_waiter *waiter, int mode)
12619  {
12620         /*
12621          * Before testing whether we can acquire @lock, we set the
12622 @@ -866,8 +910,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12623                  * If waiter is not the highest priority waiter of
12624                  * @lock, give up.
12625                  */
12626 -               if (waiter != rt_mutex_top_waiter(lock))
12627 +               if (waiter != rt_mutex_top_waiter(lock)) {
12628 +                       /* XXX lock_is_stealable() ? */
12629                         return 0;
12630 +               }
12631  
12632                 /*
12633                  * We can acquire the lock. Remove the waiter from the
12634 @@ -885,14 +931,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12635                  * not need to be dequeued.
12636                  */
12637                 if (rt_mutex_has_waiters(lock)) {
12638 -                       /*
12639 -                        * If @task->prio is greater than or equal to
12640 -                        * the top waiter priority (kernel view),
12641 -                        * @task lost.
12642 -                        */
12643 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
12644 -                               return 0;
12645 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
12646  
12647 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
12648 +                               return 0;
12649                         /*
12650                          * The current top waiter stays enqueued. We
12651                          * don't have to change anything in the lock
12652 @@ -941,6 +983,433 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12653         return 1;
12654  }
12655  
12656 +#ifdef CONFIG_PREEMPT_RT_FULL
12657 +/*
12658 + * preemptible spin_lock functions:
12659 + */
12660 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
12661 +                                        void  (*slowfn)(struct rt_mutex *lock,
12662 +                                                        bool mg_off),
12663 +                                        bool do_mig_dis)
12664 +{
12665 +       might_sleep_no_state_check();
12666 +
12667 +       if (do_mig_dis)
12668 +               migrate_disable();
12669 +
12670 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
12671 +               rt_mutex_deadlock_account_lock(lock, current);
12672 +       else
12673 +               slowfn(lock, do_mig_dis);
12674 +}
12675 +
12676 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
12677 +                                         int (*slowfn)(struct rt_mutex *lock))
12678 +{
12679 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
12680 +               rt_mutex_deadlock_account_unlock(current);
12681 +               return 0;
12682 +       }
12683 +       return slowfn(lock);
12684 +}
12685 +#ifdef CONFIG_SMP
12686 +/*
12687 + * Note that owner is a speculative pointer and dereferencing relies
12688 + * on rcu_read_lock() and the check against the lock owner.
12689 + */
12690 +static int adaptive_wait(struct rt_mutex *lock,
12691 +                        struct task_struct *owner)
12692 +{
12693 +       int res = 0;
12694 +
12695 +       rcu_read_lock();
12696 +       for (;;) {
12697 +               if (owner != rt_mutex_owner(lock))
12698 +                       break;
12699 +               /*
12700 +                * Ensure that owner->on_cpu is dereferenced _after_
12701 +                * checking the above to be valid.
12702 +                */
12703 +               barrier();
12704 +               if (!owner->on_cpu) {
12705 +                       res = 1;
12706 +                       break;
12707 +               }
12708 +               cpu_relax();
12709 +       }
12710 +       rcu_read_unlock();
12711 +       return res;
12712 +}
12713 +#else
12714 +static int adaptive_wait(struct rt_mutex *lock,
12715 +                        struct task_struct *orig_owner)
12716 +{
12717 +       return 1;
12718 +}
12719 +#endif
12720 +
12721 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
12722 +                                  struct rt_mutex_waiter *waiter,
12723 +                                  struct task_struct *task,
12724 +                                  enum rtmutex_chainwalk chwalk);
12725 +/*
12726 + * Slow path lock function spin_lock style: this variant is very
12727 + * careful not to miss any non-lock wakeups.
12728 + *
12729 + * We store the current state under p->pi_lock in p->saved_state and
12730 + * the try_to_wake_up() code handles this accordingly.
12731 + */
12732 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
12733 +                                                   bool mg_off)
12734 +{
12735 +       struct task_struct *lock_owner, *self = current;
12736 +       struct rt_mutex_waiter waiter, *top_waiter;
12737 +       unsigned long flags;
12738 +       int ret;
12739 +
12740 +       rt_mutex_init_waiter(&waiter, true);
12741 +
12742 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
12743 +
12744 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
12745 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12746 +               return;
12747 +       }
12748 +
12749 +       BUG_ON(rt_mutex_owner(lock) == self);
12750 +
12751 +       /*
12752 +        * We save whatever state the task is in and we'll restore it
12753 +        * after acquiring the lock taking real wakeups into account
12754 +        * as well. We are serialized via pi_lock against wakeups. See
12755 +        * try_to_wake_up().
12756 +        */
12757 +       raw_spin_lock(&self->pi_lock);
12758 +       self->saved_state = self->state;
12759 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12760 +       raw_spin_unlock(&self->pi_lock);
12761 +
12762 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
12763 +       BUG_ON(ret);
12764 +
12765 +       for (;;) {
12766 +               /* Try to acquire the lock again. */
12767 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
12768 +                       break;
12769 +
12770 +               top_waiter = rt_mutex_top_waiter(lock);
12771 +               lock_owner = rt_mutex_owner(lock);
12772 +
12773 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12774 +
12775 +               debug_rt_mutex_print_deadlock(&waiter);
12776 +
12777 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
12778 +                       if (mg_off)
12779 +                               migrate_enable();
12780 +                       schedule();
12781 +                       if (mg_off)
12782 +                               migrate_disable();
12783 +               }
12784 +
12785 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
12786 +
12787 +               raw_spin_lock(&self->pi_lock);
12788 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12789 +               raw_spin_unlock(&self->pi_lock);
12790 +       }
12791 +
12792 +       /*
12793 +        * Restore the task state to current->saved_state. We set it
12794 +        * to the original state above and the try_to_wake_up() code
12795 +        * has possibly updated it when a real (non-rtmutex) wakeup
12796 +        * happened while we were blocked. Clear saved_state so
12797 +        * try_to_wake_up() does not get confused.
12798 +        */
12799 +       raw_spin_lock(&self->pi_lock);
12800 +       __set_current_state_no_track(self->saved_state);
12801 +       self->saved_state = TASK_RUNNING;
12802 +       raw_spin_unlock(&self->pi_lock);
12803 +
12804 +       /*
12805 +        * try_to_take_rt_mutex() sets the waiter bit
12806 +        * unconditionally. We might have to fix that up:
12807 +        */
12808 +       fixup_rt_mutex_waiters(lock);
12809 +
12810 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
12811 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
12812 +
12813 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12814 +
12815 +       debug_rt_mutex_free_waiter(&waiter);
12816 +}
12817 +
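/*
 * Illustrative sketch, not part of the patch: why ->saved_state matters to
 * callers.  A classic wait loop announces its sleep state *before* taking a
 * spinlock, which under PREEMPT_RT_FULL is an rtmutex and may itself block.
 * rt_spin_lock_slowlock() parks the caller's state in ->saved_state, so a
 * real (non-rtmutex) wakeup arriving while we block on the lock is not lost.
 * 'demo_wq_lock', 'demo_condition' and demo_wait_for_condition() are names
 * made up for this example.
 */
static DEFINE_SPINLOCK(demo_wq_lock);
static bool demo_condition;

static void demo_wait_for_condition(void)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);	/* 1) intend to sleep */
		spin_lock(&demo_wq_lock);		/* 2) may block on RT; state is saved */
		if (demo_condition) {
			spin_unlock(&demo_wq_lock);
			break;
		}
		spin_unlock(&demo_wq_lock);
		schedule();	/* 3) still TASK_INTERRUPTIBLE, or TASK_RUNNING
				 *    if a real wakeup slipped in during step 2 */
	}
	__set_current_state(TASK_RUNNING);
}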
12818 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
12819 +                                   struct wake_q_head *wake_sleeper_q,
12820 +                                   struct rt_mutex *lock);
12821 +/*
12822 + * Slow path to release a rt_mutex spin_lock style
12823 + */
12824 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
12825 +{
12826 +       unsigned long flags;
12827 +       WAKE_Q(wake_q);
12828 +       WAKE_Q(wake_sleeper_q);
12829 +
12830 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
12831 +
12832 +       debug_rt_mutex_unlock(lock);
12833 +
12834 +       rt_mutex_deadlock_account_unlock(current);
12835 +
12836 +       if (!rt_mutex_has_waiters(lock)) {
12837 +               lock->owner = NULL;
12838 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12839 +               return 0;
12840 +       }
12841 +
12842 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12843 +
12844 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12845 +       wake_up_q(&wake_q);
12846 +       wake_up_q_sleeper(&wake_sleeper_q);
12847 +
12848 +       /* Undo pi boosting when necessary */
12849 +       rt_mutex_adjust_prio(current);
12850 +       return 0;
12851 +}
12852 +
12853 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
12854 +{
12855 +       unsigned long flags;
12856 +       WAKE_Q(wake_q);
12857 +       WAKE_Q(wake_sleeper_q);
12858 +
12859 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
12860 +
12861 +       debug_rt_mutex_unlock(lock);
12862 +
12863 +       rt_mutex_deadlock_account_unlock(current);
12864 +
12865 +       if (!rt_mutex_has_waiters(lock)) {
12866 +               lock->owner = NULL;
12867 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12868 +               return 0;
12869 +       }
12870 +
12871 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12872 +
12873 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12874 +       wake_up_q(&wake_q);
12875 +       wake_up_q_sleeper(&wake_sleeper_q);
12876 +       return 1;
12877 +}
12878 +
12879 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
12880 +{
12881 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
12882 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12883 +}
12884 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
12885 +
12886 +void __lockfunc rt_spin_lock(spinlock_t *lock)
12887 +{
12888 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12889 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12890 +}
12891 +EXPORT_SYMBOL(rt_spin_lock);
12892 +
12893 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
12894 +{
12895 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
12896 +}
12897 +EXPORT_SYMBOL(__rt_spin_lock);
12898 +
12899 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
12900 +{
12901 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
12902 +}
12903 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
12904 +
12905 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12906 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
12907 +{
12908 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
12909 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12910 +}
12911 +EXPORT_SYMBOL(rt_spin_lock_nested);
12912 +#endif
12913 +
12914 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
12915 +{
12916 +       /* NOTE: we always pass in '1' for nested, for simplicity */
12917 +       spin_release(&lock->dep_map, 1, _RET_IP_);
12918 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12919 +}
12920 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
12921 +
12922 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
12923 +{
12924 +       /* NOTE: we always pass in '1' for nested, for simplicity */
12925 +       spin_release(&lock->dep_map, 1, _RET_IP_);
12926 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12927 +       migrate_enable();
12928 +}
12929 +EXPORT_SYMBOL(rt_spin_unlock);
12930 +
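/*
 * Illustrative sketch, not part of the patch: what the migrate_disable()/
 * migrate_enable() pairing in rt_spin_lock()/rt_spin_unlock() buys a caller.
 * With this patch a spin_lock() on a spinlock_t ends up in rt_spin_lock(),
 * so the critical section below is preemptible but cannot migrate, keeping
 * the this_cpu access on one CPU for the whole section.  'demo_lock' and
 * 'demo_hits' are made-up names.
 */
static DEFINE_SPINLOCK(demo_lock);
static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_bump_local_hit_count(void)
{
	spin_lock(&demo_lock);		/* rt_spin_lock(): migration disabled  */
	this_cpu_inc(demo_hits);	/* stays on this CPU for the section   */
	spin_unlock(&demo_lock);	/* rt_spin_unlock(): migration enabled */
}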
12931 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
12932 +{
12933 +       int ret;
12934 +
12935 +       /* NOTE: we always pass in '1' for nested, for simplicity */
12936 +       spin_release(&lock->dep_map, 1, _RET_IP_);
12937 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
12938 +       migrate_enable();
12939 +       return ret;
12940 +}
12941 +
12942 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
12943 +{
12944 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
12945 +}
12946 +EXPORT_SYMBOL(__rt_spin_unlock);
12947 +
12948 +/*
12949 + * Wait for the lock to get unlocked: instead of polling for an unlock
12950 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
12951 + * schedule if there's contention:
12952 + */
12953 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
12954 +{
12955 +       spin_lock(lock);
12956 +       spin_unlock(lock);
12957 +}
12958 +EXPORT_SYMBOL(rt_spin_unlock_wait);
12959 +
12960 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
12961 +{
12962 +       int ret;
12963 +
12964 +       ret = rt_mutex_trylock(&lock->lock);
12965 +       if (ret)
12966 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12967 +       return ret;
12968 +}
12969 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
12970 +
12971 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
12972 +{
12973 +       int ret;
12974 +
12975 +       migrate_disable();
12976 +       ret = rt_mutex_trylock(&lock->lock);
12977 +       if (ret)
12978 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12979 +       else
12980 +               migrate_enable();
12981 +       return ret;
12982 +}
12983 +EXPORT_SYMBOL(rt_spin_trylock);
12984 +
12985 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
12986 +{
12987 +       int ret;
12988 +
12989 +       local_bh_disable();
12990 +       ret = rt_mutex_trylock(&lock->lock);
12991 +       if (ret) {
12992 +               migrate_disable();
12993 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12994 +       } else
12995 +               local_bh_enable();
12996 +       return ret;
12997 +}
12998 +EXPORT_SYMBOL(rt_spin_trylock_bh);
12999 +
13000 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
13001 +{
13002 +       int ret;
13003 +
13004 +       *flags = 0;
13005 +       ret = rt_mutex_trylock(&lock->lock);
13006 +       if (ret) {
13007 +               migrate_disable();
13008 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13009 +       }
13010 +       return ret;
13011 +}
13012 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
13013 +
13014 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
13015 +{
13016 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
13017 +       if (atomic_add_unless(atomic, -1, 1))
13018 +               return 0;
13019 +       rt_spin_lock(lock);
13020 +       if (atomic_dec_and_test(atomic))
13021 +               return 1;
13022 +       rt_spin_unlock(lock);
13023 +       return 0;
13024 +}
13025 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
13026 +
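/*
 * Minimal usage sketch, not part of the patch: the classic
 * atomic_dec_and_lock() pattern, here with the sleeping spinlock variant.
 * The list lock is only taken when the reference count is about to hit
 * zero, so the final put and the list removal happen atomically.
 * 'struct demo_obj', 'demo_list' and 'demo_list_lock' are assumptions made
 * up for this example.
 */
static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_list_lock);

struct demo_obj {
	atomic_t		refcount;
	struct list_head	node;
};

static void demo_obj_put(struct demo_obj *obj)
{
	if (atomic_dec_and_spin_lock(&obj->refcount, &demo_list_lock)) {
		/* count dropped to zero; demo_list_lock is held here */
		list_del(&obj->node);
		spin_unlock(&demo_list_lock);
		kfree(obj);
	}
	/* otherwise the count was only decremented and the lock never taken */
}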
13027 +void
13028 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
13029 +{
13030 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13031 +       /*
13032 +        * Make sure we are not reinitializing a held lock:
13033 +        */
13034 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
13035 +       lockdep_init_map(&lock->dep_map, name, key, 0);
13036 +#endif
13037 +}
13038 +EXPORT_SYMBOL(__rt_spin_lock_init);
13039 +
13040 +#endif /* PREEMPT_RT_FULL */
13041 +
13042 +#ifdef CONFIG_PREEMPT_RT_FULL
13043 +static inline int __sched
13044 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13045 +{
13046 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
13047 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
13048 +
13049 +       if (!hold_ctx)
13050 +               return 0;
13051 +
13052 +       if (unlikely(ctx == hold_ctx))
13053 +               return -EALREADY;
13054 +
13055 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
13056 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
13057 +#ifdef CONFIG_DEBUG_MUTEXES
13058 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
13059 +               ctx->contending_lock = ww;
13060 +#endif
13061 +               return -EDEADLK;
13062 +       }
13063 +
13064 +       return 0;
13065 +}
13066 +#else
13067 +static inline int __sched
13068 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13069 +{
13070 +       BUG();
13071 +       return 0;
13072 +}
13073 +
13074 +#endif
13075 +
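/*
 * Illustrative sketch, not part of the patch: the caller-side backoff that
 * the stamp check above enforces (wait-die: the younger context receives
 * -EDEADLK and retreats, the older context keeps waiting).  'demo_ww_class',
 * 'a' and 'b' are made-up names; both mutexes are assumed to have been set
 * up with ww_mutex_init(..., &demo_ww_class).
 */
static DEFINE_WW_CLASS(demo_ww_class);
static struct ww_mutex a, b;

static void demo_lock_both(struct ww_acquire_ctx *ctx)
{
	int ret;

	for (;;) {
		ret = ww_mutex_lock(&a, ctx);	/* nothing held yet: no -EDEADLK */
		WARN_ON(ret);
		if (!ww_mutex_lock(&b, ctx))
			return;			/* got both locks                */
		/* -EDEADLK: we are the younger context, back off completely */
		ww_mutex_unlock(&a);
		ww_mutex_lock_slow(&b, ctx);	/* sleep until the winner drops b */
		ww_mutex_unlock(&b);		/* keep the sketch simple; the
						 * canonical pattern keeps b and
						 * handles -EALREADY on retry    */
	}
}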
13076 +static inline int
13077 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13078 +                    struct rt_mutex_waiter *waiter)
13079 +{
13080 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
13081 +}
13082 +
13083  /*
13084   * Task blocks on lock.
13085   *
13086 @@ -971,6 +1440,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13087                 return -EDEADLK;
13088  
13089         raw_spin_lock(&task->pi_lock);
13090 +
13091 +       /*
13092 +        * In the case of futex requeue PI, this will be a proxy
13093 +        * lock. The task will wake unaware that it is enqueued on
13094 +        * this lock. Avoid blocking on two locks and corrupting
13095 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
13096 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
13097 +        * before requeue (due to a signal or timeout). Do not enqueue
13098 +        * the task if PI_WAKEUP_INPROGRESS is set.
13099 +        */
13100 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
13101 +               raw_spin_unlock(&task->pi_lock);
13102 +               return -EAGAIN;
13103 +       }
13104 +
13105 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
13106 +
13107         __rt_mutex_adjust_prio(task);
13108         waiter->task = task;
13109         waiter->lock = lock;
13110 @@ -994,7 +1480,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13111                 rt_mutex_enqueue_pi(owner, waiter);
13112  
13113                 __rt_mutex_adjust_prio(owner);
13114 -               if (owner->pi_blocked_on)
13115 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
13116                         chain_walk = 1;
13117         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
13118                 chain_walk = 1;
13119 @@ -1036,6 +1522,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13120   * Called with lock->wait_lock held and interrupts disabled.
13121   */
13122  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13123 +                                   struct wake_q_head *wake_sleeper_q,
13124                                     struct rt_mutex *lock)
13125  {
13126         struct rt_mutex_waiter *waiter;
13127 @@ -1064,7 +1551,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13128  
13129         raw_spin_unlock(&current->pi_lock);
13130  
13131 -       wake_q_add(wake_q, waiter->task);
13132 +       if (waiter->savestate)
13133 +               wake_q_add(wake_sleeper_q, waiter->task);
13134 +       else
13135 +               wake_q_add(wake_q, waiter->task);
13136  }
13137  
13138  /*
13139 @@ -1078,7 +1568,7 @@ static void remove_waiter(struct rt_mutex *lock,
13140  {
13141         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
13142         struct task_struct *owner = rt_mutex_owner(lock);
13143 -       struct rt_mutex *next_lock;
13144 +       struct rt_mutex *next_lock = NULL;
13145  
13146         raw_spin_lock(&current->pi_lock);
13147         rt_mutex_dequeue(lock, waiter);
13148 @@ -1102,7 +1592,8 @@ static void remove_waiter(struct rt_mutex *lock,
13149         __rt_mutex_adjust_prio(owner);
13150  
13151         /* Store the lock on which owner is blocked or NULL */
13152 -       next_lock = task_blocked_on_lock(owner);
13153 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
13154 +               next_lock = task_blocked_on_lock(owner);
13155  
13156         raw_spin_unlock(&owner->pi_lock);
13157  
13158 @@ -1138,17 +1629,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
13159         raw_spin_lock_irqsave(&task->pi_lock, flags);
13160  
13161         waiter = task->pi_blocked_on;
13162 -       if (!waiter || (waiter->prio == task->prio &&
13163 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
13164                         !dl_prio(task->prio))) {
13165                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13166                 return;
13167         }
13168         next_lock = waiter->lock;
13169 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13170  
13171         /* gets dropped in rt_mutex_adjust_prio_chain()! */
13172         get_task_struct(task);
13173  
13174 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13175         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
13176                                    next_lock, NULL, task);
13177  }
13178 @@ -1166,7 +1657,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
13179  static int __sched
13180  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
13181                     struct hrtimer_sleeper *timeout,
13182 -                   struct rt_mutex_waiter *waiter)
13183 +                   struct rt_mutex_waiter *waiter,
13184 +                   struct ww_acquire_ctx *ww_ctx)
13185  {
13186         int ret = 0;
13187  
13188 @@ -1189,6 +1681,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
13189                                 break;
13190                 }
13191  
13192 +               if (ww_ctx && ww_ctx->acquired > 0) {
13193 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
13194 +                       if (ret)
13195 +                               break;
13196 +               }
13197 +
13198                 raw_spin_unlock_irq(&lock->wait_lock);
13199  
13200                 debug_rt_mutex_print_deadlock(waiter);
13201 @@ -1223,21 +1721,96 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
13202         }
13203  }
13204  
13205 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
13206 +                                                  struct ww_acquire_ctx *ww_ctx)
13207 +{
13208 +#ifdef CONFIG_DEBUG_MUTEXES
13209 +       /*
13210 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
13211 +        * but released with a normal mutex_unlock in this call.
13212 +        *
13213 +        * This should never happen, always use ww_mutex_unlock.
13214 +        */
13215 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
13216 +
13217 +       /*
13218 +        * Not quite done after calling ww_acquire_done() ?
13219 +        */
13220 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
13221 +
13222 +       if (ww_ctx->contending_lock) {
13223 +               /*
13224 +                * After -EDEADLK you tried to
13225 +                * acquire a different ww_mutex? Bad!
13226 +                */
13227 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
13228 +
13229 +               /*
13230 +                * You called ww_mutex_lock after receiving -EDEADLK,
13231 +                * but 'forgot' to unlock everything else first?
13232 +                */
13233 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
13234 +               ww_ctx->contending_lock = NULL;
13235 +       }
13236 +
13237 +       /*
13238 +        * Naughty, using a different class will lead to undefined behavior!
13239 +        */
13240 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
13241 +#endif
13242 +       ww_ctx->acquired++;
13243 +}
13244 +
13245 +#ifdef CONFIG_PREEMPT_RT_FULL
13246 +static void ww_mutex_account_lock(struct rt_mutex *lock,
13247 +                                 struct ww_acquire_ctx *ww_ctx)
13248 +{
13249 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
13250 +       struct rt_mutex_waiter *waiter, *n;
13251 +
13252 +       /*
13253 +        * This branch gets optimized out for the common case,
13254 +        * and is only important for ww_mutex_lock.
13255 +        */
13256 +       ww_mutex_lock_acquired(ww, ww_ctx);
13257 +       ww->ctx = ww_ctx;
13258 +
13259 +       /*
13260 +        * Give any possible sleeping processes the chance to wake up,
13261 +        * so they can recheck if they have to back off.
13262 +        */
13263 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
13264 +                                            tree_entry) {
13265 +               /* XXX debug rt mutex waiter wakeup */
13266 +
13267 +               BUG_ON(waiter->lock != lock);
13268 +               rt_mutex_wake_waiter(waiter);
13269 +       }
13270 +}
13271 +
13272 +#else
13273 +
13274 +static void ww_mutex_account_lock(struct rt_mutex *lock,
13275 +                                 struct ww_acquire_ctx *ww_ctx)
13276 +{
13277 +       BUG();
13278 +}
13279 +#endif
13280 +
13281  /*
13282   * Slow path lock function:
13283   */
13284  static int __sched
13285  rt_mutex_slowlock(struct rt_mutex *lock, int state,
13286                   struct hrtimer_sleeper *timeout,
13287 -                 enum rtmutex_chainwalk chwalk)
13288 +                 enum rtmutex_chainwalk chwalk,
13289 +                 struct ww_acquire_ctx *ww_ctx)
13290  {
13291         struct rt_mutex_waiter waiter;
13292         unsigned long flags;
13293         int ret = 0;
13294  
13295 -       debug_rt_mutex_init_waiter(&waiter);
13296 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
13297 -       RB_CLEAR_NODE(&waiter.tree_entry);
13298 +       rt_mutex_init_waiter(&waiter, false);
13299  
13300         /*
13301          * Technically we could use raw_spin_[un]lock_irq() here, but this can
13302 @@ -1251,6 +1824,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
13303  
13304         /* Try to acquire the lock again: */
13305         if (try_to_take_rt_mutex(lock, current, NULL)) {
13306 +               if (ww_ctx)
13307 +                       ww_mutex_account_lock(lock, ww_ctx);
13308                 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13309                 return 0;
13310         }
13311 @@ -1265,13 +1840,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
13312  
13313         if (likely(!ret))
13314                 /* sleep on the mutex */
13315 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
13316 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
13317 +                                         ww_ctx);
13318 +       else if (ww_ctx) {
13319 +               /* ww_mutex received EDEADLK, let it become EALREADY */
13320 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
13321 +               BUG_ON(!ret);
13322 +       }
13323  
13324         if (unlikely(ret)) {
13325                 __set_current_state(TASK_RUNNING);
13326                 if (rt_mutex_has_waiters(lock))
13327                         remove_waiter(lock, &waiter);
13328 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13329 +               /* ww_mutexes want to report EDEADLK/EALREADY, let them */
13330 +               if (!ww_ctx)
13331 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13332 +       } else if (ww_ctx) {
13333 +               ww_mutex_account_lock(lock, ww_ctx);
13334         }
13335  
13336         /*
13337 @@ -1331,7 +1916,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
13338   * Return whether the current task needs to undo a potential priority boosting.
13339   */
13340  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13341 -                                       struct wake_q_head *wake_q)
13342 +                                       struct wake_q_head *wake_q,
13343 +                                       struct wake_q_head *wake_sleeper_q)
13344  {
13345         unsigned long flags;
13346  
13347 @@ -1387,7 +1973,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13348          *
13349          * Queue the next waiter for wakeup once we release the wait_lock.
13350          */
13351 -       mark_wakeup_next_waiter(wake_q, lock);
13352 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
13353  
13354         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13355  
13356 @@ -1403,31 +1989,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13357   */
13358  static inline int
13359  rt_mutex_fastlock(struct rt_mutex *lock, int state,
13360 +                 struct ww_acquire_ctx *ww_ctx,
13361                   int (*slowfn)(struct rt_mutex *lock, int state,
13362                                 struct hrtimer_sleeper *timeout,
13363 -                               enum rtmutex_chainwalk chwalk))
13364 +                               enum rtmutex_chainwalk chwalk,
13365 +                               struct ww_acquire_ctx *ww_ctx))
13366  {
13367         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13368                 rt_mutex_deadlock_account_lock(lock, current);
13369                 return 0;
13370         } else
13371 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
13372 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
13373 +                             ww_ctx);
13374  }
13375  
13376  static inline int
13377  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
13378                         struct hrtimer_sleeper *timeout,
13379                         enum rtmutex_chainwalk chwalk,
13380 +                       struct ww_acquire_ctx *ww_ctx,
13381                         int (*slowfn)(struct rt_mutex *lock, int state,
13382                                       struct hrtimer_sleeper *timeout,
13383 -                                     enum rtmutex_chainwalk chwalk))
13384 +                                     enum rtmutex_chainwalk chwalk,
13385 +                                     struct ww_acquire_ctx *ww_ctx))
13386  {
13387         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
13388             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13389                 rt_mutex_deadlock_account_lock(lock, current);
13390                 return 0;
13391         } else
13392 -               return slowfn(lock, state, timeout, chwalk);
13393 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
13394  }
13395  
13396  static inline int
13397 @@ -1444,17 +2035,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
13398  static inline void
13399  rt_mutex_fastunlock(struct rt_mutex *lock,
13400                     bool (*slowfn)(struct rt_mutex *lock,
13401 -                                  struct wake_q_head *wqh))
13402 +                                  struct wake_q_head *wqh,
13403 +                                  struct wake_q_head *wq_sleeper))
13404  {
13405         WAKE_Q(wake_q);
13406 +       WAKE_Q(wake_sleeper_q);
13407  
13408         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13409                 rt_mutex_deadlock_account_unlock(current);
13410  
13411         } else {
13412 -               bool deboost = slowfn(lock, &wake_q);
13413 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
13414  
13415                 wake_up_q(&wake_q);
13416 +               wake_up_q_sleeper(&wake_sleeper_q);
13417  
13418                 /* Undo pi boosting if necessary: */
13419                 if (deboost)
13420 @@ -1471,7 +2065,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
13421  {
13422         might_sleep();
13423  
13424 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
13425 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
13426  }
13427  EXPORT_SYMBOL_GPL(rt_mutex_lock);
13428  
13429 @@ -1488,7 +2082,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
13430  {
13431         might_sleep();
13432  
13433 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
13434 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
13435  }
13436  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
13437  
13438 @@ -1501,11 +2095,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
13439         might_sleep();
13440  
13441         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13442 -                                      RT_MUTEX_FULL_CHAINWALK,
13443 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
13444                                        rt_mutex_slowlock);
13445  }
13446  
13447  /**
13448 + * rt_mutex_lock_killable - lock a rt_mutex killable
13449 + *
13450 + * @lock:              the rt_mutex to be locked
13452 + *
13453 + * Returns:
13454 + *  0          on success
13455 + * -EINTR      when interrupted by a signal
13456 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
13457 + */
13458 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
13459 +{
13460 +       might_sleep();
13461 +
13462 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
13463 +}
13464 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
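/*
 * Minimal usage sketch, not part of the patch: a lock attempt that a fatal
 * signal (SIGKILL) can abort, so a dying task does not pile up behind a
 * stuck owner.  'demo_mutex' and demo_do_work() are made-up names.
 */
static DEFINE_RT_MUTEX(demo_mutex);

static int demo_do_work(void)
{
	if (rt_mutex_lock_killable(&demo_mutex))
		return -EINTR;		/* killed while waiting; lock not held */

	/* ... work under the lock ... */

	rt_mutex_unlock(&demo_mutex);
	return 0;
}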
13465 +
13466 +/**
13467   * rt_mutex_timed_lock - lock a rt_mutex interruptible
13468   *                     the timeout structure is provided
13469   *                     by the caller
13470 @@ -1525,6 +2138,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
13471  
13472         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13473                                        RT_MUTEX_MIN_CHAINWALK,
13474 +                                      NULL,
13475                                        rt_mutex_slowlock);
13476  }
13477  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13478 @@ -1542,7 +2156,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13479   */
13480  int __sched rt_mutex_trylock(struct rt_mutex *lock)
13481  {
13482 +#ifdef CONFIG_PREEMPT_RT_FULL
13483 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
13484 +#else
13485         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
13486 +#endif
13487                 return 0;
13488  
13489         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
13490 @@ -1568,13 +2186,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
13491   * required or not.
13492   */
13493  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
13494 -                                  struct wake_q_head *wqh)
13495 +                                  struct wake_q_head *wqh,
13496 +                                  struct wake_q_head *wq_sleeper)
13497  {
13498         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13499                 rt_mutex_deadlock_account_unlock(current);
13500                 return false;
13501         }
13502 -       return rt_mutex_slowunlock(lock, wqh);
13503 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
13504  }
13505  
13506  /**
13507 @@ -1607,13 +2226,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
13508  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
13509  {
13510         lock->owner = NULL;
13511 -       raw_spin_lock_init(&lock->wait_lock);
13512         lock->waiters = RB_ROOT;
13513         lock->waiters_leftmost = NULL;
13514  
13515         debug_rt_mutex_init(lock, name);
13516  }
13517 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
13518 +EXPORT_SYMBOL(__rt_mutex_init);
13519  
13520  /**
13521   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
13522 @@ -1628,7 +2246,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
13523  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13524                                 struct task_struct *proxy_owner)
13525  {
13526 -       __rt_mutex_init(lock, NULL);
13527 +       rt_mutex_init(lock);
13528         debug_rt_mutex_proxy_lock(lock, proxy_owner);
13529         rt_mutex_set_owner(lock, proxy_owner);
13530         rt_mutex_deadlock_account_lock(lock, proxy_owner);
13531 @@ -1676,6 +2294,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
13532                 return 1;
13533         }
13534  
13535 +#ifdef CONFIG_PREEMPT_RT_FULL
13536 +       /*
13537 +        * In PREEMPT_RT there's an added race.
13538 +        * If the task that we are about to requeue times out,
13539 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
13540 +        * to skip this task. But right after the task sets
13541 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
13542 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
13543 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
13544 +        * lock that it blocks on. We *must not* place this task
13545 +        * on this proxy lock in that case.
13546 +        *
13547 +        * To prevent this race, we first take the task's pi_lock
13548 +        * and check if it has updated its pi_blocked_on. If it has,
13549 +        * we assume that it woke up and we return -EAGAIN.
13550 +        * Otherwise, we set the task's pi_blocked_on to
13551 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
13552 +        * it will know that we are in the process of requeuing it.
13553 +        */
13554 +       raw_spin_lock(&task->pi_lock);
13555 +       if (task->pi_blocked_on) {
13556 +               raw_spin_unlock(&task->pi_lock);
13557 +               raw_spin_unlock_irq(&lock->wait_lock);
13558 +               return -EAGAIN;
13559 +       }
13560 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
13561 +       raw_spin_unlock(&task->pi_lock);
13562 +#endif
13563 +
13564         /* We enforce deadlock detection for futexes */
13565         ret = task_blocks_on_rt_mutex(lock, waiter, task,
13566                                       RT_MUTEX_FULL_CHAINWALK);
13567 @@ -1690,7 +2337,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
13568                 ret = 0;
13569         }
13570  
13571 -       if (unlikely(ret))
13572 +       if (ret && rt_mutex_has_waiters(lock))
13573                 remove_waiter(lock, waiter);
13574  
13575         raw_spin_unlock_irq(&lock->wait_lock);
13576 @@ -1746,7 +2393,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
13577         set_current_state(TASK_INTERRUPTIBLE);
13578  
13579         /* sleep on the mutex */
13580 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
13581 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
13582  
13583         if (unlikely(ret))
13584                 remove_waiter(lock, waiter);
13585 @@ -1761,3 +2408,89 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
13586  
13587         return ret;
13588  }
13589 +
13590 +static inline int
13591 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
13592 +{
13593 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
13594 +       unsigned tmp;
13595 +
13596 +       if (ctx->deadlock_inject_countdown-- == 0) {
13597 +               tmp = ctx->deadlock_inject_interval;
13598 +               if (tmp > UINT_MAX/4)
13599 +                       tmp = UINT_MAX;
13600 +               else
13601 +                       tmp = tmp*2 + tmp + tmp/2;
13602 +
13603 +               ctx->deadlock_inject_interval = tmp;
13604 +               ctx->deadlock_inject_countdown = tmp;
13605 +               ctx->contending_lock = lock;
13606 +
13607 +               ww_mutex_unlock(lock);
13608 +
13609 +               return -EDEADLK;
13610 +       }
13611 +#endif
13612 +
13613 +       return 0;
13614 +}
13615 +
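/*
 * Worked example of the interval growth above (illustrative only, not part
 * of the patch): each time the countdown fires, the next interval becomes
 * tmp*2 + tmp + tmp/2, i.e. roughly 3.5x the previous value.  With the
 * stock initial interval of 1 (as set up by ww_acquire_init() under
 * CONFIG_DEBUG_WW_MUTEX_SLOWPATH) that gives 1, 3, 10, 35, 122, ...
 * acquisitions between injected -EDEADLK results, so the fault injection
 * exercises every caller's backoff path early on but quickly becomes rare
 * for long-running contexts.
 */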
13616 +#ifdef CONFIG_PREEMPT_RT_FULL
13617 +int __sched
13618 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13619 +{
13620 +       int ret;
13621 +
13622 +       might_sleep();
13623 +
13624 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13625 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
13626 +       if (ret)
13627 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13628 +       else if (!ret && ww_ctx->acquired > 1)
13629 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
13630 +
13631 +       return ret;
13632 +}
13633 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
13634 +
13635 +int __sched
13636 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13637 +{
13638 +       int ret;
13639 +
13640 +       might_sleep();
13641 +
13642 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13643 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
13644 +       if (ret)
13645 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13646 +       else if (!ret && ww_ctx->acquired > 1)
13647 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
13648 +
13649 +       return ret;
13650 +}
13651 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
13652 +
13653 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
13654 +{
13655 +       int nest = !!lock->ctx;
13656 +
13657 +       /*
13658 +        * The unlocking fastpath is the 0->1 transition from 'locked'
13659 +        * into 'unlocked' state:
13660 +        */
13661 +       if (nest) {
13662 +#ifdef CONFIG_DEBUG_MUTEXES
13663 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
13664 +#endif
13665 +               if (lock->ctx->acquired > 0)
13666 +                       lock->ctx->acquired--;
13667 +               lock->ctx = NULL;
13668 +       }
13669 +
13670 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
13671 +       rt_mutex_unlock(&lock->base.lock);
13672 +}
13673 +EXPORT_SYMBOL(ww_mutex_unlock);
13674 +#endif
13675 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
13676 index e317e1cbb3eb..f457c7574920 100644
13677 --- a/kernel/locking/rtmutex_common.h
13678 +++ b/kernel/locking/rtmutex_common.h
13679 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
13680         struct rb_node          pi_tree_entry;
13681         struct task_struct      *task;
13682         struct rt_mutex         *lock;
13683 +       bool                    savestate;
13684  #ifdef CONFIG_DEBUG_RT_MUTEXES
13685         unsigned long           ip;
13686         struct pid              *deadlock_task_pid;
13687 @@ -98,6 +99,9 @@ enum rtmutex_chainwalk {
13688  /*
13689   * PI-futex support (proxy locking functions, etc.):
13690   */
13691 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
13692 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
13693 +
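/*
 * Illustrative note, not part of the patch: 1 and 2 can never be valid
 * kernel pointers, so these sentinels can be stored in ->pi_blocked_on to
 * flag "wakeup in progress" / "requeue in progress" without being mistaken
 * for a real rt_mutex_waiter; rt_mutex_real_waiter() filters them out
 * before any dereference.
 */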
13694  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
13695  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13696                                        struct task_struct *proxy_owner);
13697 @@ -111,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
13698                                       struct rt_mutex_waiter *waiter);
13699  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
13700  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
13701 -                                 struct wake_q_head *wqh);
13702 +                                 struct wake_q_head *wqh,
13703 +                                 struct wake_q_head *wq_sleeper);
13704  extern void rt_mutex_adjust_prio(struct task_struct *task);
13705  
13706  #ifdef CONFIG_DEBUG_RT_MUTEXES
13707 @@ -120,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
13708  # include "rtmutex.h"
13709  #endif
13710  
13711 +static inline void
13712 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
13713 +{
13714 +       debug_rt_mutex_init_waiter(waiter);
13715 +       waiter->task = NULL;
13716 +       waiter->savestate = savestate;
13717 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
13718 +       RB_CLEAR_NODE(&waiter->tree_entry);
13719 +}
13720 +
13721  #endif
13722 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
13723 index db3ccb1dd614..909779647bd1 100644
13724 --- a/kernel/locking/spinlock.c
13725 +++ b/kernel/locking/spinlock.c
13726 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
13727   *         __[spin|read|write]_lock_bh()
13728   */
13729  BUILD_LOCK_OPS(spin, raw_spinlock);
13730 +
13731 +#ifndef CONFIG_PREEMPT_RT_FULL
13732  BUILD_LOCK_OPS(read, rwlock);
13733  BUILD_LOCK_OPS(write, rwlock);
13734 +#endif
13735  
13736  #endif
13737  
13738 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
13739  EXPORT_SYMBOL(_raw_spin_unlock_bh);
13740  #endif
13741  
13742 +#ifndef CONFIG_PREEMPT_RT_FULL
13743 +
13744  #ifndef CONFIG_INLINE_READ_TRYLOCK
13745  int __lockfunc _raw_read_trylock(rwlock_t *lock)
13746  {
13747 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
13748  EXPORT_SYMBOL(_raw_write_unlock_bh);
13749  #endif
13750  
13751 +#endif /* !PREEMPT_RT_FULL */
13752 +
13753  #ifdef CONFIG_DEBUG_LOCK_ALLOC
13754  
13755  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
13756 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
13757 index 0374a596cffa..94970338d518 100644
13758 --- a/kernel/locking/spinlock_debug.c
13759 +++ b/kernel/locking/spinlock_debug.c
13760 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
13761  
13762  EXPORT_SYMBOL(__raw_spin_lock_init);
13763  
13764 +#ifndef CONFIG_PREEMPT_RT_FULL
13765  void __rwlock_init(rwlock_t *lock, const char *name,
13766                    struct lock_class_key *key)
13767  {
13768 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
13769  }
13770  
13771  EXPORT_SYMBOL(__rwlock_init);
13772 +#endif
13773  
13774  static void spin_dump(raw_spinlock_t *lock, const char *msg)
13775  {
13776 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
13777         arch_spin_unlock(&lock->raw_lock);
13778  }
13779  
13780 +#ifndef CONFIG_PREEMPT_RT_FULL
13781  static void rwlock_bug(rwlock_t *lock, const char *msg)
13782  {
13783         if (!debug_locks_off())
13784 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
13785         debug_write_unlock(lock);
13786         arch_write_unlock(&lock->raw_lock);
13787  }
13788 +
13789 +#endif
13790 diff --git a/kernel/panic.c b/kernel/panic.c
13791 index e6480e20379e..7e9c1918a94e 100644
13792 --- a/kernel/panic.c
13793 +++ b/kernel/panic.c
13794 @@ -482,9 +482,11 @@ static u64 oops_id;
13795  
13796  static int init_oops_id(void)
13797  {
13798 +#ifndef CONFIG_PREEMPT_RT_FULL
13799         if (!oops_id)
13800                 get_random_bytes(&oops_id, sizeof(oops_id));
13801         else
13802 +#endif
13803                 oops_id++;
13804  
13805         return 0;
13806 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
13807 index b26dbc48c75b..968255f27a33 100644
13808 --- a/kernel/power/hibernate.c
13809 +++ b/kernel/power/hibernate.c
13810 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
13811  
13812         local_irq_disable();
13813  
13814 +       system_state = SYSTEM_SUSPEND;
13815 +
13816         error = syscore_suspend();
13817         if (error) {
13818                 printk(KERN_ERR "PM: Some system devices failed to power down, "
13819 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
13820         syscore_resume();
13821  
13822   Enable_irqs:
13823 +       system_state = SYSTEM_RUNNING;
13824         local_irq_enable();
13825  
13826   Enable_cpus:
13827 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
13828                 goto Enable_cpus;
13829  
13830         local_irq_disable();
13831 +       system_state = SYSTEM_SUSPEND;
13832  
13833         error = syscore_suspend();
13834         if (error)
13835 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
13836         syscore_resume();
13837  
13838   Enable_irqs:
13839 +       system_state = SYSTEM_RUNNING;
13840         local_irq_enable();
13841  
13842   Enable_cpus:
13843 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
13844                 goto Enable_cpus;
13845  
13846         local_irq_disable();
13847 +       system_state = SYSTEM_SUSPEND;
13848         syscore_suspend();
13849         if (pm_wakeup_pending()) {
13850                 error = -EAGAIN;
13851 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
13852  
13853   Power_up:
13854         syscore_resume();
13855 +       system_state = SYSTEM_RUNNING;
13856         local_irq_enable();
13857  
13858   Enable_cpus:
13859 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
13860         return error;
13861  }
13862  
13863 +#ifndef CONFIG_SUSPEND
13864 +bool pm_in_action;
13865 +#endif
13866 +
13867  /**
13868   * hibernate - Carry out system hibernation, including saving the image.
13869   */
13870 @@ -689,6 +700,8 @@ int hibernate(void)
13871                 return -EPERM;
13872         }
13873  
13874 +       pm_in_action = true;
13875 +
13876         lock_system_sleep();
13877         /* The snapshot device should not be opened while we're running */
13878         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
13879 @@ -766,6 +779,7 @@ int hibernate(void)
13880         atomic_inc(&snapshot_device_available);
13881   Unlock:
13882         unlock_system_sleep();
13883 +       pm_in_action = false;
13884         return error;
13885  }
13886  
13887 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
13888 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
13889 --- a/kernel/power/suspend.c
13890 +++ b/kernel/power/suspend.c
13891 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
13892         arch_suspend_disable_irqs();
13893         BUG_ON(!irqs_disabled());
13894  
13895 +       system_state = SYSTEM_SUSPEND;
13896 +
13897         error = syscore_suspend();
13898         if (!error) {
13899                 *wakeup = pm_wakeup_pending();
13900 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
13901                 syscore_resume();
13902         }
13903  
13904 +       system_state = SYSTEM_RUNNING;
13905 +
13906         arch_suspend_enable_irqs();
13907         BUG_ON(irqs_disabled());
13908  
13909 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
13910         return error;
13911  }
13912  
13913 +bool pm_in_action;
13914 +
13915  /**
13916   * pm_suspend - Externally visible function for suspending the system.
13917   * @state: System sleep state to enter.
13918 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
13919         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
13920                 return -EINVAL;
13921  
13922 +       pm_in_action = true;
13923 +
13924         error = enter_state(state);
13925         if (error) {
13926                 suspend_stats.fail++;
13927 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
13928         } else {
13929                 suspend_stats.success++;
13930         }
13931 +       pm_in_action = false;
13932         return error;
13933  }
13934  EXPORT_SYMBOL(pm_suspend);
13935 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
13936 index f7a55e9ff2f7..9277ee033271 100644
13937 --- a/kernel/printk/printk.c
13938 +++ b/kernel/printk/printk.c
13939 @@ -351,6 +351,65 @@ __packed __aligned(4)
13940   */
13941  DEFINE_RAW_SPINLOCK(logbuf_lock);
13942  
13943 +#ifdef CONFIG_EARLY_PRINTK
13944 +struct console *early_console;
13945 +
13946 +static void early_vprintk(const char *fmt, va_list ap)
13947 +{
13948 +       if (early_console) {
13949 +               char buf[512];
13950 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
13951 +
13952 +               early_console->write(early_console, buf, n);
13953 +       }
13954 +}
13955 +
13956 +asmlinkage void early_printk(const char *fmt, ...)
13957 +{
13958 +       va_list ap;
13959 +
13960 +       va_start(ap, fmt);
13961 +       early_vprintk(fmt, ap);
13962 +       va_end(ap);
13963 +}
13964 +
13965 +/*
13966 + * This is independent of any log levels - a global
13967 + * kill switch that turns off all of printk.
13968 + *
13969 + * Used by the NMI watchdog if early-printk is enabled.
13970 + */
13971 +static bool __read_mostly printk_killswitch;
13972 +
13973 +static int __init force_early_printk_setup(char *str)
13974 +{
13975 +       printk_killswitch = true;
13976 +       return 0;
13977 +}
13978 +early_param("force_early_printk", force_early_printk_setup);
13979 +
13980 +void printk_kill(void)
13981 +{
13982 +       printk_killswitch = true;
13983 +}
13984 +
13985 +#ifdef CONFIG_PRINTK
13986 +static int forced_early_printk(const char *fmt, va_list ap)
13987 +{
13988 +       if (!printk_killswitch)
13989 +               return 0;
13990 +       early_vprintk(fmt, ap);
13991 +       return 1;
13992 +}
13993 +#endif
13994 +
13995 +#else
13996 +static inline int forced_early_printk(const char *fmt, va_list ap)
13997 +{
13998 +       return 0;
13999 +}
14000 +#endif
14001 +
14002  #ifdef CONFIG_PRINTK
14003  DECLARE_WAIT_QUEUE_HEAD(log_wait);
14004  /* the next printk record to read by syslog(READ) or /proc/kmsg */
14005 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14006  {
14007         char *text;
14008         int len = 0;
14009 +       int attempts = 0;
14010  
14011         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
14012         if (!text)
14013 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14014                 u64 seq;
14015                 u32 idx;
14016                 enum log_flags prev;
14017 +               int num_msg;
14018 +try_again:
14019 +               attempts++;
14020 +               if (attempts > 10) {
14021 +                       len = -EBUSY;
14022 +                       goto out;
14023 +               }
14024 +               num_msg = 0;
14025  
14026                 /*
14027                  * Find first record that fits, including all following records,
14028 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14029                         prev = msg->flags;
14030                         idx = log_next(idx);
14031                         seq++;
14032 +                       num_msg++;
14033 +                       if (num_msg > 5) {
14034 +                               num_msg = 0;
14035 +                               raw_spin_unlock_irq(&logbuf_lock);
14036 +                               raw_spin_lock_irq(&logbuf_lock);
14037 +                               if (clear_seq < log_first_seq)
14038 +                                       goto try_again;
14039 +                       }
14040                 }
14041  
14042                 /* move first record forward until length fits into the buffer */
14043 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14044                         prev = msg->flags;
14045                         idx = log_next(idx);
14046                         seq++;
14047 +                       num_msg++;
14048 +                       if (num_msg > 5) {
14049 +                               num_msg = 0;
14050 +                               raw_spin_unlock_irq(&logbuf_lock);
14051 +                               raw_spin_lock_irq(&logbuf_lock);
14052 +                               if (clear_seq < log_first_seq)
14053 +                                       goto try_again;
14054 +                       }
14055                 }
14056  
14057                 /* last message fitting into this dump */
14058 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14059                 clear_seq = log_next_seq;
14060                 clear_idx = log_next_idx;
14061         }
14062 +out:
14063         raw_spin_unlock_irq(&logbuf_lock);
14064  
14065         kfree(text);
14066 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
14067         if (!console_drivers)
14068                 return;
14069  
14070 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
14071 +               if (in_irq() || in_nmi())
14072 +                       return;
14073 +       }
14074 +
14075 +       migrate_disable();
14076         for_each_console(con) {
14077                 if (exclusive_console && con != exclusive_console)
14078                         continue;
14079 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
14080                 else
14081                         con->write(con, text, len);
14082         }
14083 +       migrate_enable();
14084  }
14085  
14086  /*
14087 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
14088         /* cpu currently holding logbuf_lock in this function */
14089         static unsigned int logbuf_cpu = UINT_MAX;
14090  
14091 +       /*
14092 +        * Fall back to early_printk if a debugging subsystem has
14093 +        * killed printk output.
14094 +        */
14095 +       if (unlikely(forced_early_printk(fmt, args)))
14096 +               return 1;
14097 +
14098         if (level == LOGLEVEL_SCHED) {
14099                 level = LOGLEVEL_DEFAULT;
14100                 in_sched = true;
14101 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
14102  
14103         /* If called from the scheduler, we can not call up(). */
14104         if (!in_sched) {
14105 +               int may_trylock = 1;
14106 +
14107                 lockdep_off();
14108 +#ifdef CONFIG_PREEMPT_RT_FULL
14109 +               /*
14110 +                * we can't take a sleeping lock with IRQs or preemption disabled,
14111 +                * so we can't print in these contexts
14112 +                */
14113 +               if (!(preempt_count() == 0 && !irqs_disabled()))
14114 +                       may_trylock = 0;
14115 +#endif
14116                 /*
14117                  * Try to acquire and then immediately release the console
14118                  * semaphore.  The release will print out buffers and wake up
14119                  * /dev/kmsg and syslog() users.
14120                  */
14121 -               if (console_trylock())
14122 +               if (may_trylock && console_trylock())
14123                         console_unlock();
14124                 lockdep_on();
14125         }
14126 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
14127  
14128  #endif /* CONFIG_PRINTK */
14129  
14130 -#ifdef CONFIG_EARLY_PRINTK
14131 -struct console *early_console;
14132 -
14133 -asmlinkage __visible void early_printk(const char *fmt, ...)
14134 -{
14135 -       va_list ap;
14136 -       char buf[512];
14137 -       int n;
14138 -
14139 -       if (!early_console)
14140 -               return;
14141 -
14142 -       va_start(ap, fmt);
14143 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
14144 -       va_end(ap);
14145 -
14146 -       early_console->write(early_console, buf, n);
14147 -}
14148 -#endif
14149 -
14150  static int __add_preferred_console(char *name, int idx, char *options,
14151                                    char *brl_options)
14152  {
14153 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
14154                 goto out;
14155  
14156         len = cont_print_text(text, size);
14157 +#ifdef CONFIG_PREEMPT_RT_FULL
14158 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14159 +       call_console_drivers(cont.level, NULL, 0, text, len);
14160 +#else
14161         raw_spin_unlock(&logbuf_lock);
14162         stop_critical_timings();
14163         call_console_drivers(cont.level, NULL, 0, text, len);
14164         start_critical_timings();
14165         local_irq_restore(flags);
14166 +#endif
14167         return;
14168  out:
14169         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14170 @@ -2431,13 +2525,17 @@ void console_unlock(void)
14171                 console_idx = log_next(console_idx);
14172                 console_seq++;
14173                 console_prev = msg->flags;
14174 +#ifdef CONFIG_PREEMPT_RT_FULL
14175 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14176 +               call_console_drivers(level, ext_text, ext_len, text, len);
14177 +#else
14178                 raw_spin_unlock(&logbuf_lock);
14179  
14180                 stop_critical_timings();        /* don't trace print latency */
14181                 call_console_drivers(level, ext_text, ext_len, text, len);
14182                 start_critical_timings();
14183                 local_irq_restore(flags);
14184 -
14185 +#endif
14186                 if (do_cond_resched)
14187                         cond_resched();
14188         }
14189 @@ -2489,6 +2587,11 @@ void console_unblank(void)
14190  {
14191         struct console *c;
14192  
14193 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
14194 +               if (in_irq() || in_nmi())
14195 +                       return;
14196 +       }
14197 +
14198         /*
14199          * console_unblank can no longer be called in interrupt context unless
14200          * oops_in_progress is set to 1..
14201 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
14202 index 49ba7c1ade9d..44f44b47ec07 100644
14203 --- a/kernel/ptrace.c
14204 +++ b/kernel/ptrace.c
14205 @@ -166,7 +166,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
14206  
14207         spin_lock_irq(&task->sighand->siglock);
14208         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
14209 -               task->state = __TASK_TRACED;
14210 +               unsigned long flags;
14211 +
14212 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
14213 +               if (task->state & __TASK_TRACED)
14214 +                       task->state = __TASK_TRACED;
14215 +               else
14216 +                       task->saved_state = __TASK_TRACED;
14217 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14218                 ret = true;
14219         }
14220         spin_unlock_irq(&task->sighand->siglock);
14221 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
14222 index bf08fee53dc7..eeb8ce4ad7b6 100644
14223 --- a/kernel/rcu/rcutorture.c
14224 +++ b/kernel/rcu/rcutorture.c
14225 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
14226         .name           = "rcu"
14227  };
14228  
14229 +#ifndef CONFIG_PREEMPT_RT_FULL
14230  /*
14231   * Definitions for rcu_bh torture testing.
14232   */
14233 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
14234         .name           = "rcu_bh"
14235  };
14236  
14237 +#else
14238 +static struct rcu_torture_ops rcu_bh_ops = {
14239 +       .ttype          = INVALID_RCU_FLAVOR,
14240 +};
14241 +#endif
14242 +
14243  /*
14244   * Don't even think about trying any of these in real life!!!
14245   * The names includes "busted", and they really means it!
14246 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
14247 index 69a5611a7e7c..64d91f306eda 100644
14248 --- a/kernel/rcu/tree.c
14249 +++ b/kernel/rcu/tree.c
14250 @@ -55,6 +55,11 @@
14251  #include <linux/random.h>
14252  #include <linux/trace_events.h>
14253  #include <linux/suspend.h>
14254 +#include <linux/delay.h>
14255 +#include <linux/gfp.h>
14256 +#include <linux/oom.h>
14257 +#include <linux/smpboot.h>
14258 +#include "../time/tick-internal.h"
14259  
14260  #include "tree.h"
14261  #include "rcu.h"
14262 @@ -257,6 +262,19 @@ void rcu_sched_qs(void)
14263                            this_cpu_ptr(&rcu_sched_data), true);
14264  }
14265  
14266 +#ifdef CONFIG_PREEMPT_RT_FULL
14267 +static void rcu_preempt_qs(void);
14268 +
14269 +void rcu_bh_qs(void)
14270 +{
14271 +       unsigned long flags;
14272 +
14273 +       /* rcu_preempt_qs() must be called with irqs disabled. */
14274 +       local_irq_save(flags);
14275 +       rcu_preempt_qs();
14276 +       local_irq_restore(flags);
14277 +}
14278 +#else
14279  void rcu_bh_qs(void)
14280  {
14281         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
14282 @@ -266,6 +284,7 @@ void rcu_bh_qs(void)
14283                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
14284         }
14285  }
14286 +#endif
14287  
14288  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
14289  
14290 @@ -446,11 +465,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
14291  /*
14292   * Return the number of RCU BH batches started thus far for debug & stats.
14293   */
14294 +#ifndef CONFIG_PREEMPT_RT_FULL
14295  unsigned long rcu_batches_started_bh(void)
14296  {
14297         return rcu_bh_state.gpnum;
14298  }
14299  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
14300 +#endif
14301  
14302  /*
14303   * Return the number of RCU batches completed thus far for debug & stats.
14304 @@ -470,6 +491,7 @@ unsigned long rcu_batches_completed_sched(void)
14305  }
14306  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
14307  
14308 +#ifndef CONFIG_PREEMPT_RT_FULL
14309  /*
14310   * Return the number of RCU BH batches completed thus far for debug & stats.
14311   */
14312 @@ -478,6 +500,7 @@ unsigned long rcu_batches_completed_bh(void)
14313         return rcu_bh_state.completed;
14314  }
14315  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
14316 +#endif
14317  
14318  /*
14319   * Return the number of RCU expedited batches completed thus far for
14320 @@ -501,6 +524,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
14321  }
14322  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
14323  
14324 +#ifndef CONFIG_PREEMPT_RT_FULL
14325  /*
14326   * Force a quiescent state.
14327   */
14328 @@ -519,6 +543,13 @@ void rcu_bh_force_quiescent_state(void)
14329  }
14330  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
14331  
14332 +#else
14333 +void rcu_force_quiescent_state(void)
14334 +{
14335 +}
14336 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
14337 +#endif
14338 +
14339  /*
14340   * Force a quiescent state for RCU-sched.
14341   */
14342 @@ -569,9 +600,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
14343         case RCU_FLAVOR:
14344                 rsp = rcu_state_p;
14345                 break;
14346 +#ifndef CONFIG_PREEMPT_RT_FULL
14347         case RCU_BH_FLAVOR:
14348                 rsp = &rcu_bh_state;
14349                 break;
14350 +#endif
14351         case RCU_SCHED_FLAVOR:
14352                 rsp = &rcu_sched_state;
14353                 break;
14354 @@ -3013,18 +3046,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
14355  /*
14356   * Do RCU core processing for the current CPU.
14357   */
14358 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
14359 +static __latent_entropy void rcu_process_callbacks(void)
14360  {
14361         struct rcu_state *rsp;
14362  
14363         if (cpu_is_offline(smp_processor_id()))
14364                 return;
14365 -       trace_rcu_utilization(TPS("Start RCU core"));
14366         for_each_rcu_flavor(rsp)
14367                 __rcu_process_callbacks(rsp);
14368 -       trace_rcu_utilization(TPS("End RCU core"));
14369  }
14370  
14371 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14372  /*
14373   * Schedule RCU callback invocation.  If the specified type of RCU
14374   * does not support RCU priority boosting, just do a direct call,
14375 @@ -3036,19 +3068,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
14376  {
14377         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
14378                 return;
14379 -       if (likely(!rsp->boost)) {
14380 -               rcu_do_batch(rsp, rdp);
14381 -               return;
14382 -       }
14383 -       invoke_rcu_callbacks_kthread();
14384 +       rcu_do_batch(rsp, rdp);
14385  }
14386  
14387 +static void rcu_wake_cond(struct task_struct *t, int status)
14388 +{
14389 +       /*
14390 +        * If the thread is yielding, only wake it when this
14391 +        * is invoked from idle
14392 +        */
14393 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
14394 +               wake_up_process(t);
14395 +}
14396 +
14397 +/*
14398 + * Wake up this CPU's rcuc kthread to do RCU core processing.
14399 + */
14400  static void invoke_rcu_core(void)
14401  {
14402 -       if (cpu_online(smp_processor_id()))
14403 -               raise_softirq(RCU_SOFTIRQ);
14404 +       unsigned long flags;
14405 +       struct task_struct *t;
14406 +
14407 +       if (!cpu_online(smp_processor_id()))
14408 +               return;
14409 +       local_irq_save(flags);
14410 +       __this_cpu_write(rcu_cpu_has_work, 1);
14411 +       t = __this_cpu_read(rcu_cpu_kthread_task);
14412 +       if (t != NULL && current != t)
14413 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
14414 +       local_irq_restore(flags);
14415  }
14416  
14417 +static void rcu_cpu_kthread_park(unsigned int cpu)
14418 +{
14419 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14420 +}
14421 +
14422 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
14423 +{
14424 +       return __this_cpu_read(rcu_cpu_has_work);
14425 +}
14426 +
14427 +/*
14428 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
14429 + * RCU softirq used in flavors and configurations of RCU that do not
14430 + * support RCU priority boosting.
14431 + */
14432 +static void rcu_cpu_kthread(unsigned int cpu)
14433 +{
14434 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14435 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14436 +       int spincnt;
14437 +
14438 +       for (spincnt = 0; spincnt < 10; spincnt++) {
14439 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14440 +               local_bh_disable();
14441 +               *statusp = RCU_KTHREAD_RUNNING;
14442 +               this_cpu_inc(rcu_cpu_kthread_loops);
14443 +               local_irq_disable();
14444 +               work = *workp;
14445 +               *workp = 0;
14446 +               local_irq_enable();
14447 +               if (work)
14448 +                       rcu_process_callbacks();
14449 +               local_bh_enable();
14450 +               if (*workp == 0) {
14451 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14452 +                       *statusp = RCU_KTHREAD_WAITING;
14453 +                       return;
14454 +               }
14455 +       }
14456 +       *statusp = RCU_KTHREAD_YIELDING;
14457 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14458 +       schedule_timeout_interruptible(2);
14459 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14460 +       *statusp = RCU_KTHREAD_WAITING;
14461 +}
14462 +
14463 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14464 +       .store                  = &rcu_cpu_kthread_task,
14465 +       .thread_should_run      = rcu_cpu_kthread_should_run,
14466 +       .thread_fn              = rcu_cpu_kthread,
14467 +       .thread_comm            = "rcuc/%u",
14468 +       .setup                  = rcu_cpu_kthread_setup,
14469 +       .park                   = rcu_cpu_kthread_park,
14470 +};
14471 +
14472 +/*
14473 + * Spawn per-CPU RCU core processing kthreads.
14474 + */
14475 +static int __init rcu_spawn_core_kthreads(void)
14476 +{
14477 +       int cpu;
14478 +
14479 +       for_each_possible_cpu(cpu)
14480 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
14481 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14482 +       return 0;
14483 +}
14484 +early_initcall(rcu_spawn_core_kthreads);
14485 +
14486  /*
14487   * Handle any core-RCU processing required by a call_rcu() invocation.
14488   */
14489 @@ -3192,6 +3311,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
14490  }
14491  EXPORT_SYMBOL_GPL(call_rcu_sched);
14492  
14493 +#ifndef CONFIG_PREEMPT_RT_FULL
14494  /*
14495   * Queue an RCU callback for invocation after a quicker grace period.
14496   */
14497 @@ -3200,6 +3320,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
14498         __call_rcu(head, func, &rcu_bh_state, -1, 0);
14499  }
14500  EXPORT_SYMBOL_GPL(call_rcu_bh);
14501 +#endif
14502  
14503  /*
14504   * Queue an RCU callback for lazy invocation after a grace period.
14505 @@ -3291,6 +3412,7 @@ void synchronize_sched(void)
14506  }
14507  EXPORT_SYMBOL_GPL(synchronize_sched);
14508  
14509 +#ifndef CONFIG_PREEMPT_RT_FULL
14510  /**
14511   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
14512   *
14513 @@ -3317,6 +3439,7 @@ void synchronize_rcu_bh(void)
14514                 wait_rcu_gp(call_rcu_bh);
14515  }
14516  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
14517 +#endif
14518  
14519  /**
14520   * get_state_synchronize_rcu - Snapshot current RCU state
14521 @@ -3695,6 +3818,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
14522         mutex_unlock(&rsp->barrier_mutex);
14523  }
14524  
14525 +#ifndef CONFIG_PREEMPT_RT_FULL
14526  /**
14527   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
14528   */
14529 @@ -3703,6 +3827,7 @@ void rcu_barrier_bh(void)
14530         _rcu_barrier(&rcu_bh_state);
14531  }
14532  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
14533 +#endif
14534  
14535  /**
14536   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
14537 @@ -4220,12 +4345,13 @@ void __init rcu_init(void)
14538  
14539         rcu_bootup_announce();
14540         rcu_init_geometry();
14541 +#ifndef CONFIG_PREEMPT_RT_FULL
14542         rcu_init_one(&rcu_bh_state);
14543 +#endif
14544         rcu_init_one(&rcu_sched_state);
14545         if (dump_tree)
14546                 rcu_dump_rcu_node_tree(&rcu_sched_state);
14547         __rcu_init_preempt();
14548 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
14549  
14550         /*
14551          * We don't need protection against CPU-hotplug here because
14552 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
14553 index e99a5234d9ed..958ac107062c 100644
14554 --- a/kernel/rcu/tree.h
14555 +++ b/kernel/rcu/tree.h
14556 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
14557   */
14558  extern struct rcu_state rcu_sched_state;
14559  
14560 +#ifndef CONFIG_PREEMPT_RT_FULL
14561  extern struct rcu_state rcu_bh_state;
14562 +#endif
14563  
14564  #ifdef CONFIG_PREEMPT_RCU
14565  extern struct rcu_state rcu_preempt_state;
14566  #endif /* #ifdef CONFIG_PREEMPT_RCU */
14567  
14568 -#ifdef CONFIG_RCU_BOOST
14569  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14570  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
14571  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14572  DECLARE_PER_CPU(char, rcu_cpu_has_work);
14573 -#endif /* #ifdef CONFIG_RCU_BOOST */
14574  
14575  #ifndef RCU_TREE_NONCORE
14576  
14577 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
14578  static void __init __rcu_init_preempt(void);
14579  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
14580  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
14581 -static void invoke_rcu_callbacks_kthread(void);
14582  static bool rcu_is_callbacks_kthread(void);
14583 +static void rcu_cpu_kthread_setup(unsigned int cpu);
14584  #ifdef CONFIG_RCU_BOOST
14585 -static void rcu_preempt_do_callbacks(void);
14586  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
14587                                                  struct rcu_node *rnp);
14588  #endif /* #ifdef CONFIG_RCU_BOOST */
14589 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
14590 index 85c5a883c6e3..dbbda005c1f9 100644
14591 --- a/kernel/rcu/tree_plugin.h
14592 +++ b/kernel/rcu/tree_plugin.h
14593 @@ -24,25 +24,10 @@
14594   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
14595   */
14596  
14597 -#include <linux/delay.h>
14598 -#include <linux/gfp.h>
14599 -#include <linux/oom.h>
14600 -#include <linux/smpboot.h>
14601 -#include "../time/tick-internal.h"
14602 -
14603  #ifdef CONFIG_RCU_BOOST
14604  
14605  #include "../locking/rtmutex_common.h"
14606  
14607 -/*
14608 - * Control variables for per-CPU and per-rcu_node kthreads.  These
14609 - * handle all flavors of RCU.
14610 - */
14611 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14612 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14613 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14614 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
14615 -
14616  #else /* #ifdef CONFIG_RCU_BOOST */
14617  
14618  /*
14619 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
14620  
14621  #endif /* #else #ifdef CONFIG_RCU_BOOST */
14622  
14623 +/*
14624 + * Control variables for per-CPU and per-rcu_node kthreads.  These
14625 + * handle all flavors of RCU.
14626 + */
14627 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14628 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14629 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
14630 +
14631  #ifdef CONFIG_RCU_NOCB_CPU
14632  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
14633  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
14634 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
14635         }
14636  
14637         /* Hardware IRQ handlers cannot block, complain if they get here. */
14638 -       if (in_irq() || in_serving_softirq()) {
14639 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
14640                 lockdep_rcu_suspicious(__FILE__, __LINE__,
14641                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
14642                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
14643 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
14644                 t->rcu_read_unlock_special.b.need_qs = true;
14645  }
14646  
14647 -#ifdef CONFIG_RCU_BOOST
14648 -
14649 -static void rcu_preempt_do_callbacks(void)
14650 -{
14651 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
14652 -}
14653 -
14654 -#endif /* #ifdef CONFIG_RCU_BOOST */
14655 -
14656  /*
14657   * Queue a preemptible-RCU callback for invocation after a grace period.
14658   */
14659 @@ -829,6 +813,19 @@ void exit_rcu(void)
14660  
14661  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14662  
14663 +/*
14664 + * If boosting, set rcuc kthreads to realtime priority.
14665 + */
14666 +static void rcu_cpu_kthread_setup(unsigned int cpu)
14667 +{
14668 +#ifdef CONFIG_RCU_BOOST
14669 +       struct sched_param sp;
14670 +
14671 +       sp.sched_priority = kthread_prio;
14672 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14673 +#endif /* #ifdef CONFIG_RCU_BOOST */
14674 +}
14675 +
14676  #ifdef CONFIG_RCU_BOOST
14677  
14678  #include "../locking/rtmutex_common.h"
14679 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
14680  
14681  #endif /* #else #ifdef CONFIG_RCU_TRACE */
14682  
14683 -static void rcu_wake_cond(struct task_struct *t, int status)
14684 -{
14685 -       /*
14686 -        * If the thread is yielding, only wake it when this
14687 -        * is invoked from idle
14688 -        */
14689 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
14690 -               wake_up_process(t);
14691 -}
14692 -
14693  /*
14694   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
14695   * or ->boost_tasks, advancing the pointer to the next task in the
14696 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
14697  }
14698  
14699  /*
14700 - * Wake up the per-CPU kthread to invoke RCU callbacks.
14701 - */
14702 -static void invoke_rcu_callbacks_kthread(void)
14703 -{
14704 -       unsigned long flags;
14705 -
14706 -       local_irq_save(flags);
14707 -       __this_cpu_write(rcu_cpu_has_work, 1);
14708 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
14709 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
14710 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
14711 -                             __this_cpu_read(rcu_cpu_kthread_status));
14712 -       }
14713 -       local_irq_restore(flags);
14714 -}
14715 -
14716 -/*
14717   * Is the current CPU running the RCU-callbacks kthread?
14718   * Caller must have preemption disabled.
14719   */
14720 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
14721         return 0;
14722  }
14723  
14724 -static void rcu_kthread_do_work(void)
14725 -{
14726 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
14727 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
14728 -       rcu_preempt_do_callbacks();
14729 -}
14730 -
14731 -static void rcu_cpu_kthread_setup(unsigned int cpu)
14732 -{
14733 -       struct sched_param sp;
14734 -
14735 -       sp.sched_priority = kthread_prio;
14736 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14737 -}
14738 -
14739 -static void rcu_cpu_kthread_park(unsigned int cpu)
14740 -{
14741 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14742 -}
14743 -
14744 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
14745 -{
14746 -       return __this_cpu_read(rcu_cpu_has_work);
14747 -}
14748 -
14749 -/*
14750 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
14751 - * RCU softirq used in flavors and configurations of RCU that do not
14752 - * support RCU priority boosting.
14753 - */
14754 -static void rcu_cpu_kthread(unsigned int cpu)
14755 -{
14756 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14757 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14758 -       int spincnt;
14759 -
14760 -       for (spincnt = 0; spincnt < 10; spincnt++) {
14761 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14762 -               local_bh_disable();
14763 -               *statusp = RCU_KTHREAD_RUNNING;
14764 -               this_cpu_inc(rcu_cpu_kthread_loops);
14765 -               local_irq_disable();
14766 -               work = *workp;
14767 -               *workp = 0;
14768 -               local_irq_enable();
14769 -               if (work)
14770 -                       rcu_kthread_do_work();
14771 -               local_bh_enable();
14772 -               if (*workp == 0) {
14773 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14774 -                       *statusp = RCU_KTHREAD_WAITING;
14775 -                       return;
14776 -               }
14777 -       }
14778 -       *statusp = RCU_KTHREAD_YIELDING;
14779 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14780 -       schedule_timeout_interruptible(2);
14781 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14782 -       *statusp = RCU_KTHREAD_WAITING;
14783 -}
14784 -
14785  /*
14786   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
14787   * served by the rcu_node in question.  The CPU hotplug lock is still
14788 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
14789         free_cpumask_var(cm);
14790  }
14791  
14792 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14793 -       .store                  = &rcu_cpu_kthread_task,
14794 -       .thread_should_run      = rcu_cpu_kthread_should_run,
14795 -       .thread_fn              = rcu_cpu_kthread,
14796 -       .thread_comm            = "rcuc/%u",
14797 -       .setup                  = rcu_cpu_kthread_setup,
14798 -       .park                   = rcu_cpu_kthread_park,
14799 -};
14800 -
14801  /*
14802   * Spawn boost kthreads -- called as soon as the scheduler is running.
14803   */
14804  static void __init rcu_spawn_boost_kthreads(void)
14805  {
14806         struct rcu_node *rnp;
14807 -       int cpu;
14808 -
14809 -       for_each_possible_cpu(cpu)
14810 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
14811 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14812         rcu_for_each_leaf_node(rcu_state_p, rnp)
14813                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
14814  }
14815 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
14816         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
14817  }
14818  
14819 -static void invoke_rcu_callbacks_kthread(void)
14820 -{
14821 -       WARN_ON_ONCE(1);
14822 -}
14823 -
14824  static bool rcu_is_callbacks_kthread(void)
14825  {
14826         return false;
14827 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
14828  
14829  #endif /* #else #ifdef CONFIG_RCU_BOOST */
14830  
14831 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
14832 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
14833  
14834  /*
14835   * Check to see if any future RCU-related work will need to be done
14836 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
14837         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
14838                ? 0 : rcu_cpu_has_callbacks(NULL);
14839  }
14840 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
14841  
14842 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
14843  /*
14844   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
14845   * after it.
14846 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
14847         return cbs_ready;
14848  }
14849  
14850 +#ifndef CONFIG_PREEMPT_RT_FULL
14851 +
14852  /*
14853   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
14854   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
14855 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
14856         *nextevt = basemono + dj * TICK_NSEC;
14857         return 0;
14858  }
14859 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
14860  
14861  /*
14862   * Prepare a CPU for idle from an RCU perspective.  The first major task
14863 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
14864 index f19271dce0a9..6b5ab88b6103 100644
14865 --- a/kernel/rcu/update.c
14866 +++ b/kernel/rcu/update.c
14867 @@ -62,7 +62,7 @@
14868  #ifndef CONFIG_TINY_RCU
14869  module_param(rcu_expedited, int, 0);
14870  module_param(rcu_normal, int, 0);
14871 -static int rcu_normal_after_boot;
14872 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
14873  module_param(rcu_normal_after_boot, int, 0);
14874  #endif /* #ifndef CONFIG_TINY_RCU */
14875  
14876 @@ -129,8 +129,7 @@ bool rcu_gp_is_normal(void)
14877  }
14878  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
14879  
14880 -static atomic_t rcu_expedited_nesting =
14881 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
14882 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
14883  
14884  /*
14885   * Should normal grace-period primitives be expedited?  Intended for
14886 @@ -178,8 +177,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
14887   */
14888  void rcu_end_inkernel_boot(void)
14889  {
14890 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
14891 -               rcu_unexpedite_gp();
14892 +       rcu_unexpedite_gp();
14893         if (rcu_normal_after_boot)
14894                 WRITE_ONCE(rcu_normal, 1);
14895  }
14896 @@ -294,6 +292,7 @@ int rcu_read_lock_held(void)
14897  }
14898  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
14899  
14900 +#ifndef CONFIG_PREEMPT_RT_FULL
14901  /**
14902   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
14903   *
14904 @@ -320,6 +319,7 @@ int rcu_read_lock_bh_held(void)
14905         return in_softirq() || irqs_disabled();
14906  }
14907  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
14908 +#endif
14909  
14910  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14911  
14912 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
14913 index 5e59b832ae2b..7337a7f60e3f 100644
14914 --- a/kernel/sched/Makefile
14915 +++ b/kernel/sched/Makefile
14916 @@ -17,7 +17,7 @@ endif
14917  
14918  obj-y += core.o loadavg.o clock.o cputime.o
14919  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
14920 -obj-y += wait.o swait.o completion.o idle.o
14921 +obj-y += wait.o swait.o swork.o completion.o idle.o
14922  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
14923  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
14924  obj-$(CONFIG_SCHEDSTATS) += stats.o
14925 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
14926 index 8d0f35debf35..b62cf6400fe0 100644
14927 --- a/kernel/sched/completion.c
14928 +++ b/kernel/sched/completion.c
14929 @@ -30,10 +30,10 @@ void complete(struct completion *x)
14930  {
14931         unsigned long flags;
14932  
14933 -       spin_lock_irqsave(&x->wait.lock, flags);
14934 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
14935         x->done++;
14936 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
14937 -       spin_unlock_irqrestore(&x->wait.lock, flags);
14938 +       swake_up_locked(&x->wait);
14939 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14940  }
14941  EXPORT_SYMBOL(complete);
14942  
14943 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
14944  {
14945         unsigned long flags;
14946  
14947 -       spin_lock_irqsave(&x->wait.lock, flags);
14948 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
14949         x->done += UINT_MAX/2;
14950 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
14951 -       spin_unlock_irqrestore(&x->wait.lock, flags);
14952 +       swake_up_all_locked(&x->wait);
14953 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14954  }
14955  EXPORT_SYMBOL(complete_all);
14956  
14957 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
14958                    long (*action)(long), long timeout, int state)
14959  {
14960         if (!x->done) {
14961 -               DECLARE_WAITQUEUE(wait, current);
14962 +               DECLARE_SWAITQUEUE(wait);
14963  
14964 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
14965 +               __prepare_to_swait(&x->wait, &wait);
14966                 do {
14967                         if (signal_pending_state(state, current)) {
14968                                 timeout = -ERESTARTSYS;
14969                                 break;
14970                         }
14971                         __set_current_state(state);
14972 -                       spin_unlock_irq(&x->wait.lock);
14973 +                       raw_spin_unlock_irq(&x->wait.lock);
14974                         timeout = action(timeout);
14975 -                       spin_lock_irq(&x->wait.lock);
14976 +                       raw_spin_lock_irq(&x->wait.lock);
14977                 } while (!x->done && timeout);
14978 -               __remove_wait_queue(&x->wait, &wait);
14979 +               __finish_swait(&x->wait, &wait);
14980                 if (!x->done)
14981                         return timeout;
14982         }
14983 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
14984  {
14985         might_sleep();
14986  
14987 -       spin_lock_irq(&x->wait.lock);
14988 +       raw_spin_lock_irq(&x->wait.lock);
14989         timeout = do_wait_for_common(x, action, timeout, state);
14990 -       spin_unlock_irq(&x->wait.lock);
14991 +       raw_spin_unlock_irq(&x->wait.lock);
14992         return timeout;
14993  }
14994  
14995 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
14996         if (!READ_ONCE(x->done))
14997                 return 0;
14998  
14999 -       spin_lock_irqsave(&x->wait.lock, flags);
15000 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
15001         if (!x->done)
15002                 ret = 0;
15003         else
15004                 x->done--;
15005 -       spin_unlock_irqrestore(&x->wait.lock, flags);
15006 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
15007         return ret;
15008  }
15009  EXPORT_SYMBOL(try_wait_for_completion);
15010 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
15011          * after it's acquired the lock.
15012          */
15013         smp_rmb();
15014 -       spin_unlock_wait(&x->wait.lock);
15015 +       raw_spin_unlock_wait(&x->wait.lock);
15016         return true;
15017  }
15018  EXPORT_SYMBOL(completion_done);
15019 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
15020 index 154fd689fe02..a6aa5801b21e 100644
15021 --- a/kernel/sched/core.c
15022 +++ b/kernel/sched/core.c
15023 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
15024   * Number of tasks to iterate in a single balance run.
15025   * Limited because this is done with IRQs disabled.
15026   */
15027 +#ifndef CONFIG_PREEMPT_RT_FULL
15028  const_debug unsigned int sysctl_sched_nr_migrate = 32;
15029 +#else
15030 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
15031 +#endif
15032  
15033  /*
15034   * period over which we average the RT time consumption, measured
15035 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
15036  
15037         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15038         rq->hrtick_timer.function = hrtick;
15039 +       rq->hrtick_timer.irqsafe = 1;
15040  }
15041  #else  /* CONFIG_SCHED_HRTICK */
15042  static inline void hrtick_clear(struct rq *rq)
15043 @@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
15044         head->lastp = &node->next;
15045  }
15046  
15047 -void wake_up_q(struct wake_q_head *head)
15048 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
15049  {
15050         struct wake_q_node *node = head->first;
15051  
15052 @@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
15053                  * wake_up_process() implies a wmb() to pair with the queueing
15054                  * in wake_q_add() so as not to miss wakeups.
15055                  */
15056 -               wake_up_process(task);
15057 +               if (sleeper)
15058 +                       wake_up_lock_sleeper(task);
15059 +               else
15060 +                       wake_up_process(task);
15061                 put_task_struct(task);
15062         }
15063  }
15064 @@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
15065                 trace_sched_wake_idle_without_ipi(cpu);
15066  }
15067  
15068 +#ifdef CONFIG_PREEMPT_LAZY
15069 +void resched_curr_lazy(struct rq *rq)
15070 +{
15071 +       struct task_struct *curr = rq->curr;
15072 +       int cpu;
15073 +
15074 +       if (!sched_feat(PREEMPT_LAZY)) {
15075 +               resched_curr(rq);
15076 +               return;
15077 +       }
15078 +
15079 +       lockdep_assert_held(&rq->lock);
15080 +
15081 +       if (test_tsk_need_resched(curr))
15082 +               return;
15083 +
15084 +       if (test_tsk_need_resched_lazy(curr))
15085 +               return;
15086 +
15087 +       set_tsk_need_resched_lazy(curr);
15088 +
15089 +       cpu = cpu_of(rq);
15090 +       if (cpu == smp_processor_id())
15091 +               return;
15092 +
15093 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
15094 +       smp_mb();
15095 +       if (!tsk_is_polling(curr))
15096 +               smp_send_reschedule(cpu);
15097 +}
15098 +#endif
15099 +
15100  void resched_cpu(int cpu)
15101  {
15102         struct rq *rq = cpu_rq(cpu);
15103 @@ -525,11 +565,14 @@ void resched_cpu(int cpu)
15104   */
15105  int get_nohz_timer_target(void)
15106  {
15107 -       int i, cpu = smp_processor_id();
15108 +       int i, cpu;
15109         struct sched_domain *sd;
15110  
15111 +       preempt_disable_rt();
15112 +       cpu = smp_processor_id();
15113 +
15114         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
15115 -               return cpu;
15116 +               goto preempt_en_rt;
15117  
15118         rcu_read_lock();
15119         for_each_domain(cpu, sd) {
15120 @@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
15121                 cpu = housekeeping_any_cpu();
15122  unlock:
15123         rcu_read_unlock();
15124 +preempt_en_rt:
15125 +       preempt_enable_rt();
15126         return cpu;
15127  }
15128  /*
15129 @@ -1100,6 +1145,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
15130  
15131         lockdep_assert_held(&p->pi_lock);
15132  
15133 +       if (__migrate_disabled(p)) {
15134 +               cpumask_copy(&p->cpus_allowed, new_mask);
15135 +               return;
15136 +       }
15137 +
15138         queued = task_on_rq_queued(p);
15139         running = task_current(rq, p);
15140  
15141 @@ -1122,6 +1172,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
15142                 set_curr_task(rq, p);
15143  }
15144  
15145 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
15146 +static DEFINE_MUTEX(sched_down_mutex);
15147 +static cpumask_t sched_down_cpumask;
15148 +
15149 +void tell_sched_cpu_down_begin(int cpu)
15150 +{
15151 +       mutex_lock(&sched_down_mutex);
15152 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
15153 +       mutex_unlock(&sched_down_mutex);
15154 +}
15155 +
15156 +void tell_sched_cpu_down_done(int cpu)
15157 +{
15158 +       mutex_lock(&sched_down_mutex);
15159 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
15160 +       mutex_unlock(&sched_down_mutex);
15161 +}
15162 +
15163 +/**
15164 + * migrate_me - try to move the current task off this cpu
15165 + *
15166 + * Used by the pin_current_cpu() code to try to get tasks
15167 + * to move off the current CPU as it is going down.
15168 + * It will only move the task if the task isn't pinned to
15169 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
15170 + * and the task has to be in a RUNNING state. Otherwise the
15171 + * movement of the task will wake it up (change its state
15172 + * to running) when the task did not expect it.
15173 + *
15174 + * Returns 1 if it succeeded in moving the current task
15175 + *         0 otherwise.
15176 + */
15177 +int migrate_me(void)
15178 +{
15179 +       struct task_struct *p = current;
15180 +       struct migration_arg arg;
15181 +       struct cpumask *cpumask;
15182 +       struct cpumask *mask;
15183 +       unsigned int dest_cpu;
15184 +       struct rq_flags rf;
15185 +       struct rq *rq;
15186 +
15187 +       /*
15188 +        * We cannot migrate tasks bound to a CPU or tasks that are
15189 +        * not running. Moving such a task would wake it up.
15190 +        */
15191 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
15192 +               return 0;
15193 +
15194 +       mutex_lock(&sched_down_mutex);
15195 +       rq = task_rq_lock(p, &rf);
15196 +
15197 +       cpumask = this_cpu_ptr(&sched_cpumasks);
15198 +       mask = &p->cpus_allowed;
15199 +
15200 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
15201 +
15202 +       if (!cpumask_weight(cpumask)) {
15203 +               /* It's only on this CPU? */
15204 +               task_rq_unlock(rq, p, &rf);
15205 +               mutex_unlock(&sched_down_mutex);
15206 +               return 0;
15207 +       }
15208 +
15209 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
15210 +
15211 +       arg.task = p;
15212 +       arg.dest_cpu = dest_cpu;
15213 +
15214 +       task_rq_unlock(rq, p, &rf);
15215 +
15216 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
15217 +       tlb_migrate_finish(p->mm);
15218 +       mutex_unlock(&sched_down_mutex);
15219 +
15220 +       return 1;
15221 +}
15222 +
15223  /*
15224   * Change a given task's CPU affinity. Migrate the thread to a
15225   * proper CPU and schedule it away if the CPU it's executing on
15226 @@ -1179,7 +1307,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
15227         }
15228  
15229         /* Can the task run on the task's current CPU? If so, we're done */
15230 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
15231 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
15232                 goto out;
15233  
15234         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
15235 @@ -1366,6 +1494,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
15236         return ret;
15237  }
15238  
15239 +static bool check_task_state(struct task_struct *p, long match_state)
15240 +{
15241 +       bool match = false;
15242 +
15243 +       raw_spin_lock_irq(&p->pi_lock);
15244 +       if (p->state == match_state || p->saved_state == match_state)
15245 +               match = true;
15246 +       raw_spin_unlock_irq(&p->pi_lock);
15247 +
15248 +       return match;
15249 +}
15250 +
15251  /*
15252   * wait_task_inactive - wait for a thread to unschedule.
15253   *
15254 @@ -1410,7 +1550,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
15255                  * is actually now running somewhere else!
15256                  */
15257                 while (task_running(rq, p)) {
15258 -                       if (match_state && unlikely(p->state != match_state))
15259 +                       if (match_state && !check_task_state(p, match_state))
15260                                 return 0;
15261                         cpu_relax();
15262                 }
15263 @@ -1425,7 +1565,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
15264                 running = task_running(rq, p);
15265                 queued = task_on_rq_queued(p);
15266                 ncsw = 0;
15267 -               if (!match_state || p->state == match_state)
15268 +               if (!match_state || p->state == match_state ||
15269 +                   p->saved_state == match_state)
15270                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
15271                 task_rq_unlock(rq, p, &rf);
15272  
15273 @@ -1680,10 +1821,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
15274  {
15275         activate_task(rq, p, en_flags);
15276         p->on_rq = TASK_ON_RQ_QUEUED;
15277 -
15278 -       /* if a worker is waking up, notify workqueue */
15279 -       if (p->flags & PF_WQ_WORKER)
15280 -               wq_worker_waking_up(p, cpu_of(rq));
15281  }
15282  
15283  /*
15284 @@ -2018,8 +2155,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
15285          */
15286         smp_mb__before_spinlock();
15287         raw_spin_lock_irqsave(&p->pi_lock, flags);
15288 -       if (!(p->state & state))
15289 +       if (!(p->state & state)) {
15290 +               /*
15291 +                * The task might be running due to a spinlock sleeper
15292 +                * wakeup. Check the saved state and set it to running
15293 +                * if the wakeup condition is true.
15294 +                */
15295 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
15296 +                       if (p->saved_state & state) {
15297 +                               p->saved_state = TASK_RUNNING;
15298 +                               success = 1;
15299 +                       }
15300 +               }
15301                 goto out;
15302 +       }
15303 +
15304 +       /*
15305 +        * If this is a regular wakeup, then we can unconditionally
15306 +        * clear the saved state of a "lock sleeper".
15307 +        */
15308 +       if (!(wake_flags & WF_LOCK_SLEEPER))
15309 +               p->saved_state = TASK_RUNNING;
15310  
15311         trace_sched_waking(p);
15312  
15313 @@ -2102,53 +2258,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
15314  }
15315  
15316  /**
15317 - * try_to_wake_up_local - try to wake up a local task with rq lock held
15318 - * @p: the thread to be awakened
15319 - * @cookie: context's cookie for pinning
15320 - *
15321 - * Put @p on the run-queue if it's not already there. The caller must
15322 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
15323 - * the current task.
15324 - */
15325 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
15326 -{
15327 -       struct rq *rq = task_rq(p);
15328 -
15329 -       if (WARN_ON_ONCE(rq != this_rq()) ||
15330 -           WARN_ON_ONCE(p == current))
15331 -               return;
15332 -
15333 -       lockdep_assert_held(&rq->lock);
15334 -
15335 -       if (!raw_spin_trylock(&p->pi_lock)) {
15336 -               /*
15337 -                * This is OK, because current is on_cpu, which avoids it being
15338 -                * picked for load-balance and preemption/IRQs are still
15339 -                * disabled avoiding further scheduler activity on it and we've
15340 -                * not yet picked a replacement task.
15341 -                */
15342 -               lockdep_unpin_lock(&rq->lock, cookie);
15343 -               raw_spin_unlock(&rq->lock);
15344 -               raw_spin_lock(&p->pi_lock);
15345 -               raw_spin_lock(&rq->lock);
15346 -               lockdep_repin_lock(&rq->lock, cookie);
15347 -       }
15348 -
15349 -       if (!(p->state & TASK_NORMAL))
15350 -               goto out;
15351 -
15352 -       trace_sched_waking(p);
15353 -
15354 -       if (!task_on_rq_queued(p))
15355 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
15356 -
15357 -       ttwu_do_wakeup(rq, p, 0, cookie);
15358 -       ttwu_stat(p, smp_processor_id(), 0);
15359 -out:
15360 -       raw_spin_unlock(&p->pi_lock);
15361 -}
15362 -
15363 -/**
15364   * wake_up_process - Wake up a specific process
15365   * @p: The process to be woken up.
15366   *
15367 @@ -2166,6 +2275,18 @@ int wake_up_process(struct task_struct *p)
15368  }
15369  EXPORT_SYMBOL(wake_up_process);
15370  
15371 +/**
15372 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
15373 + * @p: The process to be woken up.
15374 + *
15375 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
15376 + * the nature of the wakeup.
15377 + */
15378 +int wake_up_lock_sleeper(struct task_struct *p)
15379 +{
15380 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
15381 +}
15382 +
15383  int wake_up_state(struct task_struct *p, unsigned int state)
15384  {
15385         return try_to_wake_up(p, state, 0);
15386 @@ -2442,6 +2563,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
15387         p->on_cpu = 0;
15388  #endif
15389         init_task_preempt_count(p);
15390 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15391 +       task_thread_info(p)->preempt_lazy_count = 0;
15392 +#endif
15393  #ifdef CONFIG_SMP
15394         plist_node_init(&p->pushable_tasks, MAX_PRIO);
15395         RB_CLEAR_NODE(&p->pushable_dl_tasks);
15396 @@ -2770,21 +2894,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
15397         finish_arch_post_lock_switch();
15398  
15399         fire_sched_in_preempt_notifiers(current);
15400 +       /*
15401 +        * We use mmdrop_delayed() here so we don't have to do the
15402 +        * full __mmdrop() when we are the last user.
15403 +        */
15404         if (mm)
15405 -               mmdrop(mm);
15406 +               mmdrop_delayed(mm);
15407         if (unlikely(prev_state == TASK_DEAD)) {
15408                 if (prev->sched_class->task_dead)
15409                         prev->sched_class->task_dead(prev);
15410  
15411 -               /*
15412 -                * Remove function-return probe instances associated with this
15413 -                * task and put them back on the free list.
15414 -                */
15415 -               kprobe_flush_task(prev);
15416 -
15417 -               /* Task is done with its stack. */
15418 -               put_task_stack(prev);
15419 -
15420                 put_task_struct(prev);
15421         }
15422  
15423 @@ -3252,6 +3371,77 @@ static inline void schedule_debug(struct task_struct *prev)
15424         schedstat_inc(this_rq()->sched_count);
15425  }
15426  
15427 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
15428 +
15429 +void migrate_disable(void)
15430 +{
15431 +       struct task_struct *p = current;
15432 +
15433 +       if (in_atomic() || irqs_disabled()) {
15434 +#ifdef CONFIG_SCHED_DEBUG
15435 +               p->migrate_disable_atomic++;
15436 +#endif
15437 +               return;
15438 +       }
15439 +
15440 +#ifdef CONFIG_SCHED_DEBUG
15441 +       if (unlikely(p->migrate_disable_atomic)) {
15442 +               tracing_off();
15443 +               WARN_ON_ONCE(1);
15444 +       }
15445 +#endif
15446 +
15447 +       if (p->migrate_disable) {
15448 +               p->migrate_disable++;
15449 +               return;
15450 +       }
15451 +
15452 +       preempt_disable();
15453 +       preempt_lazy_disable();
15454 +       pin_current_cpu();
15455 +       p->migrate_disable = 1;
15456 +       preempt_enable();
15457 +}
15458 +EXPORT_SYMBOL(migrate_disable);
15459 +
15460 +void migrate_enable(void)
15461 +{
15462 +       struct task_struct *p = current;
15463 +
15464 +       if (in_atomic() || irqs_disabled()) {
15465 +#ifdef CONFIG_SCHED_DEBUG
15466 +               p->migrate_disable_atomic--;
15467 +#endif
15468 +               return;
15469 +       }
15470 +
15471 +#ifdef CONFIG_SCHED_DEBUG
15472 +       if (unlikely(p->migrate_disable_atomic)) {
15473 +               tracing_off();
15474 +               WARN_ON_ONCE(1);
15475 +       }
15476 +#endif
15477 +       WARN_ON_ONCE(p->migrate_disable <= 0);
15478 +
15479 +       if (p->migrate_disable > 1) {
15480 +               p->migrate_disable--;
15481 +               return;
15482 +       }
15483 +
15484 +       preempt_disable();
15485 +       /*
15486 +        * Clearing migrate_disable causes tsk_cpus_allowed to
15487 +        * show the task's original CPU affinity.
15488 +        */
15489 +       p->migrate_disable = 0;
15490 +
15491 +       unpin_current_cpu();
15492 +       preempt_enable();
15493 +       preempt_lazy_enable();
15494 +}
15495 +EXPORT_SYMBOL(migrate_enable);
15496 +#endif
15497 +
15498  /*
15499   * Pick up the highest-prio task:
15500   */
15501 @@ -3368,19 +3558,6 @@ static void __sched notrace __schedule(bool preempt)
15502                 } else {
15503                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
15504                         prev->on_rq = 0;
15505 -
15506 -                       /*
15507 -                        * If a worker went to sleep, notify and ask workqueue
15508 -                        * whether it wants to wake up a task to maintain
15509 -                        * concurrency.
15510 -                        */
15511 -                       if (prev->flags & PF_WQ_WORKER) {
15512 -                               struct task_struct *to_wakeup;
15513 -
15514 -                               to_wakeup = wq_worker_sleeping(prev);
15515 -                               if (to_wakeup)
15516 -                                       try_to_wake_up_local(to_wakeup, cookie);
15517 -                       }
15518                 }
15519                 switch_count = &prev->nvcsw;
15520         }
15521 @@ -3390,6 +3567,7 @@ static void __sched notrace __schedule(bool preempt)
15522  
15523         next = pick_next_task(rq, prev, cookie);
15524         clear_tsk_need_resched(prev);
15525 +       clear_tsk_need_resched_lazy(prev);
15526         clear_preempt_need_resched();
15527         rq->clock_skip_update = 0;
15528  
15529 @@ -3437,9 +3615,20 @@ void __noreturn do_task_dead(void)
15530  
15531  static inline void sched_submit_work(struct task_struct *tsk)
15532  {
15533 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
15534 +       if (!tsk->state)
15535                 return;
15536         /*
15537 +        * If a worker went to sleep, notify and ask workqueue whether
15538 +        * it wants to wake up a task to maintain concurrency.
15539 +        */
15540 +       if (tsk->flags & PF_WQ_WORKER)
15541 +               wq_worker_sleeping(tsk);
15542 +
15543 +
15544 +       if (tsk_is_pi_blocked(tsk))
15545 +               return;
15546 +
15547 +       /*
15548          * If we are going to sleep and we have plugged IO queued,
15549          * make sure to submit it to avoid deadlocks.
15550          */
15551 @@ -3447,6 +3636,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
15552                 blk_schedule_flush_plug(tsk);
15553  }
15554  
15555 +static void sched_update_worker(struct task_struct *tsk)
15556 +{
15557 +       if (tsk->flags & PF_WQ_WORKER)
15558 +               wq_worker_running(tsk);
15559 +}
15560 +
15561  asmlinkage __visible void __sched schedule(void)
15562  {
15563         struct task_struct *tsk = current;
15564 @@ -3457,6 +3652,7 @@ asmlinkage __visible void __sched schedule(void)
15565                 __schedule(false);
15566                 sched_preempt_enable_no_resched();
15567         } while (need_resched());
15568 +       sched_update_worker(tsk);
15569  }
15570  EXPORT_SYMBOL(schedule);
15571  
15572 @@ -3520,6 +3716,30 @@ static void __sched notrace preempt_schedule_common(void)
15573         } while (need_resched());
15574  }
15575  
15576 +#ifdef CONFIG_PREEMPT_LAZY
15577 +/*
15578 + * If TIF_NEED_RESCHED is set, we allow being scheduled away, since that
15579 + * flag is set by an RT task. Otherwise we try to avoid being scheduled out
15580 + * as long as the preempt_lazy_count counter is > 0.
15581 + */
15582 +static __always_inline int preemptible_lazy(void)
15583 +{
15584 +       if (test_thread_flag(TIF_NEED_RESCHED))
15585 +               return 1;
15586 +       if (current_thread_info()->preempt_lazy_count)
15587 +               return 0;
15588 +       return 1;
15589 +}
15590 +
15591 +#else
15592 +
15593 +static inline int preemptible_lazy(void)
15594 +{
15595 +       return 1;
15596 +}
15597 +
15598 +#endif
15599 +
15600  #ifdef CONFIG_PREEMPT
15601  /*
15602   * this is the entry point to schedule() from in-kernel preemption
15603 @@ -3534,7 +3754,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
15604          */
15605         if (likely(!preemptible()))
15606                 return;
15607 -
15608 +       if (!preemptible_lazy())
15609 +               return;
15610         preempt_schedule_common();
15611  }
15612  NOKPROBE_SYMBOL(preempt_schedule);
15613 @@ -3561,6 +3782,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
15614         if (likely(!preemptible()))
15615                 return;
15616  
15617 +       if (!preemptible_lazy())
15618 +               return;
15619 +
15620         do {
15621                 /*
15622                  * Because the function tracer can trace preempt_count_sub()
15623 @@ -3583,7 +3807,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
15624                  * an infinite recursion.
15625                  */
15626                 prev_ctx = exception_enter();
15627 +               /*
15628 +                * The add/subtract must not be traced by the function
15629 +                * tracer. But we still want to account for the
15630 +                * preempt-off latency tracer. Since the _notrace versions
15631 +                * of add/subtract skip the accounting for the latency tracer,
15632 +                * we must force it manually.
15633 +                */
15634 +               start_critical_timings();
15635                 __schedule(true);
15636 +               stop_critical_timings();
15637                 exception_exit(prev_ctx);
15638  
15639                 preempt_latency_stop(1);
15640 @@ -4939,6 +5172,7 @@ int __cond_resched_lock(spinlock_t *lock)
15641  }
15642  EXPORT_SYMBOL(__cond_resched_lock);
15643  
15644 +#ifndef CONFIG_PREEMPT_RT_FULL
15645  int __sched __cond_resched_softirq(void)
15646  {
15647         BUG_ON(!in_softirq());
15648 @@ -4952,6 +5186,7 @@ int __sched __cond_resched_softirq(void)
15649         return 0;
15650  }
15651  EXPORT_SYMBOL(__cond_resched_softirq);
15652 +#endif
15653  
15654  /**
15655   * yield - yield the current processor to other threads.
15656 @@ -5315,7 +5550,9 @@ void init_idle(struct task_struct *idle, int cpu)
15657  
15658         /* Set the preempt count _outside_ the spinlocks! */
15659         init_idle_preempt_count(idle, cpu);
15660 -
15661 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15662 +       task_thread_info(idle)->preempt_lazy_count = 0;
15663 +#endif
15664         /*
15665          * The idle tasks have their own, simple scheduling class:
15666          */
15667 @@ -5458,6 +5695,8 @@ void sched_setnuma(struct task_struct *p, int nid)
15668  #endif /* CONFIG_NUMA_BALANCING */
15669  
15670  #ifdef CONFIG_HOTPLUG_CPU
15671 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
15672 +
15673  /*
15674   * Ensures that the idle task is using init_mm right before its cpu goes
15675   * offline.
15676 @@ -5472,7 +5711,12 @@ void idle_task_exit(void)
15677                 switch_mm_irqs_off(mm, &init_mm, current);
15678                 finish_arch_post_lock_switch();
15679         }
15680 -       mmdrop(mm);
15681 +       /*
15682 +        * Defer the cleanup to a CPU that is still online. On RT we can neither
15683 +        * call mmdrop() nor mmdrop_delayed() from here.
15684 +        */
15685 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
15686 +
15687  }
15688  
15689  /*
15690 @@ -7418,6 +7662,10 @@ int sched_cpu_dying(unsigned int cpu)
15691         update_max_interval();
15692         nohz_balance_exit_idle(cpu);
15693         hrtick_clear(rq);
15694 +       if (per_cpu(idle_last_mm, cpu)) {
15695 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
15696 +               per_cpu(idle_last_mm, cpu) = NULL;
15697 +       }
15698         return 0;
15699  }
15700  #endif
15701 @@ -7698,7 +7946,7 @@ void __init sched_init(void)
15702  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15703  static inline int preempt_count_equals(int preempt_offset)
15704  {
15705 -       int nested = preempt_count() + rcu_preempt_depth();
15706 +       int nested = preempt_count() + sched_rcu_preempt_depth();
15707  
15708         return (nested == preempt_offset);
15709  }
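For illustration (not part of the patch): the migrate_disable()/migrate_enable() pair added to kernel/sched/core.c above is reference counted per task. Only the outermost call pins the current CPU via pin_current_cpu(); nested calls merely adjust p->migrate_disable, and calls from atomic or irq-disabled context are skipped apart from the debug accounting. A minimal sketch of the nesting behaviour, assuming the usual RT declarations in <linux/preempt.h> and a hypothetical per-CPU counter:

#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);        /* hypothetical counter */

static void demo_pinned_section(void)
{
        migrate_disable();              /* count 0 -> 1: pins the current CPU */

        migrate_disable();              /* count 1 -> 2: only bumps the counter */
        this_cpu_inc(demo_hits);        /* always touches this CPU's instance */
        migrate_enable();               /* count 2 -> 1: still pinned */

        /* Sleeping locks may be taken here on RT; only migration is excluded. */

        migrate_enable();               /* count 1 -> 0: the CPU is unpinned */
}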
15710 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
15711 index 37e2449186c4..26dcaabde8b3 100644
15712 --- a/kernel/sched/deadline.c
15713 +++ b/kernel/sched/deadline.c
15714 @@ -687,6 +687,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
15715  
15716         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15717         timer->function = dl_task_timer;
15718 +       timer->irqsafe = 1;
15719  }
15720  
15721  static
15722 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
15723 index fa178b62ea79..935224123441 100644
15724 --- a/kernel/sched/debug.c
15725 +++ b/kernel/sched/debug.c
15726 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
15727         P(rt_throttled);
15728         PN(rt_time);
15729         PN(rt_runtime);
15730 +#ifdef CONFIG_SMP
15731 +       P(rt_nr_migratory);
15732 +#endif
15733  
15734  #undef PN
15735  #undef P
15736 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
15737  #endif
15738         P(policy);
15739         P(prio);
15740 +#ifdef CONFIG_PREEMPT_RT_FULL
15741 +       P(migrate_disable);
15742 +#endif
15743 +       P(nr_cpus_allowed);
15744  #undef PN_SCHEDSTAT
15745  #undef PN
15746  #undef __PN
15747 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
15748 index c242944f5cbd..4aeb2e2e41bc 100644
15749 --- a/kernel/sched/fair.c
15750 +++ b/kernel/sched/fair.c
15751 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
15752         ideal_runtime = sched_slice(cfs_rq, curr);
15753         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
15754         if (delta_exec > ideal_runtime) {
15755 -               resched_curr(rq_of(cfs_rq));
15756 +               resched_curr_lazy(rq_of(cfs_rq));
15757                 /*
15758                  * The current task ran long enough, ensure it doesn't get
15759                  * re-elected due to buddy favours.
15760 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
15761                 return;
15762  
15763         if (delta > ideal_runtime)
15764 -               resched_curr(rq_of(cfs_rq));
15765 +               resched_curr_lazy(rq_of(cfs_rq));
15766  }
15767  
15768  static void
15769 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
15770          * validating it and just reschedule.
15771          */
15772         if (queued) {
15773 -               resched_curr(rq_of(cfs_rq));
15774 +               resched_curr_lazy(rq_of(cfs_rq));
15775                 return;
15776         }
15777         /*
15778 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
15779          * hierarchy can be throttled
15780          */
15781         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
15782 -               resched_curr(rq_of(cfs_rq));
15783 +               resched_curr_lazy(rq_of(cfs_rq));
15784  }
15785  
15786  static __always_inline
15787 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
15788  
15789                 if (delta < 0) {
15790                         if (rq->curr == p)
15791 -                               resched_curr(rq);
15792 +                               resched_curr_lazy(rq);
15793                         return;
15794                 }
15795                 hrtick_start(rq, delta);
15796 @@ -5905,7 +5905,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
15797         return;
15798  
15799  preempt:
15800 -       resched_curr(rq);
15801 +       resched_curr_lazy(rq);
15802         /*
15803          * Only set the backward buddy when the current task is still
15804          * on the rq. This can happen when a wakeup gets interleaved
15805 @@ -8631,7 +8631,7 @@ static void task_fork_fair(struct task_struct *p)
15806                  * 'current' within the tree based on its new key value.
15807                  */
15808                 swap(curr->vruntime, se->vruntime);
15809 -               resched_curr(rq);
15810 +               resched_curr_lazy(rq);
15811         }
15812  
15813         se->vruntime -= cfs_rq->min_vruntime;
15814 @@ -8655,7 +8655,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
15815          */
15816         if (rq->curr == p) {
15817                 if (p->prio > oldprio)
15818 -                       resched_curr(rq);
15819 +                       resched_curr_lazy(rq);
15820         } else
15821                 check_preempt_curr(rq, p, 0);
15822  }
15823 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
15824 index 69631fa46c2f..6d28fcd08872 100644
15825 --- a/kernel/sched/features.h
15826 +++ b/kernel/sched/features.h
15827 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
15828   */
15829  SCHED_FEAT(NONTASK_CAPACITY, true)
15830  
15831 +#ifdef CONFIG_PREEMPT_RT_FULL
15832 +SCHED_FEAT(TTWU_QUEUE, false)
15833 +# ifdef CONFIG_PREEMPT_LAZY
15834 +SCHED_FEAT(PREEMPT_LAZY, true)
15835 +# endif
15836 +#else
15837 +
15838  /*
15839   * Queue remote wakeups on the target CPU and process them
15840   * using the scheduler IPI. Reduces rq->lock contention/bounces.
15841   */
15842  SCHED_FEAT(TTWU_QUEUE, true)
15843 +#endif
15844  
15845  #ifdef HAVE_RT_PUSH_IPI
15846  /*
15847 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
15848 index 2516b8df6dbb..2556baa0a97e 100644
15849 --- a/kernel/sched/rt.c
15850 +++ b/kernel/sched/rt.c
15851 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
15852  
15853         hrtimer_init(&rt_b->rt_period_timer,
15854                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15855 +       rt_b->rt_period_timer.irqsafe = 1;
15856         rt_b->rt_period_timer.function = sched_rt_period_timer;
15857  }
15858  
15859 @@ -101,6 +102,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
15860         rt_rq->push_cpu = nr_cpu_ids;
15861         raw_spin_lock_init(&rt_rq->push_lock);
15862         init_irq_work(&rt_rq->push_work, push_irq_work_func);
15863 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
15864  #endif
15865  #endif /* CONFIG_SMP */
15866         /* We start is dequeued state, because no RT tasks are queued */
15867 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
15868 index 055f935d4421..19324ac27026 100644
15869 --- a/kernel/sched/sched.h
15870 +++ b/kernel/sched/sched.h
15871 @@ -1163,6 +1163,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
15872  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
15873  #define WF_FORK                0x02            /* child wakeup after fork */
15874  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
15875 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
15876  
15877  /*
15878   * To aid in avoiding the subversion of "niceness" due to uneven distribution
15879 @@ -1346,6 +1347,15 @@ extern void init_sched_fair_class(void);
15880  extern void resched_curr(struct rq *rq);
15881  extern void resched_cpu(int cpu);
15882  
15883 +#ifdef CONFIG_PREEMPT_LAZY
15884 +extern void resched_curr_lazy(struct rq *rq);
15885 +#else
15886 +static inline void resched_curr_lazy(struct rq *rq)
15887 +{
15888 +       resched_curr(rq);
15889 +}
15890 +#endif
15891 +
15892  extern struct rt_bandwidth def_rt_bandwidth;
15893  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
15894  
15895 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
15896 index 82f0dff90030..ef027ff3250a 100644
15897 --- a/kernel/sched/swait.c
15898 +++ b/kernel/sched/swait.c
15899 @@ -1,5 +1,6 @@
15900  #include <linux/sched.h>
15901  #include <linux/swait.h>
15902 +#include <linux/suspend.h>
15903  
15904  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15905                              struct lock_class_key *key)
15906 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
15907  }
15908  EXPORT_SYMBOL(swake_up_locked);
15909  
15910 +void swake_up_all_locked(struct swait_queue_head *q)
15911 +{
15912 +       struct swait_queue *curr;
15913 +       int wakes = 0;
15914 +
15915 +       while (!list_empty(&q->task_list)) {
15916 +
15917 +               curr = list_first_entry(&q->task_list, typeof(*curr),
15918 +                                       task_list);
15919 +               wake_up_process(curr->task);
15920 +               list_del_init(&curr->task_list);
15921 +               wakes++;
15922 +       }
15923 +       if (pm_in_action)
15924 +               return;
15925 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
15926 +}
15927 +EXPORT_SYMBOL(swake_up_all_locked);
15928 +
15929  void swake_up(struct swait_queue_head *q)
15930  {
15931         unsigned long flags;
15932 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
15933         if (!swait_active(q))
15934                 return;
15935  
15936 +       WARN_ON(irqs_disabled());
15937         raw_spin_lock_irq(&q->lock);
15938         list_splice_init(&q->task_list, &tmp);
15939         while (!list_empty(&tmp)) {
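For context (not part of the patch): the swait changes above add swake_up_all_locked() for the RT complete_all() path and warn when swake_up_all() is called with interrupts disabled. A minimal sketch of the simple-waitqueue API these helpers build on; the demo_* names are hypothetical:

#include <linux/swait.h>
#include <linux/kthread.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);       /* hypothetical queue */
static bool demo_done;                          /* hypothetical condition */

static int demo_waiter(void *unused)
{
        /* Simple waitqueues use a raw spinlock, so this works on RT. */
        return swait_event_interruptible(demo_wq, demo_done);
}

static void demo_signal(void)
{
        demo_done = true;
        swake_up(&demo_wq);             /* wakes at most one waiter */
}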
15940 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
15941 new file mode 100644
15942 index 000000000000..1950f40ca725
15943 --- /dev/null
15944 +++ b/kernel/sched/swork.c
15945 @@ -0,0 +1,173 @@
15946 +/*
15947 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
15948 + *
15949 + * Provides a framework for enqueuing callbacks from irq context
15950 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
15951 + */
15952 +
15953 +#include <linux/swait.h>
15954 +#include <linux/swork.h>
15955 +#include <linux/kthread.h>
15956 +#include <linux/slab.h>
15957 +#include <linux/spinlock.h>
15958 +#include <linux/export.h>
15959 +
15960 +#define SWORK_EVENT_PENDING     (1 << 0)
15961 +
15962 +static DEFINE_MUTEX(worker_mutex);
15963 +static struct sworker *glob_worker;
15964 +
15965 +struct sworker {
15966 +       struct list_head events;
15967 +       struct swait_queue_head wq;
15968 +
15969 +       raw_spinlock_t lock;
15970 +
15971 +       struct task_struct *task;
15972 +       int refs;
15973 +};
15974 +
15975 +static bool swork_readable(struct sworker *worker)
15976 +{
15977 +       bool r;
15978 +
15979 +       if (kthread_should_stop())
15980 +               return true;
15981 +
15982 +       raw_spin_lock_irq(&worker->lock);
15983 +       r = !list_empty(&worker->events);
15984 +       raw_spin_unlock_irq(&worker->lock);
15985 +
15986 +       return r;
15987 +}
15988 +
15989 +static int swork_kthread(void *arg)
15990 +{
15991 +       struct sworker *worker = arg;
15992 +
15993 +       for (;;) {
15994 +               swait_event_interruptible(worker->wq,
15995 +                                       swork_readable(worker));
15996 +               if (kthread_should_stop())
15997 +                       break;
15998 +
15999 +               raw_spin_lock_irq(&worker->lock);
16000 +               while (!list_empty(&worker->events)) {
16001 +                       struct swork_event *sev;
16002 +
16003 +                       sev = list_first_entry(&worker->events,
16004 +                                       struct swork_event, item);
16005 +                       list_del(&sev->item);
16006 +                       raw_spin_unlock_irq(&worker->lock);
16007 +
16008 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
16009 +                                                        &sev->flags));
16010 +                       sev->func(sev);
16011 +                       raw_spin_lock_irq(&worker->lock);
16012 +               }
16013 +               raw_spin_unlock_irq(&worker->lock);
16014 +       }
16015 +       return 0;
16016 +}
16017 +
16018 +static struct sworker *swork_create(void)
16019 +{
16020 +       struct sworker *worker;
16021 +
16022 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
16023 +       if (!worker)
16024 +               return ERR_PTR(-ENOMEM);
16025 +
16026 +       INIT_LIST_HEAD(&worker->events);
16027 +       raw_spin_lock_init(&worker->lock);
16028 +       init_swait_queue_head(&worker->wq);
16029 +
16030 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
16031 +       if (IS_ERR(worker->task)) {
16032 +               kfree(worker);
16033 +               return ERR_PTR(-ENOMEM);
16034 +       }
16035 +
16036 +       return worker;
16037 +}
16038 +
16039 +static void swork_destroy(struct sworker *worker)
16040 +{
16041 +       kthread_stop(worker->task);
16042 +
16043 +       WARN_ON(!list_empty(&worker->events));
16044 +       kfree(worker);
16045 +}
16046 +
16047 +/**
16048 + * swork_queue - queue swork
16049 + *
16050 + * Returns %false if @sev was already queued, %true otherwise.
16051 + *
16052 + * The work is queued and processed by the kswork kthread on an arbitrary CPU.
16053 + */
16054 +bool swork_queue(struct swork_event *sev)
16055 +{
16056 +       unsigned long flags;
16057 +
16058 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
16059 +               return false;
16060 +
16061 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
16062 +       list_add_tail(&sev->item, &glob_worker->events);
16063 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
16064 +
16065 +       swake_up(&glob_worker->wq);
16066 +       return true;
16067 +}
16068 +EXPORT_SYMBOL_GPL(swork_queue);
16069 +
16070 +/**
16071 + * swork_get - get an instance of the sworker
16072 + *
16073 + * Returns a negative error code if the initialization of the worker
16074 + * failed, %0 otherwise.
16075 + *
16076 + */
16077 +int swork_get(void)
16078 +{
16079 +       struct sworker *worker;
16080 +
16081 +       mutex_lock(&worker_mutex);
16082 +       if (!glob_worker) {
16083 +               worker = swork_create();
16084 +               if (IS_ERR(worker)) {
16085 +                       mutex_unlock(&worker_mutex);
16086 +                       return -ENOMEM;
16087 +               }
16088 +
16089 +               glob_worker = worker;
16090 +       }
16091 +
16092 +       glob_worker->refs++;
16093 +       mutex_unlock(&worker_mutex);
16094 +
16095 +       return 0;
16096 +}
16097 +EXPORT_SYMBOL_GPL(swork_get);
16098 +
16099 +/**
16100 + * swork_put - puts an instance of the sworker
16101 + *
16102 + * Will destroy the sworker thread. This function must not be called until all
16103 + * queued events have been completed.
16104 + */
16105 +void swork_put(void)
16106 +{
16107 +       mutex_lock(&worker_mutex);
16108 +
16109 +       glob_worker->refs--;
16110 +       if (glob_worker->refs > 0)
16111 +               goto out;
16112 +
16113 +       swork_destroy(glob_worker);
16114 +       glob_worker = NULL;
16115 +out:
16116 +       mutex_unlock(&worker_mutex);
16117 +}
16118 +EXPORT_SYMBOL_GPL(swork_put);
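A sketch of how the swork API added above is meant to be used (not part of the patch). INIT_SWORK() is assumed to come from <linux/swork.h> in the RT tree; if it differs, the fields swork.c relies on (item, flags, func) can be set up directly. All demo_* names are hypothetical:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/swork.h>

static struct swork_event demo_event;

static void demo_swork_fn(struct swork_event *sev)
{
        /* Runs in the kswork kthread: preemptible, may sleep on RT. */
        pr_info("deferred work executed\n");
}

/* Registered with request_irq() elsewhere; queueing is hard-irq safe. */
static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
        swork_queue(&demo_event);
        return IRQ_HANDLED;
}

static int __init demo_init(void)
{
        int ret = swork_get();          /* create/reference the kswork thread */

        if (ret)
                return ret;
        INIT_SWORK(&demo_event, demo_swork_fn);         /* assumed initializer */
        return 0;
}

static void __exit demo_exit(void)
{
        swork_put();                    /* drop the reference, may stop kswork */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");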
16119 diff --git a/kernel/signal.c b/kernel/signal.c
16120 index 75761acc77cf..ae0773c76bb0 100644
16121 --- a/kernel/signal.c
16122 +++ b/kernel/signal.c
16123 @@ -14,6 +14,7 @@
16124  #include <linux/export.h>
16125  #include <linux/init.h>
16126  #include <linux/sched.h>
16127 +#include <linux/sched/rt.h>
16128  #include <linux/fs.h>
16129  #include <linux/tty.h>
16130  #include <linux/binfmts.h>
16131 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
16132         return false;
16133  }
16134  
16135 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
16136 +{
16137 +       struct sigqueue *q = t->sigqueue_cache;
16138 +
16139 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
16140 +               return NULL;
16141 +       return q;
16142 +}
16143 +
16144 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
16145 +{
16146 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
16147 +               return 0;
16148 +       return 1;
16149 +}
16150 +
16151  /*
16152   * allocate a new signal queue record
16153   * - this may be called without locks if and only if t == current, otherwise an
16154   *   appropriate lock must be held to stop the target task from exiting
16155   */
16156  static struct sigqueue *
16157 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
16158 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
16159 +                   int override_rlimit, int fromslab)
16160  {
16161         struct sigqueue *q = NULL;
16162         struct user_struct *user;
16163 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
16164         if (override_rlimit ||
16165             atomic_read(&user->sigpending) <=
16166                         task_rlimit(t, RLIMIT_SIGPENDING)) {
16167 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
16168 +               if (!fromslab)
16169 +                       q = get_task_cache(t);
16170 +               if (!q)
16171 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
16172         } else {
16173                 print_dropped_signal(sig);
16174         }
16175 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
16176         return q;
16177  }
16178  
16179 +static struct sigqueue *
16180 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
16181 +                int override_rlimit)
16182 +{
16183 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
16184 +}
16185 +
16186  static void __sigqueue_free(struct sigqueue *q)
16187  {
16188         if (q->flags & SIGQUEUE_PREALLOC)
16189 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
16190         kmem_cache_free(sigqueue_cachep, q);
16191  }
16192  
16193 +static void sigqueue_free_current(struct sigqueue *q)
16194 +{
16195 +       struct user_struct *up;
16196 +
16197 +       if (q->flags & SIGQUEUE_PREALLOC)
16198 +               return;
16199 +
16200 +       up = q->user;
16201 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
16202 +               atomic_dec(&up->sigpending);
16203 +               free_uid(up);
16204 +       } else
16205 +               __sigqueue_free(q);
16206 +}
16207 +
16208  void flush_sigqueue(struct sigpending *queue)
16209  {
16210         struct sigqueue *q;
16211 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
16212  }
16213  
16214  /*
16215 + * Called from __exit_signal. Flush tsk->pending and
16216 + * tsk->sigqueue_cache
16217 + */
16218 +void flush_task_sigqueue(struct task_struct *tsk)
16219 +{
16220 +       struct sigqueue *q;
16221 +
16222 +       flush_sigqueue(&tsk->pending);
16223 +
16224 +       q = get_task_cache(tsk);
16225 +       if (q)
16226 +               kmem_cache_free(sigqueue_cachep, q);
16227 +}
16228 +
16229 +/*
16230   * Flush all pending signals for this kthread.
16231   */
16232  void flush_signals(struct task_struct *t)
16233 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
16234  still_pending:
16235                 list_del_init(&first->list);
16236                 copy_siginfo(info, &first->info);
16237 -               __sigqueue_free(first);
16238 +               sigqueue_free_current(first);
16239         } else {
16240                 /*
16241                  * Ok, it wasn't in the queue.  This must be
16242 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
16243  {
16244         int signr;
16245  
16246 +       WARN_ON_ONCE(tsk != current);
16247 +
16248         /* We only dequeue private signals from ourselves, we don't let
16249          * signalfd steal them
16250          */
16251 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
16252   * We don't want to have recursive SIGSEGV's etc, for example,
16253   * that is why we also clear SIGNAL_UNKILLABLE.
16254   */
16255 -int
16256 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16257 +static int
16258 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16259  {
16260         unsigned long int flags;
16261         int ret, blocked, ignored;
16262 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16263         return ret;
16264  }
16265  
16266 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16267 +{
16268 +/*
16269 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
16270 + * since it cannot enable preemption there, and the signal code's spin_locks
16271 + * turn into mutexes. Instead, it sets TIF_NOTIFY_RESUME so that the signal
16272 + * is sent on exit from the trap.
16273 + */
16274 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
16275 +       if (in_atomic()) {
16276 +               if (WARN_ON_ONCE(t != current))
16277 +                       return 0;
16278 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
16279 +                       return 0;
16280 +
16281 +               if (is_si_special(info)) {
16282 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
16283 +                       t->forced_info.si_signo = sig;
16284 +                       t->forced_info.si_errno = 0;
16285 +                       t->forced_info.si_code = SI_KERNEL;
16286 +                       t->forced_info.si_pid = 0;
16287 +                       t->forced_info.si_uid = 0;
16288 +               } else {
16289 +                       t->forced_info = *info;
16290 +               }
16291 +
16292 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
16293 +               return 0;
16294 +       }
16295 +#endif
16296 +       return do_force_sig_info(sig, info, t);
16297 +}
16298 +
16299  /*
16300   * Nuke all other threads in the group.
16301   */
16302 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
16303                  * Disable interrupts early to avoid deadlocks.
16304                  * See rcu_read_unlock() comment header for details.
16305                  */
16306 -               local_irq_save(*flags);
16307 +               local_irq_save_nort(*flags);
16308                 rcu_read_lock();
16309                 sighand = rcu_dereference(tsk->sighand);
16310                 if (unlikely(sighand == NULL)) {
16311                         rcu_read_unlock();
16312 -                       local_irq_restore(*flags);
16313 +                       local_irq_restore_nort(*flags);
16314                         break;
16315                 }
16316                 /*
16317 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
16318                 }
16319                 spin_unlock(&sighand->siglock);
16320                 rcu_read_unlock();
16321 -               local_irq_restore(*flags);
16322 +               local_irq_restore_nort(*flags);
16323         }
16324  
16325         return sighand;
16326 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
16327   */
16328  struct sigqueue *sigqueue_alloc(void)
16329  {
16330 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
16331 +       /* Preallocated sigqueue objects always come from the slab cache */
16332 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
16333  
16334         if (q)
16335                 q->flags |= SIGQUEUE_PREALLOC;
16336 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
16337                 if (gstop_done && ptrace_reparented(current))
16338                         do_notify_parent_cldstop(current, false, why);
16339  
16340 -               /*
16341 -                * Don't want to allow preemption here, because
16342 -                * sys_ptrace() needs this task to be inactive.
16343 -                *
16344 -                * XXX: implement read_unlock_no_resched().
16345 -                */
16346 -               preempt_disable();
16347                 read_unlock(&tasklist_lock);
16348 -               preempt_enable_no_resched();
16349                 freezable_schedule();
16350         } else {
16351                 /*
16352 diff --git a/kernel/softirq.c b/kernel/softirq.c
16353 index 744fa611cae0..1431d08e6f21 100644
16354 --- a/kernel/softirq.c
16355 +++ b/kernel/softirq.c
16356 @@ -21,10 +21,12 @@
16357  #include <linux/freezer.h>
16358  #include <linux/kthread.h>
16359  #include <linux/rcupdate.h>
16360 +#include <linux/delay.h>
16361  #include <linux/ftrace.h>
16362  #include <linux/smp.h>
16363  #include <linux/smpboot.h>
16364  #include <linux/tick.h>
16365 +#include <linux/locallock.h>
16366  #include <linux/irq.h>
16367  
16368  #define CREATE_TRACE_POINTS
16369 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
16370  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
16371  
16372  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
16373 +#ifdef CONFIG_PREEMPT_RT_FULL
16374 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
16375 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
16376 +#endif
16377  
16378  const char * const softirq_to_name[NR_SOFTIRQS] = {
16379         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
16380         "TASKLET", "SCHED", "HRTIMER", "RCU"
16381  };
16382  
16383 +#ifdef CONFIG_NO_HZ_COMMON
16384 +# ifdef CONFIG_PREEMPT_RT_FULL
16385 +
16386 +struct softirq_runner {
16387 +       struct task_struct *runner[NR_SOFTIRQS];
16388 +};
16389 +
16390 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
16391 +
16392 +static inline void softirq_set_runner(unsigned int sirq)
16393 +{
16394 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16395 +
16396 +       sr->runner[sirq] = current;
16397 +}
16398 +
16399 +static inline void softirq_clr_runner(unsigned int sirq)
16400 +{
16401 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16402 +
16403 +       sr->runner[sirq] = NULL;
16404 +}
16405 +
16406 +/*
16407 + * On preempt-rt a softirq running context might be blocked on a
16408 + * lock. There might be no other runnable task on this CPU because the
16409 + * lock owner runs on some other CPU. So we have to go into idle with
16410 + * the pending bit set. Therefore we need to check for this, otherwise we
16411 + * would warn about false positives, which confuses users and defeats the
16412 + * whole purpose of this test.
16413 + *
16414 + * This code is called with interrupts disabled.
16415 + */
16416 +void softirq_check_pending_idle(void)
16417 +{
16418 +       static int rate_limit;
16419 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16420 +       u32 warnpending;
16421 +       int i;
16422 +
16423 +       if (rate_limit >= 10)
16424 +               return;
16425 +
16426 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
16427 +       for (i = 0; i < NR_SOFTIRQS; i++) {
16428 +               struct task_struct *tsk = sr->runner[i];
16429 +
16430 +               /*
16431 +                * The wakeup code in rtmutex.c wakes up the task
16432 +                * _before_ it sets pi_blocked_on to NULL under
16433 +                * tsk->pi_lock. So we need to check for both: state
16434 +                * and pi_blocked_on.
16435 +                */
16436 +               if (tsk) {
16437 +                       raw_spin_lock(&tsk->pi_lock);
16438 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
16439 +                               /* Clear all bits pending in that task */
16440 +                               warnpending &= ~(tsk->softirqs_raised);
16441 +                               warnpending &= ~(1 << i);
16442 +                       }
16443 +                       raw_spin_unlock(&tsk->pi_lock);
16444 +               }
16445 +       }
16446 +
16447 +       if (warnpending) {
16448 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16449 +                      warnpending);
16450 +               rate_limit++;
16451 +       }
16452 +}
16453 +# else
16454 +/*
16455 + * On !PREEMPT_RT we just printk rate limited:
16456 + */
16457 +void softirq_check_pending_idle(void)
16458 +{
16459 +       static int rate_limit;
16460 +
16461 +       if (rate_limit < 10 &&
16462 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
16463 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16464 +                      local_softirq_pending());
16465 +               rate_limit++;
16466 +       }
16467 +}
16468 +# endif
16469 +
16470 +#else /* !CONFIG_NO_HZ_COMMON */
16471 +static inline void softirq_set_runner(unsigned int sirq) { }
16472 +static inline void softirq_clr_runner(unsigned int sirq) { }
16473 +#endif
16474 +
16475  /*
16476   * we cannot loop indefinitely here to avoid userspace starvation,
16477   * but we also don't want to introduce a worst case 1/HZ latency
16478 @@ -77,6 +175,37 @@ static void wakeup_softirqd(void)
16479                 wake_up_process(tsk);
16480  }
16481  
16482 +#ifdef CONFIG_PREEMPT_RT_FULL
16483 +static void wakeup_timer_softirqd(void)
16484 +{
16485 +       /* Interrupts are disabled: no need to stop preemption */
16486 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
16487 +
16488 +       if (tsk && tsk->state != TASK_RUNNING)
16489 +               wake_up_process(tsk);
16490 +}
16491 +#endif
16492 +
16493 +static void handle_softirq(unsigned int vec_nr)
16494 +{
16495 +       struct softirq_action *h = softirq_vec + vec_nr;
16496 +       int prev_count;
16497 +
16498 +       prev_count = preempt_count();
16499 +
16500 +       kstat_incr_softirqs_this_cpu(vec_nr);
16501 +
16502 +       trace_softirq_entry(vec_nr);
16503 +       h->action(h);
16504 +       trace_softirq_exit(vec_nr);
16505 +       if (unlikely(prev_count != preempt_count())) {
16506 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16507 +                      vec_nr, softirq_to_name[vec_nr], h->action,
16508 +                      prev_count, preempt_count());
16509 +               preempt_count_set(prev_count);
16510 +       }
16511 +}
16512 +
16513  /*
16514   * If ksoftirqd is scheduled, we do not want to process pending softirqs
16515   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
16516 @@ -88,6 +217,48 @@ static bool ksoftirqd_running(void)
16517         return tsk && (tsk->state == TASK_RUNNING);
16518  }
16519  
16520 +#ifndef CONFIG_PREEMPT_RT_FULL
16521 +static inline int ksoftirqd_softirq_pending(void)
16522 +{
16523 +       return local_softirq_pending();
16524 +}
16525 +
16526 +static void handle_pending_softirqs(u32 pending)
16527 +{
16528 +       struct softirq_action *h = softirq_vec;
16529 +       int softirq_bit;
16530 +
16531 +       local_irq_enable();
16532 +
16533 +       h = softirq_vec;
16534 +
16535 +       while ((softirq_bit = ffs(pending))) {
16536 +               unsigned int vec_nr;
16537 +
16538 +               h += softirq_bit - 1;
16539 +               vec_nr = h - softirq_vec;
16540 +               handle_softirq(vec_nr);
16541 +
16542 +               h++;
16543 +               pending >>= softirq_bit;
16544 +       }
16545 +
16546 +       rcu_bh_qs();
16547 +       local_irq_disable();
16548 +}
16549 +
16550 +static void run_ksoftirqd(unsigned int cpu)
16551 +{
16552 +       local_irq_disable();
16553 +       if (ksoftirqd_softirq_pending()) {
16554 +               __do_softirq();
16555 +               local_irq_enable();
16556 +               cond_resched_rcu_qs();
16557 +               return;
16558 +       }
16559 +       local_irq_enable();
16560 +}
16561 +
16562  /*
16563   * preempt_count and SOFTIRQ_OFFSET usage:
16564   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
16565 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
16566         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
16567         unsigned long old_flags = current->flags;
16568         int max_restart = MAX_SOFTIRQ_RESTART;
16569 -       struct softirq_action *h;
16570         bool in_hardirq;
16571         __u32 pending;
16572 -       int softirq_bit;
16573  
16574         /*
16575          * Mask out PF_MEMALLOC s current task context is borrowed for the
16576 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
16577         /* Reset the pending bitmask before enabling irqs */
16578         set_softirq_pending(0);
16579  
16580 -       local_irq_enable();
16581 -
16582 -       h = softirq_vec;
16583 -
16584 -       while ((softirq_bit = ffs(pending))) {
16585 -               unsigned int vec_nr;
16586 -               int prev_count;
16587 -
16588 -               h += softirq_bit - 1;
16589 -
16590 -               vec_nr = h - softirq_vec;
16591 -               prev_count = preempt_count();
16592 -
16593 -               kstat_incr_softirqs_this_cpu(vec_nr);
16594 -
16595 -               trace_softirq_entry(vec_nr);
16596 -               h->action(h);
16597 -               trace_softirq_exit(vec_nr);
16598 -               if (unlikely(prev_count != preempt_count())) {
16599 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16600 -                              vec_nr, softirq_to_name[vec_nr], h->action,
16601 -                              prev_count, preempt_count());
16602 -                       preempt_count_set(prev_count);
16603 -               }
16604 -               h++;
16605 -               pending >>= softirq_bit;
16606 -       }
16607 -
16608 -       rcu_bh_qs();
16609 -       local_irq_disable();
16610 +       handle_pending_softirqs(pending);
16611  
16612         pending = local_softirq_pending();
16613         if (pending) {
16614 @@ -331,6 +471,309 @@ asmlinkage __visible void do_softirq(void)
16615  }
16616  
16617  /*
16618 + * This function must run with irqs disabled!
16619 + */
16620 +void raise_softirq_irqoff(unsigned int nr)
16621 +{
16622 +       __raise_softirq_irqoff(nr);
16623 +
16624 +       /*
16625 +        * If we're in an interrupt or softirq, we're done
16626 +        * (this also catches softirq-disabled code). We will
16627 +        * actually run the softirq once we return from
16628 +        * the irq or softirq.
16629 +        *
16630 +        * Otherwise we wake up ksoftirqd to make sure we
16631 +        * schedule the softirq soon.
16632 +        */
16633 +       if (!in_interrupt())
16634 +               wakeup_softirqd();
16635 +}
16636 +
16637 +void __raise_softirq_irqoff(unsigned int nr)
16638 +{
16639 +       trace_softirq_raise(nr);
16640 +       or_softirq_pending(1UL << nr);
16641 +}
16642 +
16643 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
16644 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
16645 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
16646 +
16647 +#else /* !PREEMPT_RT_FULL */
16648 +
16649 +/*
16650 + * On RT we serialize softirq execution with a cpu local lock per softirq
16651 + */
16652 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
16653 +
16654 +void __init softirq_early_init(void)
16655 +{
16656 +       int i;
16657 +
16658 +       for (i = 0; i < NR_SOFTIRQS; i++)
16659 +               local_irq_lock_init(local_softirq_locks[i]);
16660 +}
16661 +
16662 +static void lock_softirq(int which)
16663 +{
16664 +       local_lock(local_softirq_locks[which]);
16665 +}
16666 +
16667 +static void unlock_softirq(int which)
16668 +{
16669 +       local_unlock(local_softirq_locks[which]);
16670 +}
16671 +
16672 +static void do_single_softirq(int which)
16673 +{
16674 +       unsigned long old_flags = current->flags;
16675 +
16676 +       current->flags &= ~PF_MEMALLOC;
16677 +       vtime_account_irq_enter(current);
16678 +       current->flags |= PF_IN_SOFTIRQ;
16679 +       lockdep_softirq_enter();
16680 +       local_irq_enable();
16681 +       handle_softirq(which);
16682 +       local_irq_disable();
16683 +       lockdep_softirq_exit();
16684 +       current->flags &= ~PF_IN_SOFTIRQ;
16685 +       vtime_account_irq_enter(current);
16686 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
16687 +}
16688 +
16689 +/*
16690 + * Called with interrupts disabled. Process softirqs which were raised
16691 + * in current context (or on behalf of ksoftirqd).
16692 + */
16693 +static void do_current_softirqs(void)
16694 +{
16695 +       while (current->softirqs_raised) {
16696 +               int i = __ffs(current->softirqs_raised);
16697 +               unsigned int pending, mask = (1U << i);
16698 +
16699 +               current->softirqs_raised &= ~mask;
16700 +               local_irq_enable();
16701 +
16702 +               /*
16703 +                * If the lock is contended, we boost the owner so that it
16704 +                * either processes the softirq or leaves the critical
16705 +                * section right away.
16706 +                */
16707 +               lock_softirq(i);
16708 +               local_irq_disable();
16709 +               softirq_set_runner(i);
16710 +               /*
16711 +                * Check against the local_softirq_pending() bits whether
16712 +                * we still need to process this, or whether someone else
16713 +                * has already taken care of it.
16714 +                */
16715 +               pending = local_softirq_pending();
16716 +               if (pending & mask) {
16717 +                       set_softirq_pending(pending & ~mask);
16718 +                       do_single_softirq(i);
16719 +               }
16720 +               softirq_clr_runner(i);
16721 +               WARN_ON(current->softirq_nestcnt != 1);
16722 +               local_irq_enable();
16723 +               unlock_softirq(i);
16724 +               local_irq_disable();
16725 +       }
16726 +}
16727 +
16728 +void __local_bh_disable(void)
16729 +{
16730 +       if (++current->softirq_nestcnt == 1)
16731 +               migrate_disable();
16732 +}
16733 +EXPORT_SYMBOL(__local_bh_disable);
16734 +
16735 +void __local_bh_enable(void)
16736 +{
16737 +       if (WARN_ON(current->softirq_nestcnt == 0))
16738 +               return;
16739 +
16740 +       local_irq_disable();
16741 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
16742 +               do_current_softirqs();
16743 +       local_irq_enable();
16744 +
16745 +       if (--current->softirq_nestcnt == 0)
16746 +               migrate_enable();
16747 +}
16748 +EXPORT_SYMBOL(__local_bh_enable);
16749 +
16750 +void _local_bh_enable(void)
16751 +{
16752 +       if (WARN_ON(current->softirq_nestcnt == 0))
16753 +               return;
16754 +       if (--current->softirq_nestcnt == 0)
16755 +               migrate_enable();
16756 +}
16757 +EXPORT_SYMBOL(_local_bh_enable);
16758 +
16759 +int in_serving_softirq(void)
16760 +{
16761 +       return current->flags & PF_IN_SOFTIRQ;
16762 +}
16763 +EXPORT_SYMBOL(in_serving_softirq);
16764 +
16765 +/* Called with preemption disabled */
16766 +static void run_ksoftirqd(unsigned int cpu)
16767 +{
16768 +       local_irq_disable();
16769 +       current->softirq_nestcnt++;
16770 +
16771 +       do_current_softirqs();
16772 +       current->softirq_nestcnt--;
16773 +       local_irq_enable();
16774 +       cond_resched_rcu_qs();
16775 +}
16776 +
16777 +/*
16778 + * Called from netif_rx_ni(). Preemption enabled, but migration
16779 + * disabled. So the cpu can't go away under us.
16780 + */
16781 +void thread_do_softirq(void)
16782 +{
16783 +       if (!in_serving_softirq() && current->softirqs_raised) {
16784 +               current->softirq_nestcnt++;
16785 +               do_current_softirqs();
16786 +               current->softirq_nestcnt--;
16787 +       }
16788 +}
16789 +
16790 +static void do_raise_softirq_irqoff(unsigned int nr)
16791 +{
16792 +       unsigned int mask;
16793 +
16794 +       mask = 1UL << nr;
16795 +
16796 +       trace_softirq_raise(nr);
16797 +       or_softirq_pending(mask);
16798 +
16799 +       /*
16800 +        * If we are not in a hard interrupt and inside a bh disabled
16801 +        * region, we simply raise the flag on current. local_bh_enable()
16802 +        * will make sure that the softirq is executed. Otherwise we
16803 +        * delegate it to ksoftirqd.
16804 +        */
16805 +       if (!in_irq() && current->softirq_nestcnt)
16806 +               current->softirqs_raised |= mask;
16807 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
16808 +               return;
16809 +
16810 +       if (mask & TIMER_SOFTIRQS)
16811 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16812 +       else
16813 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16814 +}
16815 +
16816 +static void wakeup_proper_softirq(unsigned int nr)
16817 +{
16818 +       if ((1UL << nr) & TIMER_SOFTIRQS)
16819 +               wakeup_timer_softirqd();
16820 +       else
16821 +               wakeup_softirqd();
16822 +}
16823 +
16824 +void __raise_softirq_irqoff(unsigned int nr)
16825 +{
16826 +       do_raise_softirq_irqoff(nr);
16827 +       if (!in_irq() && !current->softirq_nestcnt)
16828 +               wakeup_proper_softirq(nr);
16829 +}
16830 +
16831 +/*
16832 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
16833 + */
16834 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
16835 +{
16836 +       unsigned int mask;
16837 +
16838 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
16839 +                        !__this_cpu_read(ktimer_softirqd)))
16840 +               return;
16841 +       mask = 1UL << nr;
16842 +
16843 +       trace_softirq_raise(nr);
16844 +       or_softirq_pending(mask);
16845 +       if (mask & TIMER_SOFTIRQS)
16846 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16847 +       else
16848 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16849 +       wakeup_proper_softirq(nr);
16850 +}
16851 +
16852 +/*
16853 + * This function must run with irqs disabled!
16854 + */
16855 +void raise_softirq_irqoff(unsigned int nr)
16856 +{
16857 +       do_raise_softirq_irqoff(nr);
16858 +
16859 +       /*
16860 +        * If we're in a hard interrupt we let the irq return code deal
16861 +        * with the wakeup of ksoftirqd.
16862 +        */
16863 +       if (in_irq())
16864 +               return;
16865 +       /*
16866 +        * If we are in thread context but outside of a bh disabled
16867 +        * region, we need to wake ksoftirqd as well.
16868 +        *
16869 +        * CHECKME: Some of the places which do that could be wrapped
16870 +        * into local_bh_disable/enable pairs. Though it's unclear
16871 +        * whether this is worth the effort. To find those places just
16872 +        * raise a WARN() if the condition is met.
16873 +        */
16874 +       if (!current->softirq_nestcnt)
16875 +               wakeup_proper_softirq(nr);
16876 +}
16877 +
16878 +static inline int ksoftirqd_softirq_pending(void)
16879 +{
16880 +       return current->softirqs_raised;
16881 +}
16882 +
16883 +static inline void local_bh_disable_nort(void) { }
16884 +static inline void _local_bh_enable_nort(void) { }
16885 +
16886 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
16887 +{
16888 +       /* Take over all but timer pending softirqs when starting */
16889 +       local_irq_disable();
16890 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
16891 +       local_irq_enable();
16892 +}
16893 +
16894 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
16895 +{
16896 +       struct sched_param param = { .sched_priority = 1 };
16897 +
16898 +       sched_setscheduler(current, SCHED_FIFO, &param);
16899 +
16900 +       /* Take over timer pending softirqs when starting */
16901 +       local_irq_disable();
16902 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
16903 +       local_irq_enable();
16904 +}
16905 +
16906 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
16907 +                                                   bool online)
16908 +{
16909 +       struct sched_param param = { .sched_priority = 0 };
16910 +
16911 +       sched_setscheduler(current, SCHED_NORMAL, &param);
16912 +}
16913 +
16914 +static int ktimer_softirqd_should_run(unsigned int cpu)
16915 +{
16916 +       return current->softirqs_raised;
16917 +}
16918 +
16919 +#endif /* PREEMPT_RT_FULL */
16920 +/*
16921   * Enter an interrupt context.
16922   */
16923  void irq_enter(void)
16924 @@ -341,9 +784,9 @@ void irq_enter(void)
16925                  * Prevent raise_softirq from needlessly waking up ksoftirqd
16926                  * here, as softirq will be serviced on return from interrupt.
16927                  */
16928 -               local_bh_disable();
16929 +               local_bh_disable_nort();
16930                 tick_irq_enter();
16931 -               _local_bh_enable();
16932 +               _local_bh_enable_nort();
16933         }
16934  
16935         __irq_enter();
16936 @@ -351,9 +794,13 @@ void irq_enter(void)
16937  
16938  static inline void invoke_softirq(void)
16939  {
16940 +#ifdef CONFIG_PREEMPT_RT_FULL
16941 +       unsigned long flags;
16942 +#endif
16943 +
16944         if (ksoftirqd_running())
16945                 return;
16946 -
16947 +#ifndef CONFIG_PREEMPT_RT_FULL
16948         if (!force_irqthreads) {
16949  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
16950                 /*
16951 @@ -373,6 +820,17 @@ static inline void invoke_softirq(void)
16952         } else {
16953                 wakeup_softirqd();
16954         }
16955 +#else /* PREEMPT_RT_FULL */
16956 +
16957 +       local_irq_save(flags);
16958 +       if (__this_cpu_read(ksoftirqd) &&
16959 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
16960 +               wakeup_softirqd();
16961 +       if (__this_cpu_read(ktimer_softirqd) &&
16962 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
16963 +               wakeup_timer_softirqd();
16964 +       local_irq_restore(flags);
16965 +#endif
16966  }
16967  
16968  static inline void tick_irq_exit(void)
16969 @@ -409,26 +867,6 @@ void irq_exit(void)
16970         trace_hardirq_exit(); /* must be last! */
16971  }
16972  
16973 -/*
16974 - * This function must run with irqs disabled!
16975 - */
16976 -inline void raise_softirq_irqoff(unsigned int nr)
16977 -{
16978 -       __raise_softirq_irqoff(nr);
16979 -
16980 -       /*
16981 -        * If we're in an interrupt or softirq, we're done
16982 -        * (this also catches softirq-disabled code). We will
16983 -        * actually run the softirq once we return from
16984 -        * the irq or softirq.
16985 -        *
16986 -        * Otherwise we wake up ksoftirqd to make sure we
16987 -        * schedule the softirq soon.
16988 -        */
16989 -       if (!in_interrupt())
16990 -               wakeup_softirqd();
16991 -}
16992 -
16993  void raise_softirq(unsigned int nr)
16994  {
16995         unsigned long flags;
16996 @@ -438,12 +876,6 @@ void raise_softirq(unsigned int nr)
16997         local_irq_restore(flags);
16998  }
16999  
17000 -void __raise_softirq_irqoff(unsigned int nr)
17001 -{
17002 -       trace_softirq_raise(nr);
17003 -       or_softirq_pending(1UL << nr);
17004 -}
17005 -
17006  void open_softirq(int nr, void (*action)(struct softirq_action *))
17007  {
17008         softirq_vec[nr].action = action;
17009 @@ -460,15 +892,45 @@ struct tasklet_head {
17010  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
17011  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
17012  
17013 +static inline void
17014 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
17015 +{
17016 +       if (tasklet_trylock(t)) {
17017 +again:
17018 +               /* We may have been preempted before tasklet_trylock
17019 +                * and __tasklet_action may have already run.
17020 +                * So double-check the sched bit while the tasklet
17021 +                * is locked before adding it to the list.
17022 +                */
17023 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
17024 +                       t->next = NULL;
17025 +                       *head->tail = t;
17026 +                       head->tail = &(t->next);
17027 +                       raise_softirq_irqoff(nr);
17028 +                       tasklet_unlock(t);
17029 +               } else {
17030 +                       /* This is subtle. If we hit the corner case above,
17031 +                        * it is possible that we get preempted right here:
17032 +                        * another task may have successfully called
17033 +                        * tasklet_schedule(), then entered this function, and
17034 +                        * failed on the trylock. Thus we must be sure,
17035 +                        * before releasing the tasklet lock, that the
17036 +                        * SCHED bit is clear. Otherwise the tasklet may
17037 +                        * get its SCHED bit set but never be added to the
17038 +                        * list.
17039 +                        */
17040 +                       if (!tasklet_tryunlock(t))
17041 +                               goto again;
17042 +               }
17043 +       }
17044 +}
17045 +
17046  void __tasklet_schedule(struct tasklet_struct *t)
17047  {
17048         unsigned long flags;
17049  
17050         local_irq_save(flags);
17051 -       t->next = NULL;
17052 -       *__this_cpu_read(tasklet_vec.tail) = t;
17053 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
17054 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
17055 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
17056         local_irq_restore(flags);
17057  }
17058  EXPORT_SYMBOL(__tasklet_schedule);
17059 @@ -478,10 +940,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
17060         unsigned long flags;
17061  
17062         local_irq_save(flags);
17063 -       t->next = NULL;
17064 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
17065 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
17066 -       raise_softirq_irqoff(HI_SOFTIRQ);
17067 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
17068         local_irq_restore(flags);
17069  }
17070  EXPORT_SYMBOL(__tasklet_hi_schedule);
17071 @@ -490,82 +949,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
17072  {
17073         BUG_ON(!irqs_disabled());
17074  
17075 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
17076 -       __this_cpu_write(tasklet_hi_vec.head, t);
17077 -       __raise_softirq_irqoff(HI_SOFTIRQ);
17078 +       __tasklet_hi_schedule(t);
17079  }
17080  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
17081  
17082 -static __latent_entropy void tasklet_action(struct softirq_action *a)
17083 +void tasklet_enable(struct tasklet_struct *t)
17084  {
17085 -       struct tasklet_struct *list;
17086 +       if (!atomic_dec_and_test(&t->count))
17087 +               return;
17088 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
17089 +               tasklet_schedule(t);
17090 +}
17091 +EXPORT_SYMBOL(tasklet_enable);
17092  
17093 -       local_irq_disable();
17094 -       list = __this_cpu_read(tasklet_vec.head);
17095 -       __this_cpu_write(tasklet_vec.head, NULL);
17096 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
17097 -       local_irq_enable();
17098 +static void __tasklet_action(struct softirq_action *a,
17099 +                            struct tasklet_struct *list)
17100 +{
17101 +       int loops = 1000000;
17102  
17103         while (list) {
17104                 struct tasklet_struct *t = list;
17105  
17106                 list = list->next;
17107  
17108 -               if (tasklet_trylock(t)) {
17109 -                       if (!atomic_read(&t->count)) {
17110 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
17111 -                                                       &t->state))
17112 -                                       BUG();
17113 -                               t->func(t->data);
17114 -                               tasklet_unlock(t);
17115 -                               continue;
17116 -                       }
17117 -                       tasklet_unlock(t);
17118 +               /*
17119 +                * Should always succeed - after a tasklist got on the
17120 +                * Should always succeed - after a tasklet got on the
17121 +                * nothing but the tasklet softirq it got queued to can
17122 +                * lock it:
17123 +                */
17124 +               if (!tasklet_trylock(t)) {
17125 +                       WARN_ON(1);
17126 +                       continue;
17127                 }
17128  
17129 -               local_irq_disable();
17130                 t->next = NULL;
17131 -               *__this_cpu_read(tasklet_vec.tail) = t;
17132 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
17133 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
17134 -               local_irq_enable();
17135 +
17136 +               /*
17137 +                * If we cannot handle the tasklet because it's disabled,
17138 +                * mark it as pending. tasklet_enable() will later
17139 +                * re-schedule the tasklet.
17140 +                */
17141 +               if (unlikely(atomic_read(&t->count))) {
17142 +out_disabled:
17143 +                       /* implicit unlock: */
17144 +                       wmb();
17145 +                       t->state = TASKLET_STATEF_PENDING;
17146 +                       continue;
17147 +               }
17148 +
17149 +               /*
17150 +                * From this point on, the tasklet might be rescheduled
17151 +                * on another CPU, but it can only be added to another
17152 +                * CPU's tasklet list if we unlock the tasklet (which we
17153 +                * don't do yet).
17154 +                */
17155 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
17156 +                       WARN_ON(1);
17157 +
17158 +again:
17159 +               t->func(t->data);
17160 +
17161 +               /*
17162 +                * Try to unlock the tasklet. We must use cmpxchg, because
17163 +                * another CPU might have scheduled or disabled the tasklet.
17164 +                * We only allow the STATE_RUN -> 0 transition here.
17165 +                */
17166 +               while (!tasklet_tryunlock(t)) {
17167 +                       /*
17168 +                        * If it got disabled meanwhile, bail out:
17169 +                        */
17170 +                       if (atomic_read(&t->count))
17171 +                               goto out_disabled;
17172 +                       /*
17173 +                        * If it got scheduled meanwhile, re-execute
17174 +                        * the tasklet function:
17175 +                        */
17176 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
17177 +                               goto again;
17178 +                       if (!--loops) {
17179 +                               printk("hm, tasklet state: %08lx\n", t->state);
17180 +                               WARN_ON(1);
17181 +                               tasklet_unlock(t);
17182 +                               break;
17183 +                       }
17184 +               }
17185         }
17186  }
17187  
17188 +static void tasklet_action(struct softirq_action *a)
17189 +{
17190 +       struct tasklet_struct *list;
17191 +
17192 +       local_irq_disable();
17193 +
17194 +       list = __this_cpu_read(tasklet_vec.head);
17195 +       __this_cpu_write(tasklet_vec.head, NULL);
17196 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
17197 +
17198 +       local_irq_enable();
17199 +
17200 +       __tasklet_action(a, list);
17201 +}
17202 +
17203  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
17204  {
17205         struct tasklet_struct *list;
17206  
17207         local_irq_disable();
17208 +
17209         list = __this_cpu_read(tasklet_hi_vec.head);
17210         __this_cpu_write(tasklet_hi_vec.head, NULL);
17211         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
17212 +
17213         local_irq_enable();
17214  
17215 -       while (list) {
17216 -               struct tasklet_struct *t = list;
17217 -
17218 -               list = list->next;
17219 -
17220 -               if (tasklet_trylock(t)) {
17221 -                       if (!atomic_read(&t->count)) {
17222 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
17223 -                                                       &t->state))
17224 -                                       BUG();
17225 -                               t->func(t->data);
17226 -                               tasklet_unlock(t);
17227 -                               continue;
17228 -                       }
17229 -                       tasklet_unlock(t);
17230 -               }
17231 -
17232 -               local_irq_disable();
17233 -               t->next = NULL;
17234 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
17235 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
17236 -               __raise_softirq_irqoff(HI_SOFTIRQ);
17237 -               local_irq_enable();
17238 -       }
17239 +       __tasklet_action(a, list);
17240  }
17241  
17242  void tasklet_init(struct tasklet_struct *t,
17243 @@ -586,7 +1085,7 @@ void tasklet_kill(struct tasklet_struct *t)
17244  
17245         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
17246                 do {
17247 -                       yield();
17248 +                       msleep(1);
17249                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
17250         }
17251         tasklet_unlock_wait(t);
17252 @@ -660,25 +1159,26 @@ void __init softirq_init(void)
17253         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
17254  }
17255  
17256 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
17257 +void tasklet_unlock_wait(struct tasklet_struct *t)
17258 +{
17259 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
17260 +               /*
17261 +                * Hack for now to avoid this busy-loop:
17262 +                */
17263 +#ifdef CONFIG_PREEMPT_RT_FULL
17264 +               msleep(1);
17265 +#else
17266 +               barrier();
17267 +#endif
17268 +       }
17269 +}
17270 +EXPORT_SYMBOL(tasklet_unlock_wait);
17271 +#endif
17272 +
17273  static int ksoftirqd_should_run(unsigned int cpu)
17274  {
17275 -       return local_softirq_pending();
17276 -}
17277 -
17278 -static void run_ksoftirqd(unsigned int cpu)
17279 -{
17280 -       local_irq_disable();
17281 -       if (local_softirq_pending()) {
17282 -               /*
17283 -                * We can safely run softirq on inline stack, as we are not deep
17284 -                * in the task stack here.
17285 -                */
17286 -               __do_softirq();
17287 -               local_irq_enable();
17288 -               cond_resched_rcu_qs();
17289 -               return;
17290 -       }
17291 -       local_irq_enable();
17292 +       return ksoftirqd_softirq_pending();
17293  }
17294  
17295  #ifdef CONFIG_HOTPLUG_CPU
17296 @@ -745,17 +1245,31 @@ static int takeover_tasklets(unsigned int cpu)
17297  
17298  static struct smp_hotplug_thread softirq_threads = {
17299         .store                  = &ksoftirqd,
17300 +       .setup                  = ksoftirqd_set_sched_params,
17301         .thread_should_run      = ksoftirqd_should_run,
17302         .thread_fn              = run_ksoftirqd,
17303         .thread_comm            = "ksoftirqd/%u",
17304  };
17305  
17306 +#ifdef CONFIG_PREEMPT_RT_FULL
17307 +static struct smp_hotplug_thread softirq_timer_threads = {
17308 +       .store                  = &ktimer_softirqd,
17309 +       .setup                  = ktimer_softirqd_set_sched_params,
17310 +       .cleanup                = ktimer_softirqd_clr_sched_params,
17311 +       .thread_should_run      = ktimer_softirqd_should_run,
17312 +       .thread_fn              = run_ksoftirqd,
17313 +       .thread_comm            = "ktimersoftd/%u",
17314 +};
17315 +#endif
17316 +
17317  static __init int spawn_ksoftirqd(void)
17318  {
17319         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
17320                                   takeover_tasklets);
17321         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
17322 -
17323 +#ifdef CONFIG_PREEMPT_RT_FULL
17324 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
17325 +#endif
17326         return 0;
17327  }
17328  early_initcall(spawn_ksoftirqd);
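
The reworked tasklet paths above treat t->state as a small lock-protected
state machine: tasklet_trylock() atomically claims the RUN bit, and
tasklet_tryunlock() only succeeds for the exact RUN -> 0 transition, so a
tasklet that was rescheduled while its callback ran forces the
while (!tasklet_tryunlock(t)) loop in __tasklet_action() to run it again.
A minimal user-space sketch of that protocol, with invented names and bit
positions rather than the kernel's <linux/interrupt.h> definitions:

#include <stdatomic.h>
#include <stdbool.h>

enum { STATE_SCHED = 1UL << 0, STATE_RUN = 1UL << 1 };

static bool model_trylock(atomic_ulong *state)
{
        /* like tasklet_trylock(): claim the RUN bit atomically */
        return !(atomic_fetch_or(state, STATE_RUN) & STATE_RUN);
}

static bool model_tryunlock(atomic_ulong *state)
{
        /*
         * Only the exact RUN -> 0 transition succeeds; if SCHED was set
         * while the callback ran, the caller has to notice and retry.
         */
        unsigned long expected = STATE_RUN;

        return atomic_compare_exchange_strong(state, &expected, 0UL);
}
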
17329 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
17330 index ec9ab2f01489..8b89dbedeaff 100644
17331 --- a/kernel/stop_machine.c
17332 +++ b/kernel/stop_machine.c
17333 @@ -36,7 +36,7 @@ struct cpu_stop_done {
17334  struct cpu_stopper {
17335         struct task_struct      *thread;
17336  
17337 -       spinlock_t              lock;
17338 +       raw_spinlock_t          lock;
17339         bool                    enabled;        /* is this stopper enabled? */
17340         struct list_head        works;          /* list of pending works */
17341  
17342 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
17343         unsigned long flags;
17344         bool enabled;
17345  
17346 -       spin_lock_irqsave(&stopper->lock, flags);
17347 +       raw_spin_lock_irqsave(&stopper->lock, flags);
17348         enabled = stopper->enabled;
17349         if (enabled)
17350                 __cpu_stop_queue_work(stopper, work);
17351         else if (work->done)
17352                 cpu_stop_signal_done(work->done);
17353 -       spin_unlock_irqrestore(&stopper->lock, flags);
17354  
17355 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
17356         return enabled;
17357  }
17358  
17359 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
17360         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
17361         int err;
17362  retry:
17363 -       spin_lock_irq(&stopper1->lock);
17364 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17365 +       raw_spin_lock_irq(&stopper1->lock);
17366 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17367  
17368         err = -ENOENT;
17369         if (!stopper1->enabled || !stopper2->enabled)
17370 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
17371         __cpu_stop_queue_work(stopper1, work1);
17372         __cpu_stop_queue_work(stopper2, work2);
17373  unlock:
17374 -       spin_unlock(&stopper2->lock);
17375 -       spin_unlock_irq(&stopper1->lock);
17376 +       raw_spin_unlock(&stopper2->lock);
17377 +       raw_spin_unlock_irq(&stopper1->lock);
17378  
17379         if (unlikely(err == -EDEADLK)) {
17380                 while (stop_cpus_in_progress)
17381 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
17382         unsigned long flags;
17383         int run;
17384  
17385 -       spin_lock_irqsave(&stopper->lock, flags);
17386 +       raw_spin_lock_irqsave(&stopper->lock, flags);
17387         run = !list_empty(&stopper->works);
17388 -       spin_unlock_irqrestore(&stopper->lock, flags);
17389 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
17390         return run;
17391  }
17392  
17393 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
17394  
17395  repeat:
17396         work = NULL;
17397 -       spin_lock_irq(&stopper->lock);
17398 +       raw_spin_lock_irq(&stopper->lock);
17399         if (!list_empty(&stopper->works)) {
17400                 work = list_first_entry(&stopper->works,
17401                                         struct cpu_stop_work, list);
17402                 list_del_init(&work->list);
17403         }
17404 -       spin_unlock_irq(&stopper->lock);
17405 +       raw_spin_unlock_irq(&stopper->lock);
17406  
17407         if (work) {
17408                 cpu_stop_fn_t fn = work->fn;
17409 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
17410                 struct cpu_stop_done *done = work->done;
17411                 int ret;
17412  
17413 +               /* XXX */
17414 +
17415                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
17416                 preempt_count_inc();
17417                 ret = fn(arg);
17418 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
17419         for_each_possible_cpu(cpu) {
17420                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
17421  
17422 -               spin_lock_init(&stopper->lock);
17423 +               raw_spin_lock_init(&stopper->lock);
17424                 INIT_LIST_HEAD(&stopper->works);
17425         }
17426  
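
The stop_machine changes above convert the per-CPU stopper lock from
spinlock_t to raw_spinlock_t because spinlock_t becomes a sleeping rtmutex
on PREEMPT_RT, while this lock is taken in contexts that must not sleep
(the callback itself runs with the preempt count raised). The conversion
follows the usual raw-spinlock pattern; a small sketch with made-up names
(example_lock, example_queue_work), not code from the patch:

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static LIST_HEAD(example_works);

static void example_queue_work(struct list_head *work)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&example_lock, flags);
        list_add_tail(work, &example_works);    /* short, non-sleeping section */
        raw_spin_unlock_irqrestore(&example_lock, flags);
}
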
17427 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
17428 index bb5ec425dfe0..8338b14ed3a3 100644
17429 --- a/kernel/time/hrtimer.c
17430 +++ b/kernel/time/hrtimer.c
17431 @@ -53,6 +53,7 @@
17432  #include <asm/uaccess.h>
17433  
17434  #include <trace/events/timer.h>
17435 +#include <trace/events/hist.h>
17436  
17437  #include "tick-internal.h"
17438  
17439 @@ -695,6 +696,29 @@ static void hrtimer_switch_to_hres(void)
17440         retrigger_next_event(NULL);
17441  }
17442  
17443 +#ifdef CONFIG_PREEMPT_RT_FULL
17444 +
17445 +static struct swork_event clock_set_delay_work;
17446 +
17447 +static void run_clock_set_delay(struct swork_event *event)
17448 +{
17449 +       clock_was_set();
17450 +}
17451 +
17452 +void clock_was_set_delayed(void)
17453 +{
17454 +       swork_queue(&clock_set_delay_work);
17455 +}
17456 +
17457 +static __init int create_clock_set_delay_thread(void)
17458 +{
17459 +       WARN_ON(swork_get());
17460 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
17461 +       return 0;
17462 +}
17463 +early_initcall(create_clock_set_delay_thread);
17464 +#else /* PREEMPT_RT_FULL */
17465 +
17466  static void clock_was_set_work(struct work_struct *work)
17467  {
17468         clock_was_set();
17469 @@ -710,6 +734,7 @@ void clock_was_set_delayed(void)
17470  {
17471         schedule_work(&hrtimer_work);
17472  }
17473 +#endif
17474  
17475  #else
17476  
17477 @@ -719,11 +744,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
17478  static inline void hrtimer_switch_to_hres(void) { }
17479  static inline void
17480  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
17481 -static inline int hrtimer_reprogram(struct hrtimer *timer,
17482 -                                   struct hrtimer_clock_base *base)
17483 -{
17484 -       return 0;
17485 -}
17486 +static inline void hrtimer_reprogram(struct hrtimer *timer,
17487 +                                    struct hrtimer_clock_base *base) { }
17488  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
17489  static inline void retrigger_next_event(void *arg) { }
17490  
17491 @@ -855,6 +877,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
17492  }
17493  EXPORT_SYMBOL_GPL(hrtimer_forward);
17494  
17495 +#ifdef CONFIG_PREEMPT_RT_BASE
17496 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
17497 +
17498 +/**
17499 + * hrtimer_wait_for_timer - Wait for a running timer
17500 + *
17501 + * @timer:     timer to wait for
17502 + *
17503 + * The function waits on the waitqueue of the timer base in case
17504 + * the timer's callback function is currently executing. The
17505 + * waitqueue is woken up after the timer callback function has
17506 + * finished execution.
17507 + */
17508 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
17509 +{
17510 +       struct hrtimer_clock_base *base = timer->base;
17511 +
17512 +       if (base && base->cpu_base && !timer->irqsafe)
17513 +               wait_event(base->cpu_base->wait,
17514 +                               !(hrtimer_callback_running(timer)));
17515 +}
17516 +
17517 +#else
17518 +# define wake_up_timer_waiters(b)      do { } while (0)
17519 +#endif
17520 +
17521  /*
17522   * enqueue_hrtimer - internal function to (re)start a timer
17523   *
17524 @@ -896,6 +944,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
17525         if (!(state & HRTIMER_STATE_ENQUEUED))
17526                 return;
17527  
17528 +       if (unlikely(!list_empty(&timer->cb_entry))) {
17529 +               list_del_init(&timer->cb_entry);
17530 +               return;
17531 +       }
17532 +
17533         if (!timerqueue_del(&base->active, &timer->node))
17534                 cpu_base->active_bases &= ~(1 << base->index);
17535  
17536 @@ -991,7 +1044,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
17537         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
17538  
17539         timer_stats_hrtimer_set_start_info(timer);
17540 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
17541 +       {
17542 +               ktime_t now = new_base->get_time();
17543  
17544 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
17545 +                       timer->praecox = now;
17546 +               else
17547 +                       timer->praecox = ktime_set(0, 0);
17548 +       }
17549 +#endif
17550         leftmost = enqueue_hrtimer(timer, new_base);
17551         if (!leftmost)
17552                 goto unlock;
17553 @@ -1063,7 +1125,7 @@ int hrtimer_cancel(struct hrtimer *timer)
17554  
17555                 if (ret >= 0)
17556                         return ret;
17557 -               cpu_relax();
17558 +               hrtimer_wait_for_timer(timer);
17559         }
17560  }
17561  EXPORT_SYMBOL_GPL(hrtimer_cancel);
17562 @@ -1127,6 +1189,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
17563  
17564         base = hrtimer_clockid_to_base(clock_id);
17565         timer->base = &cpu_base->clock_base[base];
17566 +       INIT_LIST_HEAD(&timer->cb_entry);
17567         timerqueue_init(&timer->node);
17568  
17569  #ifdef CONFIG_TIMER_STATS
17570 @@ -1167,6 +1230,7 @@ bool hrtimer_active(const struct hrtimer *timer)
17571                 seq = raw_read_seqcount_begin(&cpu_base->seq);
17572  
17573                 if (timer->state != HRTIMER_STATE_INACTIVE ||
17574 +                   cpu_base->running_soft == timer ||
17575                     cpu_base->running == timer)
17576                         return true;
17577  
17578 @@ -1265,10 +1329,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
17579         cpu_base->running = NULL;
17580  }
17581  
17582 +#ifdef CONFIG_PREEMPT_RT_BASE
17583 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
17584 +                                struct hrtimer_clock_base *base)
17585 +{
17586 +       int leftmost;
17587 +
17588 +       if (restart != HRTIMER_NORESTART &&
17589 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
17590 +
17591 +               leftmost = enqueue_hrtimer(timer, base);
17592 +               if (!leftmost)
17593 +                       return;
17594 +#ifdef CONFIG_HIGH_RES_TIMERS
17595 +               if (!hrtimer_is_hres_active(timer)) {
17596 +                       /*
17597 +                        * Kick to reschedule the next tick to handle the new timer
17598 +                        * on dynticks target.
17599 +                        */
17600 +                       if (base->cpu_base->nohz_active)
17601 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
17602 +               } else {
17603 +
17604 +                       hrtimer_reprogram(timer, base);
17605 +               }
17606 +#endif
17607 +       }
17608 +}
17609 +
17610 +/*
17611 + * The changes in mainline which removed the callback modes from
17612 + * hrtimer are not yet working with -rt. The non wakeup_process()
17613 + * based callbacks which involve sleeping locks need to be treated
17614 + * separately.
17615 + */
17616 +static void hrtimer_rt_run_pending(void)
17617 +{
17618 +       enum hrtimer_restart (*fn)(struct hrtimer *);
17619 +       struct hrtimer_cpu_base *cpu_base;
17620 +       struct hrtimer_clock_base *base;
17621 +       struct hrtimer *timer;
17622 +       int index, restart;
17623 +
17624 +       local_irq_disable();
17625 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
17626 +
17627 +       raw_spin_lock(&cpu_base->lock);
17628 +
17629 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
17630 +               base = &cpu_base->clock_base[index];
17631 +
17632 +               while (!list_empty(&base->expired)) {
17633 +                       timer = list_first_entry(&base->expired,
17634 +                                                struct hrtimer, cb_entry);
17635 +
17636 +                       /*
17637 +                        * Same as the __run_hrtimer function above,
17638 +                        * except that we run with interrupts enabled.
17639 +                        */
17640 +                       debug_deactivate(timer);
17641 +                       cpu_base->running_soft = timer;
17642 +                       raw_write_seqcount_barrier(&cpu_base->seq);
17643 +
17644 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
17645 +                       timer_stats_account_hrtimer(timer);
17646 +                       fn = timer->function;
17647 +
17648 +                       raw_spin_unlock_irq(&cpu_base->lock);
17649 +                       restart = fn(timer);
17650 +                       raw_spin_lock_irq(&cpu_base->lock);
17651 +
17652 +                       hrtimer_rt_reprogram(restart, timer, base);
17653 +                       raw_write_seqcount_barrier(&cpu_base->seq);
17654 +
17655 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
17656 +                       cpu_base->running_soft = NULL;
17657 +               }
17658 +       }
17659 +
17660 +       raw_spin_unlock_irq(&cpu_base->lock);
17661 +
17662 +       wake_up_timer_waiters(cpu_base);
17663 +}
17664 +
17665 +static int hrtimer_rt_defer(struct hrtimer *timer)
17666 +{
17667 +       if (timer->irqsafe)
17668 +               return 0;
17669 +
17670 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
17671 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
17672 +       return 1;
17673 +}
17674 +
17675 +#else
17676 +
17677 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
17678 +
17679 +#endif
17680 +
17681 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
17682 +
17683  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
17684  {
17685         struct hrtimer_clock_base *base = cpu_base->clock_base;
17686         unsigned int active = cpu_base->active_bases;
17687 +       int raise = 0;
17688  
17689         for (; active; base++, active >>= 1) {
17690                 struct timerqueue_node *node;
17691 @@ -1284,6 +1450,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
17692  
17693                         timer = container_of(node, struct hrtimer, node);
17694  
17695 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
17696 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
17697 +                               timer->praecox : hrtimer_get_expires(timer),
17698 +                               basenow)),
17699 +                           current,
17700 +                           timer->function == hrtimer_wakeup ?
17701 +                           container_of(timer, struct hrtimer_sleeper,
17702 +                               timer)->task : NULL);
17703 +
17704                         /*
17705                          * The immediate goal for using the softexpires is
17706                          * minimizing wakeups, not running timers at the
17707 @@ -1299,9 +1474,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
17708                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
17709                                 break;
17710  
17711 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
17712 +                       if (!hrtimer_rt_defer(timer))
17713 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
17714 +                       else
17715 +                               raise = 1;
17716                 }
17717         }
17718 +       if (raise)
17719 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
17720  }
17721  
17722  #ifdef CONFIG_HIGH_RES_TIMERS
17723 @@ -1464,16 +1644,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
17724  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
17725  {
17726         sl->timer.function = hrtimer_wakeup;
17727 +       sl->timer.irqsafe = 1;
17728         sl->task = task;
17729  }
17730  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
17731  
17732 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
17733 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
17734 +                               unsigned long state)
17735  {
17736         hrtimer_init_sleeper(t, current);
17737  
17738         do {
17739 -               set_current_state(TASK_INTERRUPTIBLE);
17740 +               set_current_state(state);
17741                 hrtimer_start_expires(&t->timer, mode);
17742  
17743                 if (likely(t->task))
17744 @@ -1515,7 +1697,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
17745                                 HRTIMER_MODE_ABS);
17746         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
17747  
17748 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
17749 +       /* cpu_chill() does not care about restart state. */
17750 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
17751                 goto out;
17752  
17753         rmtp = restart->nanosleep.rmtp;
17754 @@ -1532,8 +1715,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
17755         return ret;
17756  }
17757  
17758 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17759 -                      const enum hrtimer_mode mode, const clockid_t clockid)
17760 +static long
17761 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17762 +                   const enum hrtimer_mode mode, const clockid_t clockid,
17763 +                   unsigned long state)
17764  {
17765         struct restart_block *restart;
17766         struct hrtimer_sleeper t;
17767 @@ -1546,7 +1731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17768  
17769         hrtimer_init_on_stack(&t.timer, clockid, mode);
17770         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
17771 -       if (do_nanosleep(&t, mode))
17772 +       if (do_nanosleep(&t, mode, state))
17773                 goto out;
17774  
17775         /* Absolute timers do not update the rmtp value and restart: */
17776 @@ -1573,6 +1758,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17777         return ret;
17778  }
17779  
17780 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17781 +                      const enum hrtimer_mode mode, const clockid_t clockid)
17782 +{
17783 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
17784 +}
17785 +
17786  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
17787                 struct timespec __user *, rmtp)
17788  {
17789 @@ -1587,6 +1778,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
17790         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
17791  }
17792  
17793 +#ifdef CONFIG_PREEMPT_RT_FULL
17794 +/*
17795 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
17796 + */
17797 +void cpu_chill(void)
17798 +{
17799 +       struct timespec tu = {
17800 +               .tv_nsec = NSEC_PER_MSEC,
17801 +       };
17802 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
17803 +
17804 +       current->flags |= PF_NOFREEZE;
17805 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
17806 +                           TASK_UNINTERRUPTIBLE);
17807 +       if (!freeze_flag)
17808 +               current->flags &= ~PF_NOFREEZE;
17809 +}
17810 +EXPORT_SYMBOL(cpu_chill);
17811 +#endif
17812 +
17813  /*
17814   * Functions related to boot-time initialization:
17815   */
17816 @@ -1598,10 +1809,14 @@ int hrtimers_prepare_cpu(unsigned int cpu)
17817         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
17818                 cpu_base->clock_base[i].cpu_base = cpu_base;
17819                 timerqueue_init_head(&cpu_base->clock_base[i].active);
17820 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
17821         }
17822  
17823         cpu_base->cpu = cpu;
17824         hrtimer_init_hres(cpu_base);
17825 +#ifdef CONFIG_PREEMPT_RT_BASE
17826 +       init_waitqueue_head(&cpu_base->wait);
17827 +#endif
17828         return 0;
17829  }
17830  
17831 @@ -1671,9 +1886,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
17832  
17833  #endif /* CONFIG_HOTPLUG_CPU */
17834  
17835 +#ifdef CONFIG_PREEMPT_RT_BASE
17836 +
17837 +static void run_hrtimer_softirq(struct softirq_action *h)
17838 +{
17839 +       hrtimer_rt_run_pending();
17840 +}
17841 +
17842 +static void hrtimers_open_softirq(void)
17843 +{
17844 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
17845 +}
17846 +
17847 +#else
17848 +static void hrtimers_open_softirq(void) { }
17849 +#endif
17850 +
17851  void __init hrtimers_init(void)
17852  {
17853         hrtimers_prepare_cpu(smp_processor_id());
17854 +       hrtimers_open_softirq();
17855  }
17856  
17857  /**
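
cpu_chill(), added above, is the RT replacement for busy retry loops:
instead of spinning with cpu_relax() (which can starve the owner of the
resource when the spinner has higher priority), the caller sleeps for
about one millisecond in TASK_UNINTERRUPTIBLE with PF_NOFREEZE held.
A hedged sketch of a typical caller; wait_for_bit_rt() and its parameters
are invented, only cpu_chill()/cpu_relax() come from the kernel:

#include <linux/bitops.h>
#include <linux/delay.h>        /* assumed location of the RT cpu_chill() declaration */

static void wait_for_bit_rt(unsigned long *word, int bit)
{
        while (test_bit(bit, word)) {
#ifdef CONFIG_PREEMPT_RT_FULL
                cpu_chill();    /* sleep ~1 ms and let the bit owner run */
#else
                cpu_relax();
#endif
        }
}
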
17858 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
17859 index 1d5c7204ddc9..184de6751180 100644
17860 --- a/kernel/time/itimer.c
17861 +++ b/kernel/time/itimer.c
17862 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
17863                 /* We are sharing ->siglock with it_real_fn() */
17864                 if (hrtimer_try_to_cancel(timer) < 0) {
17865                         spin_unlock_irq(&tsk->sighand->siglock);
17866 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
17867                         goto again;
17868                 }
17869                 expires = timeval_to_ktime(value->it_value);
17870 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
17871 index 555e21f7b966..a5d6435fabbb 100644
17872 --- a/kernel/time/jiffies.c
17873 +++ b/kernel/time/jiffies.c
17874 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
17875         .max_cycles     = 10,
17876  };
17877  
17878 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
17879 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
17880 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
17881  
17882  #if (BITS_PER_LONG < 64)
17883  u64 get_jiffies_64(void)
17884 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
17885         u64 ret;
17886  
17887         do {
17888 -               seq = read_seqbegin(&jiffies_lock);
17889 +               seq = read_seqcount_begin(&jiffies_seq);
17890                 ret = jiffies_64;
17891 -       } while (read_seqretry(&jiffies_lock, seq));
17892 +       } while (read_seqcount_retry(&jiffies_seq, seq));
17893         return ret;
17894  }
17895  EXPORT_SYMBOL(get_jiffies_64);
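
The seqlock jiffies_lock is split above into a raw spinlock for writer
serialization (which still spins on RT) and a bare seqcount_t that readers
retry on; the tick-common, tick-sched and timekeeping hunks below switch
their call sites to the same pairing. Roughly, with example_* as invented
names and the declarations coming from the patched kernel/time/timekeeping.h:

static void example_jiffies_writer(void)
{
        raw_spin_lock(&jiffies_lock);           /* serialize writers */
        write_seqcount_begin(&jiffies_seq);     /* readers will retry */
        jiffies_64 += 1;
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);
}

static u64 example_jiffies_reader(void)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqcount_begin(&jiffies_seq);
                val = jiffies_64;
        } while (read_seqcount_retry(&jiffies_seq, seq));
        return val;
}
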
17896 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
17897 index 6df8927c58a5..05b7391bf9bd 100644
17898 --- a/kernel/time/ntp.c
17899 +++ b/kernel/time/ntp.c
17900 @@ -17,6 +17,7 @@
17901  #include <linux/module.h>
17902  #include <linux/rtc.h>
17903  #include <linux/math64.h>
17904 +#include <linux/swork.h>
17905  
17906  #include "ntp_internal.h"
17907  #include "timekeeping_internal.h"
17908 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
17909                            &sync_cmos_work, timespec64_to_jiffies(&next));
17910  }
17911  
17912 +#ifdef CONFIG_PREEMPT_RT_FULL
17913 +
17914 +static void run_clock_set_delay(struct swork_event *event)
17915 +{
17916 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17917 +}
17918 +
17919 +static struct swork_event ntp_cmos_swork;
17920 +
17921 +void ntp_notify_cmos_timer(void)
17922 +{
17923 +       swork_queue(&ntp_cmos_swork);
17924 +}
17925 +
17926 +static __init int create_cmos_delay_thread(void)
17927 +{
17928 +       WARN_ON(swork_get());
17929 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
17930 +       return 0;
17931 +}
17932 +early_initcall(create_cmos_delay_thread);
17933 +
17934 +#else
17935 +
17936  void ntp_notify_cmos_timer(void)
17937  {
17938         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17939  }
17940 +#endif /* CONFIG_PREEMPT_RT_FULL */
17941  
17942  #else
17943  void ntp_notify_cmos_timer(void) { }
17944 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
17945 index 39008d78927a..633f4eaca9e7 100644
17946 --- a/kernel/time/posix-cpu-timers.c
17947 +++ b/kernel/time/posix-cpu-timers.c
17948 @@ -3,6 +3,7 @@
17949   */
17950  
17951  #include <linux/sched.h>
17952 +#include <linux/sched/rt.h>
17953  #include <linux/posix-timers.h>
17954  #include <linux/errno.h>
17955  #include <linux/math64.h>
17956 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
17957         /*
17958          * Disarm any old timer after extracting its expiry time.
17959          */
17960 -       WARN_ON_ONCE(!irqs_disabled());
17961 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
17962  
17963         ret = 0;
17964         old_incr = timer->it.cpu.incr;
17965 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
17966         /*
17967          * Now re-arm for the new expiry time.
17968          */
17969 -       WARN_ON_ONCE(!irqs_disabled());
17970 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
17971         arm_timer(timer);
17972         unlock_task_sighand(p, &flags);
17973  
17974 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
17975   * already updated our counts.  We need to check if any timers fire now.
17976   * Interrupts are disabled.
17977   */
17978 -void run_posix_cpu_timers(struct task_struct *tsk)
17979 +static void __run_posix_cpu_timers(struct task_struct *tsk)
17980  {
17981         LIST_HEAD(firing);
17982         struct k_itimer *timer, *next;
17983         unsigned long flags;
17984  
17985 -       WARN_ON_ONCE(!irqs_disabled());
17986 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
17987  
17988         /*
17989          * The fast path checks that there are no expired thread or thread
17990 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
17991         }
17992  }
17993  
17994 +#ifdef CONFIG_PREEMPT_RT_BASE
17995 +#include <linux/kthread.h>
17996 +#include <linux/cpu.h>
17997 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
17998 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
17999 +
18000 +static int posix_cpu_timers_thread(void *data)
18001 +{
18002 +       int cpu = (long)data;
18003 +
18004 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
18005 +
18006 +       while (!kthread_should_stop()) {
18007 +               struct task_struct *tsk = NULL;
18008 +               struct task_struct *next = NULL;
18009 +
18010 +               if (cpu_is_offline(cpu))
18011 +                       goto wait_to_die;
18012 +
18013 +               /* grab task list */
18014 +               raw_local_irq_disable();
18015 +               tsk = per_cpu(posix_timer_tasklist, cpu);
18016 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
18017 +               raw_local_irq_enable();
18018 +
18019 +               /* it's possible the list is empty; sleep until we are woken again */
18020 +               if (!tsk) {
18021 +                       set_current_state(TASK_INTERRUPTIBLE);
18022 +                       schedule();
18023 +                       __set_current_state(TASK_RUNNING);
18024 +                       continue;
18025 +               }
18026 +
18027 +               /* Process task list */
18028 +               while (1) {
18029 +                       /* save next */
18030 +                       next = tsk->posix_timer_list;
18031 +
18032 +                       /* run the task's timers, clear its list pointer
18033 +                        * and drop our reference
18034 +                        */
18035 +                       __run_posix_cpu_timers(tsk);
18036 +                       tsk->posix_timer_list = NULL;
18037 +                       put_task_struct(tsk);
18038 +
18039 +                       /* check if this is the last on the list */
18040 +                       if (next == tsk)
18041 +                               break;
18042 +                       tsk = next;
18043 +               }
18044 +       }
18045 +       return 0;
18046 +
18047 +wait_to_die:
18048 +       /* Wait for kthread_stop */
18049 +       set_current_state(TASK_INTERRUPTIBLE);
18050 +       while (!kthread_should_stop()) {
18051 +               schedule();
18052 +               set_current_state(TASK_INTERRUPTIBLE);
18053 +       }
18054 +       __set_current_state(TASK_RUNNING);
18055 +       return 0;
18056 +}
18057 +
18058 +static inline int __fastpath_timer_check(struct task_struct *tsk)
18059 +{
18060 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
18061 +       if (unlikely(tsk->exit_state))
18062 +               return 0;
18063 +
18064 +       if (!task_cputime_zero(&tsk->cputime_expires))
18065 +               return 1;
18066 +
18067 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
18068 +               return 1;
18069 +
18070 +       return 0;
18071 +}
18072 +
18073 +void run_posix_cpu_timers(struct task_struct *tsk)
18074 +{
18075 +       unsigned long cpu = smp_processor_id();
18076 +       struct task_struct *tasklist;
18077 +
18078 +       BUG_ON(!irqs_disabled());
18079 +       if (!per_cpu(posix_timer_task, cpu))
18080 +               return;
18081 +       /* get per-cpu references */
18082 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
18083 +
18084 +       /* check to see if we're already queued */
18085 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
18086 +               get_task_struct(tsk);
18087 +               if (tasklist) {
18088 +                       tsk->posix_timer_list = tasklist;
18089 +               } else {
18090 +                       /*
18091 +                        * The list is terminated by a self-pointing
18092 +                        * task_struct
18093 +                        */
18094 +                       tsk->posix_timer_list = tsk;
18095 +               }
18096 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
18097 +
18098 +               wake_up_process(per_cpu(posix_timer_task, cpu));
18099 +       }
18100 +}
18101 +
18102 +/*
18103 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
18104 + * Here we can start up the necessary posix timer thread for the new CPU.
18105 + */
18106 +static int posix_cpu_thread_call(struct notifier_block *nfb,
18107 +                                unsigned long action, void *hcpu)
18108 +{
18109 +       int cpu = (long)hcpu;
18110 +       struct task_struct *p;
18111 +       struct sched_param param;
18112 +
18113 +       switch (action) {
18114 +       case CPU_UP_PREPARE:
18115 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
18116 +                                       "posixcputmr/%d",cpu);
18117 +               if (IS_ERR(p))
18118 +                       return NOTIFY_BAD;
18119 +               p->flags |= PF_NOFREEZE;
18120 +               kthread_bind(p, cpu);
18121 +               /* Must be high prio to avoid getting starved */
18122 +               param.sched_priority = MAX_RT_PRIO-1;
18123 +               sched_setscheduler(p, SCHED_FIFO, &param);
18124 +               per_cpu(posix_timer_task,cpu) = p;
18125 +               break;
18126 +       case CPU_ONLINE:
18127 +               /* Strictly unnecessary, as the first user will wake it. */
18128 +               wake_up_process(per_cpu(posix_timer_task,cpu));
18129 +               break;
18130 +#ifdef CONFIG_HOTPLUG_CPU
18131 +       case CPU_UP_CANCELED:
18132 +               /* Unbind it from the offline CPU so it can run. Fall through. */
18133 +               kthread_bind(per_cpu(posix_timer_task, cpu),
18134 +                            cpumask_any(cpu_online_mask));
18135 +               kthread_stop(per_cpu(posix_timer_task,cpu));
18136 +               per_cpu(posix_timer_task,cpu) = NULL;
18137 +               break;
18138 +       case CPU_DEAD:
18139 +               kthread_stop(per_cpu(posix_timer_task,cpu));
18140 +               per_cpu(posix_timer_task,cpu) = NULL;
18141 +               break;
18142 +#endif
18143 +       }
18144 +       return NOTIFY_OK;
18145 +}
18146 +
18147 +/* Register at highest priority so that task migration (migrate_all_tasks)
18148 + * happens before everything else.
18149 + */
18150 +static struct notifier_block posix_cpu_thread_notifier = {
18151 +       .notifier_call = posix_cpu_thread_call,
18152 +       .priority = 10
18153 +};
18154 +
18155 +static int __init posix_cpu_thread_init(void)
18156 +{
18157 +       void *hcpu = (void *)(long)smp_processor_id();
18158 +       /* Start one for boot CPU. */
18159 +       unsigned long cpu;
18160 +
18161 +       /* init the per-cpu posix_timer_tasklets */
18162 +       for_each_possible_cpu(cpu)
18163 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
18164 +
18165 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
18166 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
18167 +       register_cpu_notifier(&posix_cpu_thread_notifier);
18168 +       return 0;
18169 +}
18170 +early_initcall(posix_cpu_thread_init);
18171 +#else /* CONFIG_PREEMPT_RT_BASE */
18172 +void run_posix_cpu_timers(struct task_struct *tsk)
18173 +{
18174 +       __run_posix_cpu_timers(tsk);
18175 +}
18176 +#endif /* CONFIG_PREEMPT_RT_BASE */
18177 +
18178  /*
18179   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
18180   * The tsk->sighand->siglock must be held by the caller.
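
The posixcputmr/%d thread added above drains a per-CPU singly linked list
threaded through task_struct::posix_timer_list, where the last element
points to itself rather than to NULL, so "check if this is the last on the
list" is simply next == tsk. A minimal standalone sketch of that
traversal; struct node and walk_self_terminated() are invented:

struct node {
        struct node *next;      /* the final node points to itself */
};

static void walk_self_terminated(struct node *head)
{
        struct node *n = head;

        while (n) {
                struct node *next = n->next;

                n->next = NULL;         /* detach before processing */
                /* ... process n here ... */
                if (next == n)          /* self-pointer marks the tail */
                        break;
                n = next;
        }
}
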
18181 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
18182 index f2826c35e918..464a98155a0e 100644
18183 --- a/kernel/time/posix-timers.c
18184 +++ b/kernel/time/posix-timers.c
18185 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
18186  static struct pid *good_sigevent(sigevent_t * event)
18187  {
18188         struct task_struct *rtn = current->group_leader;
18189 +       int sig = event->sigev_signo;
18190  
18191         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
18192                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
18193 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
18194                 return NULL;
18195  
18196         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
18197 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
18198 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
18199 +            sig_kernel_coredump(sig)))
18200                 return NULL;
18201  
18202         return task_pid(rtn);
18203 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
18204         return overrun;
18205  }
18206  
18207 +/*
18208 + * Protected by RCU!
18209 + */
18210 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
18211 +{
18212 +#ifdef CONFIG_PREEMPT_RT_FULL
18213 +       if (kc->timer_set == common_timer_set)
18214 +               hrtimer_wait_for_timer(&timr->it.real.timer);
18215 +       else
18216 +               /* FIXME: Whacky hack for posix-cpu-timers */
18217 +               schedule_timeout(1);
18218 +#endif
18219 +}
18220 +
18221  /* Set a POSIX.1b interval timer. */
18222  /* timr->it_lock is taken. */
18223  static int
18224 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
18225         if (!timr)
18226                 return -EINVAL;
18227  
18228 +       rcu_read_lock();
18229         kc = clockid_to_kclock(timr->it_clock);
18230         if (WARN_ON_ONCE(!kc || !kc->timer_set))
18231                 error = -EINVAL;
18232 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
18233  
18234         unlock_timer(timr, flag);
18235         if (error == TIMER_RETRY) {
18236 +               timer_wait_for_callback(kc, timr);
18237                 rtn = NULL;     // We already got the old time...
18238 +               rcu_read_unlock();
18239                 goto retry;
18240         }
18241 +       rcu_read_unlock();
18242  
18243         if (old_setting && !error &&
18244             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
18245 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
18246         if (!timer)
18247                 return -EINVAL;
18248  
18249 +       rcu_read_lock();
18250         if (timer_delete_hook(timer) == TIMER_RETRY) {
18251                 unlock_timer(timer, flags);
18252 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
18253 +                                       timer);
18254 +               rcu_read_unlock();
18255                 goto retry_delete;
18256         }
18257 +       rcu_read_unlock();
18258  
18259         spin_lock(&current->sighand->siglock);
18260         list_del(&timer->list);
18261 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
18262  retry_delete:
18263         spin_lock_irqsave(&timer->it_lock, flags);
18264  
18265 -       if (timer_delete_hook(timer) == TIMER_RETRY) {
18266 +       /* On RT we can race with a deletion */
18267 +       if (!timer->it_signal) {
18268                 unlock_timer(timer, flags);
18269 +               return;
18270 +       }
18271 +
18272 +       if (timer_delete_hook(timer) == TIMER_RETRY) {
18273 +               rcu_read_lock();
18274 +               unlock_timer(timer, flags);
18275 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
18276 +                                       timer);
18277 +               rcu_read_unlock();
18278                 goto retry_delete;
18279         }
18280         list_del(&timer->list);
18281 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
18282 index 690b797f522e..fe8ba1619879 100644
18283 --- a/kernel/time/tick-broadcast-hrtimer.c
18284 +++ b/kernel/time/tick-broadcast-hrtimer.c
18285 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
18286  {
18287         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18288         bctimer.function = bc_handler;
18289 +       bctimer.irqsafe = true;
18290         clockevents_register_device(&ce_broadcast_hrtimer);
18291  }
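
Setting bctimer.irqsafe above keeps the broadcast callback in hard
interrupt context on RT; hrtimer_rt_defer() earlier in this patch skips
irqsafe timers, and the hrtimer sleeper and tick emulation timer get the
same flag. A hypothetical timer that needs the same treatment could be
set up as below; example_timer, example_fn and example_setup are not part
of the patch:

#include <linux/hrtimer.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_fn(struct hrtimer *t)
{
        /* must not sleep: may run from hard irq even on PREEMPT_RT */
        return HRTIMER_NORESTART;
}

static void example_setup(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        example_timer.function = example_fn;
        example_timer.irqsafe = 1;      /* not deferred to HRTIMER_SOFTIRQ */
}
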
18292 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
18293 index 4fcd99e12aa0..5a47f2e98faf 100644
18294 --- a/kernel/time/tick-common.c
18295 +++ b/kernel/time/tick-common.c
18296 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
18297  static void tick_periodic(int cpu)
18298  {
18299         if (tick_do_timer_cpu == cpu) {
18300 -               write_seqlock(&jiffies_lock);
18301 +               raw_spin_lock(&jiffies_lock);
18302 +               write_seqcount_begin(&jiffies_seq);
18303  
18304                 /* Keep track of the next tick event */
18305                 tick_next_period = ktime_add(tick_next_period, tick_period);
18306  
18307                 do_timer(1);
18308 -               write_sequnlock(&jiffies_lock);
18309 +               write_seqcount_end(&jiffies_seq);
18310 +               raw_spin_unlock(&jiffies_lock);
18311                 update_wall_time();
18312         }
18313  
18314 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
18315                 ktime_t next;
18316  
18317                 do {
18318 -                       seq = read_seqbegin(&jiffies_lock);
18319 +                       seq = read_seqcount_begin(&jiffies_seq);
18320                         next = tick_next_period;
18321 -               } while (read_seqretry(&jiffies_lock, seq));
18322 +               } while (read_seqcount_retry(&jiffies_seq, seq));
18323  
18324                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
18325  
18326 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
18327 index 3bcb61b52f6c..66d85482a96e 100644
18328 --- a/kernel/time/tick-sched.c
18329 +++ b/kernel/time/tick-sched.c
18330 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
18331                 return;
18332  
18333         /* Reevaluate with jiffies_lock held */
18334 -       write_seqlock(&jiffies_lock);
18335 +       raw_spin_lock(&jiffies_lock);
18336 +       write_seqcount_begin(&jiffies_seq);
18337  
18338         delta = ktime_sub(now, last_jiffies_update);
18339         if (delta.tv64 >= tick_period.tv64) {
18340 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
18341                 /* Keep the tick_next_period variable up to date */
18342                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
18343         } else {
18344 -               write_sequnlock(&jiffies_lock);
18345 +               write_seqcount_end(&jiffies_seq);
18346 +               raw_spin_unlock(&jiffies_lock);
18347                 return;
18348         }
18349 -       write_sequnlock(&jiffies_lock);
18350 +       write_seqcount_end(&jiffies_seq);
18351 +       raw_spin_unlock(&jiffies_lock);
18352         update_wall_time();
18353  }
18354  
18355 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
18356  {
18357         ktime_t period;
18358  
18359 -       write_seqlock(&jiffies_lock);
18360 +       raw_spin_lock(&jiffies_lock);
18361 +       write_seqcount_begin(&jiffies_seq);
18362         /* Did we start the jiffies update yet ? */
18363         if (last_jiffies_update.tv64 == 0)
18364                 last_jiffies_update = tick_next_period;
18365         period = last_jiffies_update;
18366 -       write_sequnlock(&jiffies_lock);
18367 +       write_seqcount_end(&jiffies_seq);
18368 +       raw_spin_unlock(&jiffies_lock);
18369         return period;
18370  }
18371  
18372 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
18373  
18374  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
18375         .func = nohz_full_kick_func,
18376 +       .flags = IRQ_WORK_HARD_IRQ,
18377  };
18378  
18379  /*
18380 @@ -673,10 +679,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
18381  
18382         /* Read jiffies and the time when jiffies were updated last */
18383         do {
18384 -               seq = read_seqbegin(&jiffies_lock);
18385 +               seq = read_seqcount_begin(&jiffies_seq);
18386                 basemono = last_jiffies_update.tv64;
18387                 basejiff = jiffies;
18388 -       } while (read_seqretry(&jiffies_lock, seq));
18389 +       } while (read_seqcount_retry(&jiffies_seq, seq));
18390         ts->last_jiffies = basejiff;
18391  
18392         if (rcu_needs_cpu(basemono, &next_rcu) ||
18393 @@ -877,14 +883,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
18394                 return false;
18395  
18396         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
18397 -               static int ratelimit;
18398 -
18399 -               if (ratelimit < 10 &&
18400 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18401 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
18402 -                               (unsigned int) local_softirq_pending());
18403 -                       ratelimit++;
18404 -               }
18405 +               softirq_check_pending_idle();
18406                 return false;
18407         }
18408  
18409 @@ -1193,6 +1192,7 @@ void tick_setup_sched_timer(void)
18410          * Emulate tick processing via per-CPU hrtimers:
18411          */
18412         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18413 +       ts->sched_timer.irqsafe = 1;
18414         ts->sched_timer.function = tick_sched_timer;
18415  
18416         /* Get the next period (per-CPU) */
18417 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
18418 index 46e312e9be38..fa75cf5d9253 100644
18419 --- a/kernel/time/timekeeping.c
18420 +++ b/kernel/time/timekeeping.c
18421 @@ -2328,8 +2328,10 @@ EXPORT_SYMBOL(hardpps);
18422   */
18423  void xtime_update(unsigned long ticks)
18424  {
18425 -       write_seqlock(&jiffies_lock);
18426 +       raw_spin_lock(&jiffies_lock);
18427 +       write_seqcount_begin(&jiffies_seq);
18428         do_timer(ticks);
18429 -       write_sequnlock(&jiffies_lock);
18430 +       write_seqcount_end(&jiffies_seq);
18431 +       raw_spin_unlock(&jiffies_lock);
18432         update_wall_time();
18433  }
18434 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
18435 index 704f595ce83f..763a3e5121ff 100644
18436 --- a/kernel/time/timekeeping.h
18437 +++ b/kernel/time/timekeeping.h
18438 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
18439  extern void do_timer(unsigned long ticks);
18440  extern void update_wall_time(void);
18441  
18442 -extern seqlock_t jiffies_lock;
18443 +extern raw_spinlock_t jiffies_lock;
18444 +extern seqcount_t jiffies_seq;
18445  
18446  #define CS_NAME_LEN    32
18447  
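
The tick-sched.c, timekeeping.c and timekeeping.h hunks above replace the former seqlock_t jiffies_lock with a raw_spinlock_t that serializes writers plus a separate seqcount_t (jiffies_seq) that readers poll, so the jiffies update path never sleeps on PREEMPT_RT while readers simply retry if they race with an update. The standalone userspace C sketch below mimics that reader/writer pattern; it is an illustration only, not part of the patch, and every name in it (state_lock, state_seq, fake_jiffies) is invented for the example. Build with: cc -pthread example.c

/*
 * Userspace analogue of the jiffies_lock/jiffies_seq split: a mutex
 * stands in for the raw spinlock that serializes writers, and a C11
 * atomic sequence counter lets readers retry instead of blocking.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ raw_spinlock_t */
static atomic_uint state_seq;                                  /* ~ seqcount_t     */
static unsigned long long fake_jiffies;                        /* protected data   */

static void writer_tick(void)
{
        pthread_mutex_lock(&state_lock);   /* raw_spin_lock(&jiffies_lock)       */
        atomic_fetch_add(&state_seq, 1);   /* write_seqcount_begin(): seq odd    */
        fake_jiffies++;                    /* update the protected value         */
        atomic_fetch_add(&state_seq, 1);   /* write_seqcount_end(): seq even     */
        pthread_mutex_unlock(&state_lock); /* raw_spin_unlock(&jiffies_lock)     */
}

static unsigned long long reader_snapshot(void)
{
        unsigned int seq;
        unsigned long long snap;

        do {                               /* read_seqcount_begin()/retry() loop */
                seq = atomic_load(&state_seq);
                snap = fake_jiffies;
        } while ((seq & 1) || seq != atomic_load(&state_seq));

        return snap;
}

int main(void)
{
        writer_tick();
        printf("snapshot: %llu\n", reader_snapshot());
        return 0;
}
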
18448 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
18449 index c611c47de884..08a5ab762495 100644
18450 --- a/kernel/time/timer.c
18451 +++ b/kernel/time/timer.c
18452 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
18453  #endif
18454  
18455  struct timer_base {
18456 -       spinlock_t              lock;
18457 +       raw_spinlock_t          lock;
18458         struct timer_list       *running_timer;
18459 +#ifdef CONFIG_PREEMPT_RT_FULL
18460 +       struct swait_queue_head wait_for_running_timer;
18461 +#endif
18462         unsigned long           clk;
18463         unsigned long           next_expiry;
18464         unsigned int            cpu;
18465 @@ -948,10 +951,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
18466  
18467                 if (!(tf & TIMER_MIGRATING)) {
18468                         base = get_timer_base(tf);
18469 -                       spin_lock_irqsave(&base->lock, *flags);
18470 +                       raw_spin_lock_irqsave(&base->lock, *flags);
18471                         if (timer->flags == tf)
18472                                 return base;
18473 -                       spin_unlock_irqrestore(&base->lock, *flags);
18474 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
18475                 }
18476                 cpu_relax();
18477         }
18478 @@ -1023,9 +1026,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
18479                         /* See the comment in lock_timer_base() */
18480                         timer->flags |= TIMER_MIGRATING;
18481  
18482 -                       spin_unlock(&base->lock);
18483 +                       raw_spin_unlock(&base->lock);
18484                         base = new_base;
18485 -                       spin_lock(&base->lock);
18486 +                       raw_spin_lock(&base->lock);
18487                         WRITE_ONCE(timer->flags,
18488                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
18489                 }
18490 @@ -1050,7 +1053,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
18491         }
18492  
18493  out_unlock:
18494 -       spin_unlock_irqrestore(&base->lock, flags);
18495 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18496  
18497         return ret;
18498  }
18499 @@ -1144,19 +1147,46 @@ void add_timer_on(struct timer_list *timer, int cpu)
18500         if (base != new_base) {
18501                 timer->flags |= TIMER_MIGRATING;
18502  
18503 -               spin_unlock(&base->lock);
18504 +               raw_spin_unlock(&base->lock);
18505                 base = new_base;
18506 -               spin_lock(&base->lock);
18507 +               raw_spin_lock(&base->lock);
18508                 WRITE_ONCE(timer->flags,
18509                            (timer->flags & ~TIMER_BASEMASK) | cpu);
18510         }
18511  
18512         debug_activate(timer, timer->expires);
18513         internal_add_timer(base, timer);
18514 -       spin_unlock_irqrestore(&base->lock, flags);
18515 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18516  }
18517  EXPORT_SYMBOL_GPL(add_timer_on);
18518  
18519 +#ifdef CONFIG_PREEMPT_RT_FULL
18520 +/*
18521 + * Wait for a running timer
18522 + */
18523 +static void wait_for_running_timer(struct timer_list *timer)
18524 +{
18525 +       struct timer_base *base;
18526 +       u32 tf = timer->flags;
18527 +
18528 +       if (tf & TIMER_MIGRATING)
18529 +               return;
18530 +
18531 +       base = get_timer_base(tf);
18532 +       swait_event(base->wait_for_running_timer,
18533 +                  base->running_timer != timer);
18534 +}
18535 +
18536 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
18537 +#else
18538 +static inline void wait_for_running_timer(struct timer_list *timer)
18539 +{
18540 +       cpu_relax();
18541 +}
18542 +
18543 +# define wakeup_timer_waiters(b)       do { } while (0)
18544 +#endif
18545 +
18546  /**
18547   * del_timer - deactivate a timer.
18548   * @timer: the timer to be deactivated
18549 @@ -1180,7 +1210,7 @@ int del_timer(struct timer_list *timer)
18550         if (timer_pending(timer)) {
18551                 base = lock_timer_base(timer, &flags);
18552                 ret = detach_if_pending(timer, base, true);
18553 -               spin_unlock_irqrestore(&base->lock, flags);
18554 +               raw_spin_unlock_irqrestore(&base->lock, flags);
18555         }
18556  
18557         return ret;
18558 @@ -1208,13 +1238,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
18559                 timer_stats_timer_clear_start_info(timer);
18560                 ret = detach_if_pending(timer, base, true);
18561         }
18562 -       spin_unlock_irqrestore(&base->lock, flags);
18563 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18564  
18565         return ret;
18566  }
18567  EXPORT_SYMBOL(try_to_del_timer_sync);
18568  
18569 -#ifdef CONFIG_SMP
18570 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
18571  /**
18572   * del_timer_sync - deactivate a timer and wait for the handler to finish.
18573   * @timer: the timer to be deactivated
18574 @@ -1274,7 +1304,7 @@ int del_timer_sync(struct timer_list *timer)
18575                 int ret = try_to_del_timer_sync(timer);
18576                 if (ret >= 0)
18577                         return ret;
18578 -               cpu_relax();
18579 +               wait_for_running_timer(timer);
18580         }
18581  }
18582  EXPORT_SYMBOL(del_timer_sync);
18583 @@ -1339,14 +1369,17 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
18584                 fn = timer->function;
18585                 data = timer->data;
18586  
18587 -               if (timer->flags & TIMER_IRQSAFE) {
18588 -                       spin_unlock(&base->lock);
18589 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
18590 +                   timer->flags & TIMER_IRQSAFE) {
18591 +                       raw_spin_unlock(&base->lock);
18592                         call_timer_fn(timer, fn, data);
18593 -                       spin_lock(&base->lock);
18594 +                       base->running_timer = NULL;
18595 +                       raw_spin_lock(&base->lock);
18596                 } else {
18597 -                       spin_unlock_irq(&base->lock);
18598 +                       raw_spin_unlock_irq(&base->lock);
18599                         call_timer_fn(timer, fn, data);
18600 -                       spin_lock_irq(&base->lock);
18601 +                       base->running_timer = NULL;
18602 +                       raw_spin_lock_irq(&base->lock);
18603                 }
18604         }
18605  }
18606 @@ -1515,7 +1548,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
18607         if (cpu_is_offline(smp_processor_id()))
18608                 return expires;
18609  
18610 -       spin_lock(&base->lock);
18611 +       raw_spin_lock(&base->lock);
18612         nextevt = __next_timer_interrupt(base);
18613         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
18614         base->next_expiry = nextevt;
18615 @@ -1543,7 +1576,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
18616                 if ((expires - basem) > TICK_NSEC)
18617                         base->is_idle = true;
18618         }
18619 -       spin_unlock(&base->lock);
18620 +       raw_spin_unlock(&base->lock);
18621  
18622         return cmp_next_hrtimer_event(basem, expires);
18623  }
18624 @@ -1608,13 +1641,13 @@ void update_process_times(int user_tick)
18625  
18626         /* Note: this timer irq context must be accounted for as well. */
18627         account_process_tick(p, user_tick);
18628 +       scheduler_tick();
18629         run_local_timers();
18630         rcu_check_callbacks(user_tick);
18631 -#ifdef CONFIG_IRQ_WORK
18632 +#if defined(CONFIG_IRQ_WORK)
18633         if (in_irq())
18634                 irq_work_tick();
18635  #endif
18636 -       scheduler_tick();
18637         run_posix_cpu_timers(p);
18638  }
18639  
18640 @@ -1630,7 +1663,7 @@ static inline void __run_timers(struct timer_base *base)
18641         if (!time_after_eq(jiffies, base->clk))
18642                 return;
18643  
18644 -       spin_lock_irq(&base->lock);
18645 +       raw_spin_lock_irq(&base->lock);
18646  
18647         while (time_after_eq(jiffies, base->clk)) {
18648  
18649 @@ -1640,8 +1673,8 @@ static inline void __run_timers(struct timer_base *base)
18650                 while (levels--)
18651                         expire_timers(base, heads + levels);
18652         }
18653 -       base->running_timer = NULL;
18654 -       spin_unlock_irq(&base->lock);
18655 +       raw_spin_unlock_irq(&base->lock);
18656 +       wakeup_timer_waiters(base);
18657  }
18658  
18659  /*
18660 @@ -1651,6 +1684,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
18661  {
18662         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
18663  
18664 +       irq_work_tick_soft();
18665 +
18666         __run_timers(base);
18667         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
18668                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
18669 @@ -1836,16 +1871,16 @@ int timers_dead_cpu(unsigned int cpu)
18670                  * The caller is globally serialized and nobody else
18671                  * takes two locks at once, deadlock is not possible.
18672                  */
18673 -               spin_lock_irq(&new_base->lock);
18674 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18675 +               raw_spin_lock_irq(&new_base->lock);
18676 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18677  
18678                 BUG_ON(old_base->running_timer);
18679  
18680                 for (i = 0; i < WHEEL_SIZE; i++)
18681                         migrate_timer_list(new_base, old_base->vectors + i);
18682  
18683 -               spin_unlock(&old_base->lock);
18684 -               spin_unlock_irq(&new_base->lock);
18685 +               raw_spin_unlock(&old_base->lock);
18686 +               raw_spin_unlock_irq(&new_base->lock);
18687                 put_cpu_ptr(&timer_bases);
18688         }
18689         return 0;
18690 @@ -1861,8 +1896,11 @@ static void __init init_timer_cpu(int cpu)
18691         for (i = 0; i < NR_BASES; i++) {
18692                 base = per_cpu_ptr(&timer_bases[i], cpu);
18693                 base->cpu = cpu;
18694 -               spin_lock_init(&base->lock);
18695 +               raw_spin_lock_init(&base->lock);
18696                 base->clk = jiffies;
18697 +#ifdef CONFIG_PREEMPT_RT_FULL
18698 +               init_swait_queue_head(&base->wait_for_running_timer);
18699 +#endif
18700         }
18701  }
18702  
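
With the timer.c changes above, each timer base gains a swait queue under CONFIG_PREEMPT_RT_FULL, del_timer_sync() sleeps in wait_for_running_timer() until the callback has finished instead of spinning with cpu_relax(), and __run_timers() wakes the waiters after the callback returns and running_timer has been cleared. The sketch below models that handshake in userspace with a pthread condition variable standing in for the swait queue; it is only an illustration of the pattern, and all names (fake_base, expiry_thread, my_timer) are invented. Build with: cc -pthread example.c

/*
 * Userspace sketch of the RT del_timer_sync() behaviour: the deleter
 * sleeps until "running_timer" no longer points at its timer, and the
 * expiry path broadcasts once the callback is done.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct fake_base {
        pthread_mutex_t lock;
        pthread_cond_t  done;            /* stands in for wait_for_running_timer */
        const void     *running_timer;
};

static struct fake_base base = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .done = PTHREAD_COND_INITIALIZER,
};

static int my_timer;                     /* the "timer" being deleted */

static void *expiry_thread(void *arg)
{
        (void)arg;

        pthread_mutex_lock(&base.lock);
        base.running_timer = &my_timer;  /* callback starts running          */
        pthread_mutex_unlock(&base.lock);

        usleep(100 * 1000);              /* pretend the callback is slow     */

        pthread_mutex_lock(&base.lock);
        base.running_timer = NULL;       /* like clearing base->running_timer */
        pthread_cond_broadcast(&base.done); /* wakeup_timer_waiters()        */
        pthread_mutex_unlock(&base.lock);
        return NULL;
}

static void wait_for_running_timer_sketch(const void *timer)
{
        pthread_mutex_lock(&base.lock);
        while (base.running_timer == timer) /* swait_event() condition       */
                pthread_cond_wait(&base.done, &base.lock);
        pthread_mutex_unlock(&base.lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, expiry_thread, NULL);
        usleep(10 * 1000);               /* let the callback start           */
        wait_for_running_timer_sketch(&my_timer);
        printf("timer callback finished, safe to free the timer\n");
        pthread_join(t, NULL);
        return 0;
}

Note that the same hunk also makes del_timer_sync() available without SMP on RT, since a preemptible kernel can always race a deleter against a running callback.
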
18703 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
18704 index 2a96b063d659..812e37237eb8 100644
18705 --- a/kernel/trace/Kconfig
18706 +++ b/kernel/trace/Kconfig
18707 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
18708           enabled. This option and the preempt-off timing option can be
18709           used together or separately.)
18710  
18711 +config INTERRUPT_OFF_HIST
18712 +       bool "Interrupts-off Latency Histogram"
18713 +       depends on IRQSOFF_TRACER
18714 +       help
18715 +         This option generates continuously updated histograms (one per cpu)
18716 +         of the duration of time periods with interrupts disabled. The
18717 +         histograms are disabled by default. To enable them, write a non-zero
18718 +         number to
18719 +
18720 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18721 +
18722 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
18723 +         per cpu) are generated that accumulate the duration of time periods
18724 +         when both interrupts and preemption are disabled. The histogram data
18725 +         will be located in the debug file system at
18726 +
18727 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
18728 +
18729  config PREEMPT_TRACER
18730         bool "Preemption-off Latency Tracer"
18731         default n
18732 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
18733           enabled. This option and the irqs-off timing option can be
18734           used together or separately.)
18735  
18736 +config PREEMPT_OFF_HIST
18737 +       bool "Preemption-off Latency Histogram"
18738 +       depends on PREEMPT_TRACER
18739 +       help
18740 +         This option generates continuously updated histograms (one per cpu)
18741 +         of the duration of time periods with preemption disabled. The
18742 +         histograms are disabled by default. To enable them, write a non-zero
18743 +         number to
18744 +
18745 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18746 +
18747 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
18748 +         per cpu) are generated that accumulate the duration of time periods
18749 +         when both interrupts and preemption are disabled. The histogram data
18750 +         will be located in the debug file system at
18751 +
18752 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
18753 +
18754  config SCHED_TRACER
18755         bool "Scheduling Latency Tracer"
18756         select GENERIC_TRACER
18757 @@ -251,6 +287,74 @@ config HWLAT_TRACER
18758          file. Every time a latency is greater than tracing_thresh, it will
18759          be recorded into the ring buffer.
18760  
18761 +config WAKEUP_LATENCY_HIST
18762 +       bool "Scheduling Latency Histogram"
18763 +       depends on SCHED_TRACER
18764 +       help
18765 +         This option generates continuously updated histograms (one per cpu)
18766 +         of the scheduling latency of the highest priority task.
18767 +         The histograms are disabled by default. To enable them, write a
18768 +         non-zero number to
18769 +
18770 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
18771 +
18772 +         Two different algorithms are used, one to determine the latency of
18773 +         processes that exclusively use the highest priority of the system and
18774 +         another one to determine the latency of processes that share the
18775 +         highest system priority with other processes. The former is used to
18776 +         improve hardware and system software, the latter to optimize the
18777 +         priority design of a given system. The histogram data will be
18778 +         located in the debug file system at
18779 +
18780 +             /sys/kernel/debug/tracing/latency_hist/wakeup
18781 +
18782 +         and
18783 +
18784 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
18785 +
18786 +         If both Scheduling Latency Histogram and Missed Timer Offsets
18787 +         Histogram are selected, additional histogram data will be collected
18788 +         that contain, in addition to the wakeup latency, the timer latency, in
18789 +         case the wakeup was triggered by an expired timer. These histograms
18790 +         are available in the
18791 +
18792 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18793 +
18794 +         directory. They reflect the apparent interrupt and scheduling latency
18795 +         and are best suited to determine the worst-case latency of a given
18796 +         system. To enable these histograms, write a non-zero number to
18797 +
18798 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18799 +
18800 +config MISSED_TIMER_OFFSETS_HIST
18801 +       depends on HIGH_RES_TIMERS
18802 +       select GENERIC_TRACER
18803 +       bool "Missed Timer Offsets Histogram"
18804 +       help
18805 +         Generate a histogram of missed timer offsets in microseconds. The
18806 +         histograms are disabled by default. To enable them, write a non-zero
18807 +         number to
18808 +
18809 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
18810 +
18811 +         The histogram data will be located in the debug file system at
18812 +
18813 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
18814 +
18815 +         If both Scheduling Latency Histogram and Missed Timer Offsets
18816 +         Histogram are selected, additional histogram data will be collected
18817 +         that contain, in addition to the wakeup latency, the timer latency, in
18818 +         case the wakeup was triggered by an expired timer. These histograms
18819 +         are available in the
18820 +
18821 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18822 +
18823 +         directory. They reflect the apparent interrupt and scheduling latency
18824 +         and are best suited to determine the worst-case latency of a given
18825 +         system. To enable these histograms, write a non-zero number to
18826 +
18827 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18828 +
18829  config ENABLE_DEFAULT_TRACERS
18830         bool "Trace process context switches and events"
18831         depends on !GENERIC_TRACER
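
All of the help texts above follow the same usage pattern: the histograms are compiled in but disabled, and writing a non-zero number to the corresponding file under /sys/kernel/debug/tracing/latency_hist/enable/ switches them on. A minimal userspace sketch of that step is shown below; the paths come from the help texts (assuming debugfs is mounted at /sys/kernel/debug), while the program itself and its error handling are just an example.

/* Enable the preempt/irqs-off histograms and read the switch back. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define ENABLE_FILE \
        "/sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff"

int main(void)
{
        char buf[16];
        ssize_t n;
        int fd;

        fd = open(ENABLE_FILE, O_WRONLY);
        if (fd < 0) {                   /* debugfs not mounted or option disabled */
                perror(ENABLE_FILE);
                return 1;
        }
        if (write(fd, "1", 1) != 1) {   /* any non-zero number enables logging */
                perror("write");
                close(fd);
                return 1;
        }
        close(fd);

        fd = open(ENABLE_FILE, O_RDONLY);
        if (fd >= 0) {
                n = read(fd, buf, sizeof(buf) - 1);
                if (n > 0) {
                        buf[n] = '\0';
                        printf("enable/preemptirqsoff is now: %s", buf);
                }
                close(fd);
        }

        /*
         * The per-CPU histogram data can then be read from the irqsoff/
         * and preemptoff/ directories named in the help texts above.
         */
        return 0;
}

From a shell, echo 1 > .../enable/wakeup does the same thing; internally the write is handled by do_enable() in latency_hist.c further down in this patch, which registers the required tracepoint probes.
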
18832 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
18833 index e57980845549..83af000b783c 100644
18834 --- a/kernel/trace/Makefile
18835 +++ b/kernel/trace/Makefile
18836 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
18837  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
18838  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
18839  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
18840 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
18841 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
18842 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
18843 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
18844  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
18845  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
18846  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
18847 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
18848 new file mode 100644
18849 index 000000000000..7f6ee70dea41
18850 --- /dev/null
18851 +++ b/kernel/trace/latency_hist.c
18852 @@ -0,0 +1,1178 @@
18853 +/*
18854 + * kernel/trace/latency_hist.c
18855 + *
18856 + * Add support for histograms of preemption-off latency,
18857 + * interrupt-off latency and wakeup latency; it depends on
18858 + * Real-Time Preemption Support.
18859 + *
18860 + *  Copyright (C) 2005 MontaVista Software, Inc.
18861 + *  Yi Yang <yyang@ch.mvista.com>
18862 + *
18863 + *  Converted to work with the new latency tracer.
18864 + *  Copyright (C) 2008 Red Hat, Inc.
18865 + *    Steven Rostedt <srostedt@redhat.com>
18866 + *
18867 + */
18868 +#include <linux/module.h>
18869 +#include <linux/debugfs.h>
18870 +#include <linux/seq_file.h>
18871 +#include <linux/percpu.h>
18872 +#include <linux/kallsyms.h>
18873 +#include <linux/uaccess.h>
18874 +#include <linux/sched.h>
18875 +#include <linux/sched/rt.h>
18876 +#include <linux/slab.h>
18877 +#include <linux/atomic.h>
18878 +#include <asm/div64.h>
18879 +
18880 +#include "trace.h"
18881 +#include <trace/events/sched.h>
18882 +
18883 +#define NSECS_PER_USECS 1000L
18884 +
18885 +#define CREATE_TRACE_POINTS
18886 +#include <trace/events/hist.h>
18887 +
18888 +enum {
18889 +       IRQSOFF_LATENCY = 0,
18890 +       PREEMPTOFF_LATENCY,
18891 +       PREEMPTIRQSOFF_LATENCY,
18892 +       WAKEUP_LATENCY,
18893 +       WAKEUP_LATENCY_SHAREDPRIO,
18894 +       MISSED_TIMER_OFFSETS,
18895 +       TIMERANDWAKEUP_LATENCY,
18896 +       MAX_LATENCY_TYPE,
18897 +};
18898 +
18899 +#define MAX_ENTRY_NUM 10240
18900 +
18901 +struct hist_data {
18902 +       atomic_t hist_mode; /* 0: don't log, 1: log */
18903 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
18904 +       long min_lat;
18905 +       long max_lat;
18906 +       unsigned long long below_hist_bound_samples;
18907 +       unsigned long long above_hist_bound_samples;
18908 +       long long accumulate_lat;
18909 +       unsigned long long total_samples;
18910 +       unsigned long long hist_array[MAX_ENTRY_NUM];
18911 +};
18912 +
18913 +struct enable_data {
18914 +       int latency_type;
18915 +       int enabled;
18916 +};
18917 +
18918 +static char *latency_hist_dir_root = "latency_hist";
18919 +
18920 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18921 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
18922 +static char *irqsoff_hist_dir = "irqsoff";
18923 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
18924 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
18925 +#endif
18926 +
18927 +#ifdef CONFIG_PREEMPT_OFF_HIST
18928 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
18929 +static char *preemptoff_hist_dir = "preemptoff";
18930 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
18931 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
18932 +#endif
18933 +
18934 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
18935 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
18936 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
18937 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
18938 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
18939 +#endif
18940 +
18941 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
18942 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
18943 +static struct enable_data preemptirqsoff_enabled_data = {
18944 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
18945 +       .enabled = 0,
18946 +};
18947 +#endif
18948 +
18949 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18950 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18951 +struct maxlatproc_data {
18952 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
18953 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
18954 +       int pid;
18955 +       int current_pid;
18956 +       int prio;
18957 +       int current_prio;
18958 +       long latency;
18959 +       long timeroffset;
18960 +       cycle_t timestamp;
18961 +};
18962 +#endif
18963 +
18964 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18965 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
18966 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
18967 +static char *wakeup_latency_hist_dir = "wakeup";
18968 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
18969 +static notrace void probe_wakeup_latency_hist_start(void *v,
18970 +       struct task_struct *p);
18971 +static notrace void probe_wakeup_latency_hist_stop(void *v,
18972 +       bool preempt, struct task_struct *prev, struct task_struct *next);
18973 +static notrace void probe_sched_migrate_task(void *,
18974 +       struct task_struct *task, int cpu);
18975 +static struct enable_data wakeup_latency_enabled_data = {
18976 +       .latency_type = WAKEUP_LATENCY,
18977 +       .enabled = 0,
18978 +};
18979 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
18980 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
18981 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
18982 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
18983 +static unsigned long wakeup_pid;
18984 +#endif
18985 +
18986 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18987 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
18988 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
18989 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
18990 +       long long offset, struct task_struct *curr, struct task_struct *task);
18991 +static struct enable_data missed_timer_offsets_enabled_data = {
18992 +       .latency_type = MISSED_TIMER_OFFSETS,
18993 +       .enabled = 0,
18994 +};
18995 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
18996 +static unsigned long missed_timer_offsets_pid;
18997 +#endif
18998 +
18999 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19000 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19001 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
19002 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
19003 +static struct enable_data timerandwakeup_enabled_data = {
19004 +       .latency_type = TIMERANDWAKEUP_LATENCY,
19005 +       .enabled = 0,
19006 +};
19007 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
19008 +#endif
19009 +
19010 +void notrace latency_hist(int latency_type, int cpu, long latency,
19011 +                         long timeroffset, cycle_t stop,
19012 +                         struct task_struct *p)
19013 +{
19014 +       struct hist_data *my_hist;
19015 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19016 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19017 +       struct maxlatproc_data *mp = NULL;
19018 +#endif
19019 +
19020 +       if (!cpu_possible(cpu) || latency_type < 0 ||
19021 +           latency_type >= MAX_LATENCY_TYPE)
19022 +               return;
19023 +
19024 +       switch (latency_type) {
19025 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19026 +       case IRQSOFF_LATENCY:
19027 +               my_hist = &per_cpu(irqsoff_hist, cpu);
19028 +               break;
19029 +#endif
19030 +#ifdef CONFIG_PREEMPT_OFF_HIST
19031 +       case PREEMPTOFF_LATENCY:
19032 +               my_hist = &per_cpu(preemptoff_hist, cpu);
19033 +               break;
19034 +#endif
19035 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
19036 +       case PREEMPTIRQSOFF_LATENCY:
19037 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
19038 +               break;
19039 +#endif
19040 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19041 +       case WAKEUP_LATENCY:
19042 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
19043 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
19044 +               break;
19045 +       case WAKEUP_LATENCY_SHAREDPRIO:
19046 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
19047 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
19048 +               break;
19049 +#endif
19050 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19051 +       case MISSED_TIMER_OFFSETS:
19052 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
19053 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
19054 +               break;
19055 +#endif
19056 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19057 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19058 +       case TIMERANDWAKEUP_LATENCY:
19059 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
19060 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
19061 +               break;
19062 +#endif
19063 +
19064 +       default:
19065 +               return;
19066 +       }
19067 +
19068 +       latency += my_hist->offset;
19069 +
19070 +       if (atomic_read(&my_hist->hist_mode) == 0)
19071 +               return;
19072 +
19073 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
19074 +               if (latency < 0)
19075 +                       my_hist->below_hist_bound_samples++;
19076 +               else
19077 +                       my_hist->above_hist_bound_samples++;
19078 +       } else
19079 +               my_hist->hist_array[latency]++;
19080 +
19081 +       if (unlikely(latency > my_hist->max_lat ||
19082 +           my_hist->min_lat == LONG_MAX)) {
19083 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19084 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19085 +               if (latency_type == WAKEUP_LATENCY ||
19086 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
19087 +                   latency_type == MISSED_TIMER_OFFSETS ||
19088 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
19089 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
19090 +                       strncpy(mp->current_comm, current->comm,
19091 +                           sizeof(mp->current_comm));
19092 +                       mp->pid = task_pid_nr(p);
19093 +                       mp->current_pid = task_pid_nr(current);
19094 +                       mp->prio = p->prio;
19095 +                       mp->current_prio = current->prio;
19096 +                       mp->latency = latency;
19097 +                       mp->timeroffset = timeroffset;
19098 +                       mp->timestamp = stop;
19099 +               }
19100 +#endif
19101 +               my_hist->max_lat = latency;
19102 +       }
19103 +       if (unlikely(latency < my_hist->min_lat))
19104 +               my_hist->min_lat = latency;
19105 +       my_hist->total_samples++;
19106 +       my_hist->accumulate_lat += latency;
19107 +}
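/*
 * Worked example of the bucketing arithmetic in latency_hist() above
 * (numbers invented for illustration): an interrupts-off section of
 * stop - start = 37400 ns is converted by the caller, e.g.
 * probe_preemptirqsoff_hist(), to 37400 / NSECS_PER_USECS = 37 us.
 * latency_hist() then adds the per-histogram offset (0 by default for
 * the statically allocated per-CPU data) and increments hist_array[37];
 * indices below 0 or at or above MAX_ENTRY_NUM (10240 us) only bump
 * below_/above_hist_bound_samples.  A histogram that must record
 * negative values can set offset = MAX_ENTRY_NUM / 2 (the "bipolar
 * scale" mentioned in struct hist_data), shifting the index so that
 * negative latencies land in the lower half of the array.
 */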
19108 +
19109 +static void *l_start(struct seq_file *m, loff_t *pos)
19110 +{
19111 +       loff_t *index_ptr = NULL;
19112 +       loff_t index = *pos;
19113 +       struct hist_data *my_hist = m->private;
19114 +
19115 +       if (index == 0) {
19116 +               char minstr[32], avgstr[32], maxstr[32];
19117 +
19118 +               atomic_dec(&my_hist->hist_mode);
19119 +
19120 +               if (likely(my_hist->total_samples)) {
19121 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
19122 +                           my_hist->total_samples);
19123 +                       snprintf(minstr, sizeof(minstr), "%ld",
19124 +                           my_hist->min_lat - my_hist->offset);
19125 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
19126 +                           avg - my_hist->offset);
19127 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
19128 +                           my_hist->max_lat - my_hist->offset);
19129 +               } else {
19130 +                       strcpy(minstr, "<undef>");
19131 +                       strcpy(avgstr, minstr);
19132 +                       strcpy(maxstr, minstr);
19133 +               }
19134 +
19135 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
19136 +                          "#Average latency: %s microseconds\n"
19137 +                          "#Maximum latency: %s microseconds\n"
19138 +                          "#Total samples: %llu\n"
19139 +                          "#There are %llu samples lower than %ld"
19140 +                          " microseconds.\n"
19141 +                          "#There are %llu samples greater than or"
19142 +                          " equal to %ld microseconds.\n"
19143 +                          "#usecs\t%16s\n",
19144 +                          minstr, avgstr, maxstr,
19145 +                          my_hist->total_samples,
19146 +                          my_hist->below_hist_bound_samples,
19147 +                          -my_hist->offset,
19148 +                          my_hist->above_hist_bound_samples,
19149 +                          MAX_ENTRY_NUM - my_hist->offset,
19150 +                          "samples");
19151 +       }
19152 +       if (index < MAX_ENTRY_NUM) {
19153 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
19154 +               if (index_ptr)
19155 +                       *index_ptr = index;
19156 +       }
19157 +
19158 +       return index_ptr;
19159 +}
19160 +
19161 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
19162 +{
19163 +       loff_t *index_ptr = p;
19164 +       struct hist_data *my_hist = m->private;
19165 +
19166 +       if (++*pos >= MAX_ENTRY_NUM) {
19167 +               atomic_inc(&my_hist->hist_mode);
19168 +               return NULL;
19169 +       }
19170 +       *index_ptr = *pos;
19171 +       return index_ptr;
19172 +}
19173 +
19174 +static void l_stop(struct seq_file *m, void *p)
19175 +{
19176 +       kfree(p);
19177 +}
19178 +
19179 +static int l_show(struct seq_file *m, void *p)
19180 +{
19181 +       int index = *(loff_t *) p;
19182 +       struct hist_data *my_hist = m->private;
19183 +
19184 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
19185 +           my_hist->hist_array[index]);
19186 +       return 0;
19187 +}
19188 +
19189 +static const struct seq_operations latency_hist_seq_op = {
19190 +       .start = l_start,
19191 +       .next  = l_next,
19192 +       .stop  = l_stop,
19193 +       .show  = l_show
19194 +};
19195 +
19196 +static int latency_hist_open(struct inode *inode, struct file *file)
19197 +{
19198 +       int ret;
19199 +
19200 +       ret = seq_open(file, &latency_hist_seq_op);
19201 +       if (!ret) {
19202 +               struct seq_file *seq = file->private_data;
19203 +               seq->private = inode->i_private;
19204 +       }
19205 +       return ret;
19206 +}
19207 +
19208 +static const struct file_operations latency_hist_fops = {
19209 +       .open = latency_hist_open,
19210 +       .read = seq_read,
19211 +       .llseek = seq_lseek,
19212 +       .release = seq_release,
19213 +};
19214 +
19215 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19216 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19217 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
19218 +{
19219 +       mp->comm[0] = mp->current_comm[0] = '\0';
19220 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
19221 +           mp->latency = mp->timeroffset = -1;
19222 +       mp->timestamp = 0;
19223 +}
19224 +#endif
19225 +
19226 +static void hist_reset(struct hist_data *hist)
19227 +{
19228 +       atomic_dec(&hist->hist_mode);
19229 +
19230 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
19231 +       hist->below_hist_bound_samples = 0ULL;
19232 +       hist->above_hist_bound_samples = 0ULL;
19233 +       hist->min_lat = LONG_MAX;
19234 +       hist->max_lat = LONG_MIN;
19235 +       hist->total_samples = 0ULL;
19236 +       hist->accumulate_lat = 0LL;
19237 +
19238 +       atomic_inc(&hist->hist_mode);
19239 +}
19240 +
19241 +static ssize_t
19242 +latency_hist_reset(struct file *file, const char __user *a,
19243 +                  size_t size, loff_t *off)
19244 +{
19245 +       int cpu;
19246 +       struct hist_data *hist = NULL;
19247 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19248 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19249 +       struct maxlatproc_data *mp = NULL;
19250 +#endif
19251 +       off_t latency_type = (off_t) file->private_data;
19252 +
19253 +       for_each_online_cpu(cpu) {
19254 +
19255 +               switch (latency_type) {
19256 +#ifdef CONFIG_PREEMPT_OFF_HIST
19257 +               case PREEMPTOFF_LATENCY:
19258 +                       hist = &per_cpu(preemptoff_hist, cpu);
19259 +                       break;
19260 +#endif
19261 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19262 +               case IRQSOFF_LATENCY:
19263 +                       hist = &per_cpu(irqsoff_hist, cpu);
19264 +                       break;
19265 +#endif
19266 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19267 +               case PREEMPTIRQSOFF_LATENCY:
19268 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
19269 +                       break;
19270 +#endif
19271 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19272 +               case WAKEUP_LATENCY:
19273 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
19274 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
19275 +                       break;
19276 +               case WAKEUP_LATENCY_SHAREDPRIO:
19277 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
19278 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
19279 +                       break;
19280 +#endif
19281 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19282 +               case MISSED_TIMER_OFFSETS:
19283 +                       hist = &per_cpu(missed_timer_offsets, cpu);
19284 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
19285 +                       break;
19286 +#endif
19287 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19288 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19289 +               case TIMERANDWAKEUP_LATENCY:
19290 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
19291 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
19292 +                       break;
19293 +#endif
19294 +               }
19295 +
19296 +               hist_reset(hist);
19297 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19298 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19299 +               if (latency_type == WAKEUP_LATENCY ||
19300 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
19301 +                   latency_type == MISSED_TIMER_OFFSETS ||
19302 +                   latency_type == TIMERANDWAKEUP_LATENCY)
19303 +                       clear_maxlatprocdata(mp);
19304 +#endif
19305 +       }
19306 +
19307 +       return size;
19308 +}
19309 +
19310 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19311 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19312 +static ssize_t
19313 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19314 +{
19315 +       char buf[64];
19316 +       int r;
19317 +       unsigned long *this_pid = file->private_data;
19318 +
19319 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
19320 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19321 +}
19322 +
19323 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
19324 +                     size_t cnt, loff_t *ppos)
19325 +{
19326 +       char buf[64];
19327 +       unsigned long pid;
19328 +       unsigned long *this_pid = file->private_data;
19329 +
19330 +       if (cnt >= sizeof(buf))
19331 +               return -EINVAL;
19332 +
19333 +       if (copy_from_user(&buf, ubuf, cnt))
19334 +               return -EFAULT;
19335 +
19336 +       buf[cnt] = '\0';
19337 +
19338 +       if (kstrtoul(buf, 10, &pid))
19339 +               return -EINVAL;
19340 +
19341 +       *this_pid = pid;
19342 +
19343 +       return cnt;
19344 +}
19345 +#endif
19346 +
19347 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19348 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19349 +static ssize_t
19350 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19351 +{
19352 +       int r;
19353 +       struct maxlatproc_data *mp = file->private_data;
19354 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
19355 +       unsigned long long t;
19356 +       unsigned long usecs, secs;
19357 +       char *buf;
19358 +
19359 +       if (mp->pid == -1 || mp->current_pid == -1) {
19360 +               buf = "(none)\n";
19361 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
19362 +                   strlen(buf));
19363 +       }
19364 +
19365 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
19366 +       if (buf == NULL)
19367 +               return -ENOMEM;
19368 +
19369 +       t = ns2usecs(mp->timestamp);
19370 +       usecs = do_div(t, USEC_PER_SEC);
19371 +       secs = (unsigned long) t;
19372 +       r = snprintf(buf, strmaxlen,
19373 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
19374 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
19375 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
19376 +           secs, usecs);
19377 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19378 +       kfree(buf);
19379 +       return r;
19380 +}
19381 +#endif
19382 +
19383 +static ssize_t
19384 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19385 +{
19386 +       char buf[64];
19387 +       struct enable_data *ed = file->private_data;
19388 +       int r;
19389 +
19390 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
19391 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19392 +}
19393 +
19394 +static ssize_t
19395 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
19396 +{
19397 +       char buf[64];
19398 +       long enable;
19399 +       struct enable_data *ed = file->private_data;
19400 +
19401 +       if (cnt >= sizeof(buf))
19402 +               return -EINVAL;
19403 +
19404 +       if (copy_from_user(&buf, ubuf, cnt))
19405 +               return -EFAULT;
19406 +
19407 +       buf[cnt] = 0;
19408 +
19409 +       if (kstrtoul(buf, 10, &enable))
19410 +               return -EINVAL;
19411 +
19412 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
19413 +               return cnt;
19414 +
19415 +       if (enable) {
19416 +               int ret;
19417 +
19418 +               switch (ed->latency_type) {
19419 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19420 +               case PREEMPTIRQSOFF_LATENCY:
19421 +                       ret = register_trace_preemptirqsoff_hist(
19422 +                           probe_preemptirqsoff_hist, NULL);
19423 +                       if (ret) {
19424 +                               pr_info("wakeup trace: Couldn't assign "
19425 +                                   "probe_preemptirqsoff_hist "
19426 +                                   "to trace_preemptirqsoff_hist\n");
19427 +                               return ret;
19428 +                       }
19429 +                       break;
19430 +#endif
19431 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19432 +               case WAKEUP_LATENCY:
19433 +                       ret = register_trace_sched_wakeup(
19434 +                           probe_wakeup_latency_hist_start, NULL);
19435 +                       if (ret) {
19436 +                               pr_info("wakeup trace: Couldn't assign "
19437 +                                   "probe_wakeup_latency_hist_start "
19438 +                                   "to trace_sched_wakeup\n");
19439 +                               return ret;
19440 +                       }
19441 +                       ret = register_trace_sched_wakeup_new(
19442 +                           probe_wakeup_latency_hist_start, NULL);
19443 +                       if (ret) {
19444 +                               pr_info("wakeup trace: Couldn't assign "
19445 +                                   "probe_wakeup_latency_hist_start "
19446 +                                   "to trace_sched_wakeup_new\n");
19447 +                               unregister_trace_sched_wakeup(
19448 +                                   probe_wakeup_latency_hist_start, NULL);
19449 +                               return ret;
19450 +                       }
19451 +                       ret = register_trace_sched_switch(
19452 +                           probe_wakeup_latency_hist_stop, NULL);
19453 +                       if (ret) {
19454 +                               pr_info("wakeup trace: Couldn't assign "
19455 +                                   "probe_wakeup_latency_hist_stop "
19456 +                                   "to trace_sched_switch\n");
19457 +                               unregister_trace_sched_wakeup(
19458 +                                   probe_wakeup_latency_hist_start, NULL);
19459 +                               unregister_trace_sched_wakeup_new(
19460 +                                   probe_wakeup_latency_hist_start, NULL);
19461 +                               return ret;
19462 +                       }
19463 +                       ret = register_trace_sched_migrate_task(
19464 +                           probe_sched_migrate_task, NULL);
19465 +                       if (ret) {
19466 +                               pr_info("wakeup trace: Couldn't assign "
19467 +                                   "probe_sched_migrate_task "
19468 +                                   "to trace_sched_migrate_task\n");
19469 +                               unregister_trace_sched_wakeup(
19470 +                                   probe_wakeup_latency_hist_start, NULL);
19471 +                               unregister_trace_sched_wakeup_new(
19472 +                                   probe_wakeup_latency_hist_start, NULL);
19473 +                               unregister_trace_sched_switch(
19474 +                                   probe_wakeup_latency_hist_stop, NULL);
19475 +                               return ret;
19476 +                       }
19477 +                       break;
19478 +#endif
19479 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19480 +               case MISSED_TIMER_OFFSETS:
19481 +                       ret = register_trace_hrtimer_interrupt(
19482 +                           probe_hrtimer_interrupt, NULL);
19483 +                       if (ret) {
19484 +                               pr_info("wakeup trace: Couldn't assign "
19485 +                                   "probe_hrtimer_interrupt "
19486 +                                   "to trace_hrtimer_interrupt\n");
19487 +                               return ret;
19488 +                       }
19489 +                       break;
19490 +#endif
19491 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19492 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19493 +               case TIMERANDWAKEUP_LATENCY:
19494 +                       if (!wakeup_latency_enabled_data.enabled ||
19495 +                           !missed_timer_offsets_enabled_data.enabled)
19496 +                               return -EINVAL;
19497 +                       break;
19498 +#endif
19499 +               default:
19500 +                       break;
19501 +               }
19502 +       } else {
19503 +               switch (ed->latency_type) {
19504 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19505 +               case PREEMPTIRQSOFF_LATENCY:
19506 +                       {
19507 +                               int cpu;
19508 +
19509 +                               unregister_trace_preemptirqsoff_hist(
19510 +                                   probe_preemptirqsoff_hist, NULL);
19511 +                               for_each_online_cpu(cpu) {
19512 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19513 +                                       per_cpu(hist_irqsoff_counting,
19514 +                                           cpu) = 0;
19515 +#endif
19516 +#ifdef CONFIG_PREEMPT_OFF_HIST
19517 +                                       per_cpu(hist_preemptoff_counting,
19518 +                                           cpu) = 0;
19519 +#endif
19520 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19521 +                                       per_cpu(hist_preemptirqsoff_counting,
19522 +                                           cpu) = 0;
19523 +#endif
19524 +                               }
19525 +                       }
19526 +                       break;
19527 +#endif
19528 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19529 +               case WAKEUP_LATENCY:
19530 +                       {
19531 +                               int cpu;
19532 +
19533 +                               unregister_trace_sched_wakeup(
19534 +                                   probe_wakeup_latency_hist_start, NULL);
19535 +                               unregister_trace_sched_wakeup_new(
19536 +                                   probe_wakeup_latency_hist_start, NULL);
19537 +                               unregister_trace_sched_switch(
19538 +                                   probe_wakeup_latency_hist_stop, NULL);
19539 +                               unregister_trace_sched_migrate_task(
19540 +                                   probe_sched_migrate_task, NULL);
19541 +
19542 +                               for_each_online_cpu(cpu) {
19543 +                                       per_cpu(wakeup_task, cpu) = NULL;
19544 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
19545 +                               }
19546 +                       }
19547 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19548 +                       timerandwakeup_enabled_data.enabled = 0;
19549 +#endif
19550 +                       break;
19551 +#endif
19552 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19553 +               case MISSED_TIMER_OFFSETS:
19554 +                       unregister_trace_hrtimer_interrupt(
19555 +                           probe_hrtimer_interrupt, NULL);
19556 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19557 +                       timerandwakeup_enabled_data.enabled = 0;
19558 +#endif
19559 +                       break;
19560 +#endif
19561 +               default:
19562 +                       break;
19563 +               }
19564 +       }
19565 +       ed->enabled = enable;
19566 +       return cnt;
19567 +}
19568 +
19569 +static const struct file_operations latency_hist_reset_fops = {
19570 +       .open = tracing_open_generic,
19571 +       .write = latency_hist_reset,
19572 +};
19573 +
19574 +static const struct file_operations enable_fops = {
19575 +       .open = tracing_open_generic,
19576 +       .read = show_enable,
19577 +       .write = do_enable,
19578 +};
19579 +
19580 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19581 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19582 +static const struct file_operations pid_fops = {
19583 +       .open = tracing_open_generic,
19584 +       .read = show_pid,
19585 +       .write = do_pid,
19586 +};
19587 +
19588 +static const struct file_operations maxlatproc_fops = {
19589 +       .open = tracing_open_generic,
19590 +       .read = show_maxlatproc,
19591 +};
19592 +#endif
19593 +
19594 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19595 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
19596 +       int starthist)
19597 +{
19598 +       int cpu = raw_smp_processor_id();
19599 +       int time_set = 0;
19600 +
19601 +       if (starthist) {
19602 +               cycle_t uninitialized_var(start);
19603 +
19604 +               if (!preempt_count() && !irqs_disabled())
19605 +                       return;
19606 +
19607 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19608 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
19609 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
19610 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
19611 +                       start = ftrace_now(cpu);
19612 +                       time_set++;
19613 +                       per_cpu(hist_irqsoff_start, cpu) = start;
19614 +               }
19615 +#endif
19616 +
19617 +#ifdef CONFIG_PREEMPT_OFF_HIST
19618 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
19619 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
19620 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
19621 +                       if (!(time_set++))
19622 +                               start = ftrace_now(cpu);
19623 +                       per_cpu(hist_preemptoff_start, cpu) = start;
19624 +               }
19625 +#endif
19626 +
19627 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19628 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
19629 +                   per_cpu(hist_preemptoff_counting, cpu) &&
19630 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
19631 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
19632 +                       if (!time_set)
19633 +                               start = ftrace_now(cpu);
19634 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
19635 +               }
19636 +#endif
19637 +       } else {
19638 +               cycle_t uninitialized_var(stop);
19639 +
19640 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19641 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
19642 +                   per_cpu(hist_irqsoff_counting, cpu)) {
19643 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
19644 +
19645 +                       stop = ftrace_now(cpu);
19646 +                       time_set++;
19647 +                       if (start) {
19648 +                               long latency = ((long) (stop - start)) /
19649 +                                   NSECS_PER_USECS;
19650 +
19651 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
19652 +                                   stop, NULL);
19653 +                       }
19654 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
19655 +               }
19656 +#endif
19657 +
19658 +#ifdef CONFIG_PREEMPT_OFF_HIST
19659 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
19660 +                   per_cpu(hist_preemptoff_counting, cpu)) {
19661 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
19662 +
19663 +                       if (!(time_set++))
19664 +                               stop = ftrace_now(cpu);
19665 +                       if (start) {
19666 +                               long latency = ((long) (stop - start)) /
19667 +                                   NSECS_PER_USECS;
19668 +
19669 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
19670 +                                   0, stop, NULL);
19671 +                       }
19672 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
19673 +               }
19674 +#endif
19675 +
19676 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19677 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
19678 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
19679 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
19680 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
19681 +
19682 +                       if (!time_set)
19683 +                               stop = ftrace_now(cpu);
19684 +                       if (start) {
19685 +                               long latency = ((long) (stop - start)) /
19686 +                                   NSECS_PER_USECS;
19687 +
19688 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
19689 +                                   latency, 0, stop, NULL);
19690 +                       }
19691 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
19692 +               }
19693 +#endif
19694 +       }
19695 +}
19696 +#endif
19697 +
19698 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19699 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
19700 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
19701 +       int cpu)
19702 +{
19703 +       int old_cpu = task_cpu(task);
19704 +
19705 +       if (cpu != old_cpu) {
19706 +               unsigned long flags;
19707 +               struct task_struct *cpu_wakeup_task;
19708 +
19709 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
19710 +
19711 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
19712 +               if (task == cpu_wakeup_task) {
19713 +                       put_task_struct(cpu_wakeup_task);
19714 +                       per_cpu(wakeup_task, old_cpu) = NULL;
19715 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
19716 +                       get_task_struct(cpu_wakeup_task);
19717 +               }
19718 +
19719 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19720 +       }
19721 +}
19722 +
19723 +static notrace void probe_wakeup_latency_hist_start(void *v,
19724 +       struct task_struct *p)
19725 +{
19726 +       unsigned long flags;
19727 +       struct task_struct *curr = current;
19728 +       int cpu = task_cpu(p);
19729 +       struct task_struct *cpu_wakeup_task;
19730 +
19731 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
19732 +
19733 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19734 +
19735 +       if (wakeup_pid) {
19736 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19737 +                   p->prio == curr->prio)
19738 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
19739 +               if (likely(wakeup_pid != task_pid_nr(p)))
19740 +                       goto out;
19741 +       } else {
19742 +               if (likely(!rt_task(p)) ||
19743 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
19744 +                   p->prio > curr->prio)
19745 +                       goto out;
19746 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19747 +                   p->prio == curr->prio)
19748 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
19749 +       }
19750 +
19751 +       if (cpu_wakeup_task)
19752 +               put_task_struct(cpu_wakeup_task);
19753 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
19754 +       get_task_struct(cpu_wakeup_task);
19755 +       cpu_wakeup_task->preempt_timestamp_hist =
19756 +               ftrace_now(raw_smp_processor_id());
19757 +out:
19758 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19759 +}
19760 +
19761 +static notrace void probe_wakeup_latency_hist_stop(void *v,
19762 +       bool preempt, struct task_struct *prev, struct task_struct *next)
19763 +{
19764 +       unsigned long flags;
19765 +       int cpu = task_cpu(next);
19766 +       long latency;
19767 +       cycle_t stop;
19768 +       struct task_struct *cpu_wakeup_task;
19769 +
19770 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
19771 +
19772 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19773 +
19774 +       if (cpu_wakeup_task == NULL)
19775 +               goto out;
19776 +
19777 +       /* Already running? */
19778 +       if (unlikely(current == cpu_wakeup_task))
19779 +               goto out_reset;
19780 +
19781 +       if (next != cpu_wakeup_task) {
19782 +               if (next->prio < cpu_wakeup_task->prio)
19783 +                       goto out_reset;
19784 +
19785 +               if (next->prio == cpu_wakeup_task->prio)
19786 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
19787 +
19788 +               goto out;
19789 +       }
19790 +
19791 +       if (current->prio == cpu_wakeup_task->prio)
19792 +               per_cpu(wakeup_sharedprio, cpu) = 1;
19793 +
19794 +       /*
19795 +        * The task we are waiting for is about to be switched to.
19796 +        * Calculate latency and store it in histogram.
19797 +        */
19798 +       stop = ftrace_now(raw_smp_processor_id());
19799 +
19800 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
19801 +           NSECS_PER_USECS;
19802 +
19803 +       if (per_cpu(wakeup_sharedprio, cpu)) {
19804 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
19805 +                   next);
19806 +               per_cpu(wakeup_sharedprio, cpu) = 0;
19807 +       } else {
19808 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
19809 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19810 +               if (timerandwakeup_enabled_data.enabled) {
19811 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
19812 +                           next->timer_offset + latency, next->timer_offset,
19813 +                           stop, next);
19814 +               }
19815 +#endif
19816 +       }
19817 +
19818 +out_reset:
19819 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19820 +       next->timer_offset = 0;
19821 +#endif
19822 +       put_task_struct(cpu_wakeup_task);
19823 +       per_cpu(wakeup_task, cpu) = NULL;
19824 +out:
19825 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19826 +}
19827 +#endif
19828 +
19829 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19830 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
19831 +       long long latency_ns, struct task_struct *curr,
19832 +       struct task_struct *task)
19833 +{
19834 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
19835 +           (task->prio < curr->prio ||
19836 +           (task->prio == curr->prio &&
19837 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
19838 +               long latency;
19839 +               cycle_t now;
19840 +
19841 +               if (missed_timer_offsets_pid) {
19842 +                       if (likely(missed_timer_offsets_pid !=
19843 +                           task_pid_nr(task)))
19844 +                               return;
19845 +               }
19846 +
19847 +               now = ftrace_now(cpu);
19848 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
19849 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
19850 +                   task);
19851 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19852 +               task->timer_offset = latency;
19853 +#endif
19854 +       }
19855 +}
19856 +#endif
19857 +
19858 +static __init int latency_hist_init(void)
19859 +{
19860 +       struct dentry *latency_hist_root = NULL;
19861 +       struct dentry *dentry;
19862 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19863 +       struct dentry *dentry_sharedprio;
19864 +#endif
19865 +       struct dentry *entry;
19866 +       struct dentry *enable_root;
19867 +       int i = 0;
19868 +       struct hist_data *my_hist;
19869 +       char name[64];
19870 +       char *cpufmt = "CPU%d";
19871 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19872 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19873 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
19874 +       struct maxlatproc_data *mp = NULL;
19875 +#endif
19876 +
19877 +       dentry = tracing_init_dentry();
19878 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
19879 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
19880 +
19881 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19882 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
19883 +       for_each_possible_cpu(i) {
19884 +               sprintf(name, cpufmt, i);
19885 +               entry = debugfs_create_file(name, 0444, dentry,
19886 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
19887 +               my_hist = &per_cpu(irqsoff_hist, i);
19888 +               atomic_set(&my_hist->hist_mode, 1);
19889 +               my_hist->min_lat = LONG_MAX;
19890 +       }
19891 +       entry = debugfs_create_file("reset", 0644, dentry,
19892 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
19893 +#endif
19894 +
19895 +#ifdef CONFIG_PREEMPT_OFF_HIST
19896 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
19897 +           latency_hist_root);
19898 +       for_each_possible_cpu(i) {
19899 +               sprintf(name, cpufmt, i);
19900 +               entry = debugfs_create_file(name, 0444, dentry,
19901 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
19902 +               my_hist = &per_cpu(preemptoff_hist, i);
19903 +               atomic_set(&my_hist->hist_mode, 1);
19904 +               my_hist->min_lat = LONG_MAX;
19905 +       }
19906 +       entry = debugfs_create_file("reset", 0644, dentry,
19907 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
19908 +#endif
19909 +
19910 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19911 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
19912 +           latency_hist_root);
19913 +       for_each_possible_cpu(i) {
19914 +               sprintf(name, cpufmt, i);
19915 +               entry = debugfs_create_file(name, 0444, dentry,
19916 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
19917 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
19918 +               atomic_set(&my_hist->hist_mode, 1);
19919 +               my_hist->min_lat = LONG_MAX;
19920 +       }
19921 +       entry = debugfs_create_file("reset", 0644, dentry,
19922 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
19923 +#endif
19924 +
19925 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19926 +       entry = debugfs_create_file("preemptirqsoff", 0644,
19927 +           enable_root, (void *)&preemptirqsoff_enabled_data,
19928 +           &enable_fops);
19929 +#endif
19930 +
19931 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19932 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
19933 +           latency_hist_root);
19934 +       dentry_sharedprio = debugfs_create_dir(
19935 +           wakeup_latency_hist_dir_sharedprio, dentry);
19936 +       for_each_possible_cpu(i) {
19937 +               sprintf(name, cpufmt, i);
19938 +
19939 +               entry = debugfs_create_file(name, 0444, dentry,
19940 +                   &per_cpu(wakeup_latency_hist, i),
19941 +                   &latency_hist_fops);
19942 +               my_hist = &per_cpu(wakeup_latency_hist, i);
19943 +               atomic_set(&my_hist->hist_mode, 1);
19944 +               my_hist->min_lat = LONG_MAX;
19945 +
19946 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
19947 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
19948 +                   &latency_hist_fops);
19949 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
19950 +               atomic_set(&my_hist->hist_mode, 1);
19951 +               my_hist->min_lat = LONG_MAX;
19952 +
19953 +               sprintf(name, cpufmt_maxlatproc, i);
19954 +
19955 +               mp = &per_cpu(wakeup_maxlatproc, i);
19956 +               entry = debugfs_create_file(name, 0444, dentry, mp,
19957 +                   &maxlatproc_fops);
19958 +               clear_maxlatprocdata(mp);
19959 +
19960 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
19961 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
19962 +                   &maxlatproc_fops);
19963 +               clear_maxlatprocdata(mp);
19964 +       }
19965 +       entry = debugfs_create_file("pid", 0644, dentry,
19966 +           (void *)&wakeup_pid, &pid_fops);
19967 +       entry = debugfs_create_file("reset", 0644, dentry,
19968 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
19969 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
19970 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
19971 +       entry = debugfs_create_file("wakeup", 0644,
19972 +           enable_root, (void *)&wakeup_latency_enabled_data,
19973 +           &enable_fops);
19974 +#endif
19975 +
19976 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19977 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
19978 +           latency_hist_root);
19979 +       for_each_possible_cpu(i) {
19980 +               sprintf(name, cpufmt, i);
19981 +               entry = debugfs_create_file(name, 0444, dentry,
19982 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
19983 +               my_hist = &per_cpu(missed_timer_offsets, i);
19984 +               atomic_set(&my_hist->hist_mode, 1);
19985 +               my_hist->min_lat = LONG_MAX;
19986 +
19987 +               sprintf(name, cpufmt_maxlatproc, i);
19988 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
19989 +               entry = debugfs_create_file(name, 0444, dentry, mp,
19990 +                   &maxlatproc_fops);
19991 +               clear_maxlatprocdata(mp);
19992 +       }
19993 +       entry = debugfs_create_file("pid", 0644, dentry,
19994 +           (void *)&missed_timer_offsets_pid, &pid_fops);
19995 +       entry = debugfs_create_file("reset", 0644, dentry,
19996 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
19997 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
19998 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
19999 +           &enable_fops);
20000 +#endif
20001 +
20002 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20003 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20004 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
20005 +           latency_hist_root);
20006 +       for_each_possible_cpu(i) {
20007 +               sprintf(name, cpufmt, i);
20008 +               entry = debugfs_create_file(name, 0444, dentry,
20009 +                   &per_cpu(timerandwakeup_latency_hist, i),
20010 +                   &latency_hist_fops);
20011 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
20012 +               atomic_set(&my_hist->hist_mode, 1);
20013 +               my_hist->min_lat = LONG_MAX;
20014 +
20015 +               sprintf(name, cpufmt_maxlatproc, i);
20016 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
20017 +               entry = debugfs_create_file(name, 0444, dentry, mp,
20018 +                   &maxlatproc_fops);
20019 +               clear_maxlatprocdata(mp);
20020 +       }
20021 +       entry = debugfs_create_file("reset", 0644, dentry,
20022 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
20023 +       entry = debugfs_create_file("timerandwakeup", 0644,
20024 +           enable_root, (void *)&timerandwakeup_enabled_data,
20025 +           &enable_fops);
20026 +#endif
20027 +       return 0;
20028 +}
20029 +
20030 +device_initcall(latency_hist_init);
20031 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
20032 index 8696ce6bf2f6..277f048a4695 100644
20033 --- a/kernel/trace/trace.c
20034 +++ b/kernel/trace/trace.c
20035 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
20036         struct task_struct *tsk = current;
20037  
20038         entry->preempt_count            = pc & 0xff;
20039 +       entry->preempt_lazy_count       = preempt_lazy_count();
20040         entry->pid                      = (tsk) ? tsk->pid : 0;
20041         entry->flags =
20042  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
20043 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
20044                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
20045                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
20046                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
20047 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
20048 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
20049 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
20050                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
20051 +
20052 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
20053  }
20054  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
20055  
20056 @@ -2892,14 +2896,17 @@ get_total_entries(struct trace_buffer *buf,
20057  
20058  static void print_lat_help_header(struct seq_file *m)
20059  {
20060 -       seq_puts(m, "#                  _------=> CPU#            \n"
20061 -                   "#                 / _-----=> irqs-off        \n"
20062 -                   "#                | / _----=> need-resched    \n"
20063 -                   "#                || / _---=> hardirq/softirq \n"
20064 -                   "#                ||| / _--=> preempt-depth   \n"
20065 -                   "#                |||| /     delay            \n"
20066 -                   "#  cmd     pid   ||||| time  |   caller      \n"
20067 -                   "#     \\   /      |||||  \\    |   /         \n");
20068 +       seq_puts(m, "#                  _--------=> CPU#              \n"
20069 +                   "#                 / _-------=> irqs-off          \n"
20070 +                   "#                | / _------=> need-resched      \n"
20071 +                   "#                || / _-----=> need-resched_lazy \n"
20072 +                   "#                ||| / _----=> hardirq/softirq   \n"
20073 +                   "#                |||| / _---=> preempt-depth     \n"
20074 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
20075 +                   "#                |||||| / _-=> migrate-disable   \n"
20076 +                   "#                ||||||| /     delay             \n"
20077 +                   "# cmd     pid    |||||||| time   |  caller       \n"
20078 +                   "#     \\   /      ||||||||   \\    |  /            \n");
20079  }
20080  
20081  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
20082 @@ -2925,11 +2932,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
20083         print_event_info(buf, m);
20084         seq_puts(m, "#                              _-----=> irqs-off\n"
20085                     "#                             / _----=> need-resched\n"
20086 -                   "#                            | / _---=> hardirq/softirq\n"
20087 -                   "#                            || / _--=> preempt-depth\n"
20088 -                   "#                            ||| /     delay\n"
20089 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
20090 -                   "#              | |       |   ||||       |         |\n");
20091 +                   "#                            |/  _-----=> need-resched_lazy\n"
20092 +                   "#                            || / _---=> hardirq/softirq\n"
20093 +                   "#                            ||| / _--=> preempt-depth\n"
20094 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
20095 +                   "#                            ||||| / _-=> migrate-disable   \n"
20096 +                   "#                            |||||| /    delay\n"
20097 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
20098 +                   "#              | |       |   |||||||      |         |\n");
20099  }
20100  
20101  void
20102 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
20103 index fd24b1f9ac43..852b2c81be25 100644
20104 --- a/kernel/trace/trace.h
20105 +++ b/kernel/trace/trace.h
20106 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
20107   *  NEED_RESCHED       - reschedule is requested
20108   *  HARDIRQ            - inside an interrupt handler
20109   *  SOFTIRQ            - inside a softirq handler
20110 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
20111   */
20112  enum trace_flag_type {
20113         TRACE_FLAG_IRQS_OFF             = 0x01,
20114 @@ -133,6 +134,7 @@ enum trace_flag_type {
20115         TRACE_FLAG_SOFTIRQ              = 0x10,
20116         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
20117         TRACE_FLAG_NMI                  = 0x40,
20118 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
20119  };
20120  
20121  #define TRACE_BUF_SIZE         1024
20122 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
20123 index 03c0a48c3ac4..0b85d516b491 100644
20124 --- a/kernel/trace/trace_events.c
20125 +++ b/kernel/trace/trace_events.c
20126 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
20127         __common_field(unsigned char, flags);
20128         __common_field(unsigned char, preempt_count);
20129         __common_field(int, pid);
20130 +       __common_field(unsigned short, migrate_disable);
20131 +       __common_field(unsigned short, padding);
20132  
20133         return ret;
20134  }
20135 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
20136 index 03cdff84d026..940bd10b4406 100644
20137 --- a/kernel/trace/trace_irqsoff.c
20138 +++ b/kernel/trace/trace_irqsoff.c
20139 @@ -13,6 +13,7 @@
20140  #include <linux/uaccess.h>
20141  #include <linux/module.h>
20142  #include <linux/ftrace.h>
20143 +#include <trace/events/hist.h>
20144  
20145  #include "trace.h"
20146  
20147 @@ -424,11 +425,13 @@ void start_critical_timings(void)
20148  {
20149         if (preempt_trace() || irq_trace())
20150                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20151 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
20152  }
20153  EXPORT_SYMBOL_GPL(start_critical_timings);
20154  
20155  void stop_critical_timings(void)
20156  {
20157 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
20158         if (preempt_trace() || irq_trace())
20159                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20160  }
20161 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
20162  #ifdef CONFIG_PROVE_LOCKING
20163  void time_hardirqs_on(unsigned long a0, unsigned long a1)
20164  {
20165 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
20166         if (!preempt_trace() && irq_trace())
20167                 stop_critical_timing(a0, a1);
20168  }
20169 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
20170  {
20171         if (!preempt_trace() && irq_trace())
20172                 start_critical_timing(a0, a1);
20173 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
20174  }
20175  
20176  #else /* !CONFIG_PROVE_LOCKING */
20177 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
20178   */
20179  void trace_hardirqs_on(void)
20180  {
20181 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
20182         if (!preempt_trace() && irq_trace())
20183                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20184  }
20185 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
20186  {
20187         if (!preempt_trace() && irq_trace())
20188                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20189 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
20190  }
20191  EXPORT_SYMBOL(trace_hardirqs_off);
20192  
20193  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
20194  {
20195 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
20196         if (!preempt_trace() && irq_trace())
20197                 stop_critical_timing(CALLER_ADDR0, caller_addr);
20198  }
20199 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
20200  {
20201         if (!preempt_trace() && irq_trace())
20202                 start_critical_timing(CALLER_ADDR0, caller_addr);
20203 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
20204  }
20205  EXPORT_SYMBOL(trace_hardirqs_off_caller);
20206  
20207 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
20208  #ifdef CONFIG_PREEMPT_TRACER
20209  void trace_preempt_on(unsigned long a0, unsigned long a1)
20210  {
20211 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
20212         if (preempt_trace() && !irq_trace())
20213                 stop_critical_timing(a0, a1);
20214  }
20215  
20216  void trace_preempt_off(unsigned long a0, unsigned long a1)
20217  {
20218 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
20219         if (preempt_trace() && !irq_trace())
20220                 start_critical_timing(a0, a1);
20221  }
20222 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
20223 index 3fc20422c166..65a6dde71a7d 100644
20224 --- a/kernel/trace/trace_output.c
20225 +++ b/kernel/trace/trace_output.c
20226 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20227  {
20228         char hardsoft_irq;
20229         char need_resched;
20230 +       char need_resched_lazy;
20231         char irqs_off;
20232         int hardirq;
20233         int softirq;
20234 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20235                 break;
20236         }
20237  
20238 +       need_resched_lazy =
20239 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
20240 +
20241         hardsoft_irq =
20242                 (nmi && hardirq)     ? 'Z' :
20243                 nmi                  ? 'z' :
20244 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20245                 softirq              ? 's' :
20246                                        '.' ;
20247  
20248 -       trace_seq_printf(s, "%c%c%c",
20249 -                        irqs_off, need_resched, hardsoft_irq);
20250 +       trace_seq_printf(s, "%c%c%c%c",
20251 +                        irqs_off, need_resched, need_resched_lazy,
20252 +                        hardsoft_irq);
20253  
20254         if (entry->preempt_count)
20255                 trace_seq_printf(s, "%x", entry->preempt_count);
20256         else
20257                 trace_seq_putc(s, '.');
20258  
20259 +       if (entry->preempt_lazy_count)
20260 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
20261 +       else
20262 +               trace_seq_putc(s, '.');
20263 +
20264 +       if (entry->migrate_disable)
20265 +               trace_seq_printf(s, "%x", entry->migrate_disable);
20266 +       else
20267 +               trace_seq_putc(s, '.');
20268 +
20269         return !trace_seq_has_overflowed(s);
20270  }
20271  
20272 diff --git a/kernel/user.c b/kernel/user.c
20273 index b069ccbfb0b0..1a2e88e98b5e 100644
20274 --- a/kernel/user.c
20275 +++ b/kernel/user.c
20276 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
20277         if (!up)
20278                 return;
20279  
20280 -       local_irq_save(flags);
20281 +       local_irq_save_nort(flags);
20282         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
20283                 free_user(up, flags);
20284         else
20285 -               local_irq_restore(flags);
20286 +               local_irq_restore_nort(flags);
20287  }
20288  
20289  struct user_struct *alloc_uid(kuid_t uid)
20290 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
20291 index 6d1020c03d41..70c6a2f79f7e 100644
20292 --- a/kernel/watchdog.c
20293 +++ b/kernel/watchdog.c
20294 @@ -315,6 +315,8 @@ static int is_softlockup(unsigned long touch_ts)
20295  
20296  #ifdef CONFIG_HARDLOCKUP_DETECTOR
20297  
20298 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
20299 +
20300  static struct perf_event_attr wd_hw_attr = {
20301         .type           = PERF_TYPE_HARDWARE,
20302         .config         = PERF_COUNT_HW_CPU_CYCLES,
20303 @@ -348,6 +350,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
20304                 /* only print hardlockups once */
20305                 if (__this_cpu_read(hard_watchdog_warn) == true)
20306                         return;
20307 +               /*
20308 +                * If early-printk is enabled then make sure we do not
20309 +                * lock up in printk() and kill console logging:
20310 +                */
20311 +               printk_kill();
20312 +
20313 +               raw_spin_lock(&watchdog_output_lock);
20314  
20315                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
20316                 print_modules();
20317 @@ -365,6 +374,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
20318                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
20319                         trigger_allbutself_cpu_backtrace();
20320  
20321 +               raw_spin_unlock(&watchdog_output_lock);
20322                 if (hardlockup_panic)
20323                         nmi_panic(regs, "Hard LOCKUP");
20324  
20325 @@ -512,6 +522,7 @@ static void watchdog_enable(unsigned int cpu)
20326         /* kick off the timer for the hardlockup detector */
20327         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
20328         hrtimer->function = watchdog_timer_fn;
20329 +       hrtimer->irqsafe = 1;
20330  
20331         /* Enable the perf event */
20332         watchdog_nmi_enable(cpu);
20333 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
20334 index 479d840db286..24eba6620a45 100644
20335 --- a/kernel/workqueue.c
20336 +++ b/kernel/workqueue.c
20337 @@ -48,6 +48,8 @@
20338  #include <linux/nodemask.h>
20339  #include <linux/moduleparam.h>
20340  #include <linux/uaccess.h>
20341 +#include <linux/locallock.h>
20342 +#include <linux/delay.h>
20343  
20344  #include "workqueue_internal.h"
20345  
20346 @@ -121,11 +123,16 @@ enum {
20347   *    cpu or grabbing pool->lock is enough for read access.  If
20348   *    POOL_DISASSOCIATED is set, it's identical to L.
20349   *
20350 + *    On RT we additionally need rt_lock_idle_list() to protect the
20351 + *    idle_list manipulations against lockless read access from
20352 + *    wq_worker_sleeping(). All other places are nicely serialized via
20353 + *    pool->lock.
20354 + *
20355   * A: pool->attach_mutex protected.
20356   *
20357   * PL: wq_pool_mutex protected.
20358   *
20359 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
20360 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
20361   *
20362   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
20363   *
20364 @@ -134,7 +141,7 @@ enum {
20365   *
20366   * WQ: wq->mutex protected.
20367   *
20368 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
20369 + * WR: wq->mutex protected for writes.  RCU protected for reads.
20370   *
20371   * MD: wq_mayday_lock protected.
20372   */
20373 @@ -185,7 +192,7 @@ struct worker_pool {
20374         atomic_t                nr_running ____cacheline_aligned_in_smp;
20375  
20376         /*
20377 -        * Destruction of pool is sched-RCU protected to allow dereferences
20378 +        * Destruction of pool is RCU protected to allow dereferences
20379          * from get_work_pool().
20380          */
20381         struct rcu_head         rcu;
20382 @@ -214,7 +221,7 @@ struct pool_workqueue {
20383         /*
20384          * Release of unbound pwq is punted to system_wq.  See put_pwq()
20385          * and pwq_unbound_release_workfn() for details.  pool_workqueue
20386 -        * itself is also sched-RCU protected so that the first pwq can be
20387 +        * itself is also RCU protected so that the first pwq can be
20388          * determined without grabbing wq->mutex.
20389          */
20390         struct work_struct      unbound_release_work;
20391 @@ -348,6 +355,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
20392  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
20393  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
20394  
20395 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
20396 +
20397  static int worker_thread(void *__worker);
20398  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20399  
20400 @@ -355,20 +364,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20401  #include <trace/events/workqueue.h>
20402  
20403  #define assert_rcu_or_pool_mutex()                                     \
20404 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20405 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20406                          !lockdep_is_held(&wq_pool_mutex),              \
20407 -                        "sched RCU or wq_pool_mutex should be held")
20408 +                        "RCU or wq_pool_mutex should be held")
20409  
20410  #define assert_rcu_or_wq_mutex(wq)                                     \
20411 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20412 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20413                          !lockdep_is_held(&wq->mutex),                  \
20414 -                        "sched RCU or wq->mutex should be held")
20415 +                        "RCU or wq->mutex should be held")
20416  
20417  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
20418 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20419 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20420                          !lockdep_is_held(&wq->mutex) &&                \
20421                          !lockdep_is_held(&wq_pool_mutex),              \
20422 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
20423 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
20424  
20425  #define for_each_cpu_worker_pool(pool, cpu)                            \
20426         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
20427 @@ -380,7 +389,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20428   * @pool: iteration cursor
20429   * @pi: integer used for iteration
20430   *
20431 - * This must be called either with wq_pool_mutex held or sched RCU read
20432 + * This must be called either with wq_pool_mutex held or RCU read
20433   * locked.  If the pool needs to be used beyond the locking in effect, the
20434   * caller is responsible for guaranteeing that the pool stays online.
20435   *
20436 @@ -412,7 +421,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20437   * @pwq: iteration cursor
20438   * @wq: the target workqueue
20439   *
20440 - * This must be called either with wq->mutex held or sched RCU read locked.
20441 + * This must be called either with wq->mutex held or RCU read locked.
20442   * If the pwq needs to be used beyond the locking in effect, the caller is
20443   * responsible for guaranteeing that the pwq stays online.
20444   *
20445 @@ -424,6 +433,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20446                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
20447                 else
20448  
20449 +#ifdef CONFIG_PREEMPT_RT_BASE
20450 +static inline void rt_lock_idle_list(struct worker_pool *pool)
20451 +{
20452 +       preempt_disable();
20453 +}
20454 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
20455 +{
20456 +       preempt_enable();
20457 +}
20458 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
20459 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
20460 +#else
20461 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
20462 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
20463 +static inline void sched_lock_idle_list(struct worker_pool *pool)
20464 +{
20465 +       spin_lock_irq(&pool->lock);
20466 +}
20467 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
20468 +{
20469 +       spin_unlock_irq(&pool->lock);
20470 +}
20471 +#endif
20472 +
20473 +
20474  #ifdef CONFIG_DEBUG_OBJECTS_WORK
20475  
20476  static struct debug_obj_descr work_debug_descr;
20477 @@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
20478   * @wq: the target workqueue
20479   * @node: the node ID
20480   *
20481 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
20482 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
20483   * read locked.
20484   * If the pwq needs to be used beyond the locking in effect, the caller is
20485   * responsible for guaranteeing that the pwq stays online.
20486 @@ -692,8 +726,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
20487   * @work: the work item of interest
20488   *
20489   * Pools are created and destroyed under wq_pool_mutex, and allows read
20490 - * access under sched-RCU read lock.  As such, this function should be
20491 - * called under wq_pool_mutex or with preemption disabled.
20492 + * access under RCU read lock.  As such, this function should be
20493 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
20494   *
20495   * All fields of the returned pool are accessible as long as the above
20496   * mentioned locking is in effect.  If the returned pool needs to be used
20497 @@ -830,50 +864,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
20498   */
20499  static void wake_up_worker(struct worker_pool *pool)
20500  {
20501 -       struct worker *worker = first_idle_worker(pool);
20502 +       struct worker *worker;
20503 +
20504 +       rt_lock_idle_list(pool);
20505 +
20506 +       worker = first_idle_worker(pool);
20507  
20508         if (likely(worker))
20509                 wake_up_process(worker->task);
20510 +
20511 +       rt_unlock_idle_list(pool);
20512  }
20513  
20514  /**
20515 - * wq_worker_waking_up - a worker is waking up
20516 + * wq_worker_running - a worker is running again
20517   * @task: task waking up
20518 - * @cpu: CPU @task is waking up to
20519   *
20520 - * This function is called during try_to_wake_up() when a worker is
20521 - * being awoken.
20522 - *
20523 - * CONTEXT:
20524 - * spin_lock_irq(rq->lock)
20525 + * This function is called when a worker returns from schedule()
20526   */
20527 -void wq_worker_waking_up(struct task_struct *task, int cpu)
20528 +void wq_worker_running(struct task_struct *task)
20529  {
20530         struct worker *worker = kthread_data(task);
20531  
20532 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
20533 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
20534 +       if (!worker->sleeping)
20535 +               return;
20536 +       if (!(worker->flags & WORKER_NOT_RUNNING))
20537                 atomic_inc(&worker->pool->nr_running);
20538 -       }
20539 +       worker->sleeping = 0;
20540  }
20541  
20542  /**
20543   * wq_worker_sleeping - a worker is going to sleep
20544   * @task: task going to sleep
20545   *
20546 - * This function is called during schedule() when a busy worker is
20547 - * going to sleep.  Worker on the same cpu can be woken up by
20548 - * returning pointer to its task.
20549 - *
20550 - * CONTEXT:
20551 - * spin_lock_irq(rq->lock)
20552 - *
20553 - * Return:
20554 - * Worker task on @cpu to wake up, %NULL if none.
20555 + * This function is called from schedule() when a busy worker is
20556 + * going to sleep.
20557   */
20558 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
20559 +void wq_worker_sleeping(struct task_struct *task)
20560  {
20561 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
20562 +       struct worker *worker = kthread_data(task);
20563         struct worker_pool *pool;
20564  
20565         /*
20566 @@ -882,29 +911,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
20567          * checking NOT_RUNNING.
20568          */
20569         if (worker->flags & WORKER_NOT_RUNNING)
20570 -               return NULL;
20571 +               return;
20572  
20573         pool = worker->pool;
20574  
20575 -       /* this can only happen on the local cpu */
20576 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
20577 -               return NULL;
20578 +       if (WARN_ON_ONCE(worker->sleeping))
20579 +               return;
20580 +
20581 +       worker->sleeping = 1;
20582  
20583         /*
20584          * The counterpart of the following dec_and_test, implied mb,
20585          * worklist not empty test sequence is in insert_work().
20586          * Please read comment there.
20587 -        *
20588 -        * NOT_RUNNING is clear.  This means that we're bound to and
20589 -        * running on the local cpu w/ rq lock held and preemption
20590 -        * disabled, which in turn means that none else could be
20591 -        * manipulating idle_list, so dereferencing idle_list without pool
20592 -        * lock is safe.
20593          */
20594         if (atomic_dec_and_test(&pool->nr_running) &&
20595 -           !list_empty(&pool->worklist))
20596 -               to_wakeup = first_idle_worker(pool);
20597 -       return to_wakeup ? to_wakeup->task : NULL;
20598 +           !list_empty(&pool->worklist)) {
20599 +               sched_lock_idle_list(pool);
20600 +               wake_up_worker(pool);
20601 +               sched_unlock_idle_list(pool);
20602 +       }
20603  }
20604  
20605  /**
20606 @@ -1098,12 +1124,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
20607  {
20608         if (pwq) {
20609                 /*
20610 -                * As both pwqs and pools are sched-RCU protected, the
20611 +                * As both pwqs and pools are RCU protected, the
20612                  * following lock operations are safe.
20613                  */
20614 -               spin_lock_irq(&pwq->pool->lock);
20615 +               rcu_read_lock();
20616 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
20617                 put_pwq(pwq);
20618 -               spin_unlock_irq(&pwq->pool->lock);
20619 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
20620 +               rcu_read_unlock();
20621         }
20622  }
20623  
20624 @@ -1207,7 +1235,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
20625         struct worker_pool *pool;
20626         struct pool_workqueue *pwq;
20627  
20628 -       local_irq_save(*flags);
20629 +       local_lock_irqsave(pendingb_lock, *flags);
20630  
20631         /* try to steal the timer if it exists */
20632         if (is_dwork) {
20633 @@ -1226,6 +1254,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
20634         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
20635                 return 0;
20636  
20637 +       rcu_read_lock();
20638         /*
20639          * The queueing is in progress, or it is already queued. Try to
20640          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
20641 @@ -1264,14 +1293,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
20642                 set_work_pool_and_keep_pending(work, pool->id);
20643  
20644                 spin_unlock(&pool->lock);
20645 +               rcu_read_unlock();
20646                 return 1;
20647         }
20648         spin_unlock(&pool->lock);
20649  fail:
20650 -       local_irq_restore(*flags);
20651 +       rcu_read_unlock();
20652 +       local_unlock_irqrestore(pendingb_lock, *flags);
20653         if (work_is_canceling(work))
20654                 return -ENOENT;
20655 -       cpu_relax();
20656 +       cpu_chill();
20657         return -EAGAIN;
20658  }
20659  
20660 @@ -1373,7 +1404,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
20661          * queued or lose PENDING.  Grabbing PENDING and queueing should
20662          * happen with IRQ disabled.
20663          */
20664 -       WARN_ON_ONCE(!irqs_disabled());
20665 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
20666  
20667         debug_work_activate(work);
20668  
20669 @@ -1381,6 +1412,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
20670         if (unlikely(wq->flags & __WQ_DRAINING) &&
20671             WARN_ON_ONCE(!is_chained_work(wq)))
20672                 return;
20673 +       rcu_read_lock();
20674  retry:
20675         if (req_cpu == WORK_CPU_UNBOUND)
20676                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
20677 @@ -1437,10 +1469,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
20678         /* pwq determined, queue */
20679         trace_workqueue_queue_work(req_cpu, pwq, work);
20680  
20681 -       if (WARN_ON(!list_empty(&work->entry))) {
20682 -               spin_unlock(&pwq->pool->lock);
20683 -               return;
20684 -       }
20685 +       if (WARN_ON(!list_empty(&work->entry)))
20686 +               goto out;
20687  
20688         pwq->nr_in_flight[pwq->work_color]++;
20689         work_flags = work_color_to_flags(pwq->work_color);
20690 @@ -1458,7 +1488,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
20691  
20692         insert_work(pwq, work, worklist, work_flags);
20693  
20694 +out:
20695         spin_unlock(&pwq->pool->lock);
20696 +       rcu_read_unlock();
20697  }
20698  
20699  /**
20700 @@ -1478,14 +1510,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
20701         bool ret = false;
20702         unsigned long flags;
20703  
20704 -       local_irq_save(flags);
20705 +       local_lock_irqsave(pendingb_lock, flags);
20706  
20707         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20708                 __queue_work(cpu, wq, work);
20709                 ret = true;
20710         }
20711  
20712 -       local_irq_restore(flags);
20713 +       local_unlock_irqrestore(pendingb_lock, flags);
20714         return ret;
20715  }
20716  EXPORT_SYMBOL(queue_work_on);
20717 @@ -1552,14 +1584,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
20718         unsigned long flags;
20719  
20720         /* read the comment in __queue_work() */
20721 -       local_irq_save(flags);
20722 +       local_lock_irqsave(pendingb_lock, flags);
20723  
20724         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20725                 __queue_delayed_work(cpu, wq, dwork, delay);
20726                 ret = true;
20727         }
20728  
20729 -       local_irq_restore(flags);
20730 +       local_unlock_irqrestore(pendingb_lock, flags);
20731         return ret;
20732  }
20733  EXPORT_SYMBOL(queue_delayed_work_on);
20734 @@ -1594,7 +1626,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
20735  
20736         if (likely(ret >= 0)) {
20737                 __queue_delayed_work(cpu, wq, dwork, delay);
20738 -               local_irq_restore(flags);
20739 +               local_unlock_irqrestore(pendingb_lock, flags);
20740         }
20741  
20742         /* -ENOENT from try_to_grab_pending() becomes %true */
20743 @@ -1627,7 +1659,9 @@ static void worker_enter_idle(struct worker *worker)
20744         worker->last_active = jiffies;
20745  
20746         /* idle_list is LIFO */
20747 +       rt_lock_idle_list(pool);
20748         list_add(&worker->entry, &pool->idle_list);
20749 +       rt_unlock_idle_list(pool);
20750  
20751         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
20752                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
20753 @@ -1660,7 +1694,9 @@ static void worker_leave_idle(struct worker *worker)
20754                 return;
20755         worker_clr_flags(worker, WORKER_IDLE);
20756         pool->nr_idle--;
20757 +       rt_lock_idle_list(pool);
20758         list_del_init(&worker->entry);
20759 +       rt_unlock_idle_list(pool);
20760  }
20761  
20762  static struct worker *alloc_worker(int node)
20763 @@ -1826,7 +1862,9 @@ static void destroy_worker(struct worker *worker)
20764         pool->nr_workers--;
20765         pool->nr_idle--;
20766  
20767 +       rt_lock_idle_list(pool);
20768         list_del_init(&worker->entry);
20769 +       rt_unlock_idle_list(pool);
20770         worker->flags |= WORKER_DIE;
20771         wake_up_process(worker->task);
20772  }
20773 @@ -2785,14 +2823,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
20774  
20775         might_sleep();
20776  
20777 -       local_irq_disable();
20778 +       rcu_read_lock();
20779         pool = get_work_pool(work);
20780         if (!pool) {
20781 -               local_irq_enable();
20782 +               rcu_read_unlock();
20783                 return false;
20784         }
20785  
20786 -       spin_lock(&pool->lock);
20787 +       spin_lock_irq(&pool->lock);
20788         /* see the comment in try_to_grab_pending() with the same code */
20789         pwq = get_work_pwq(work);
20790         if (pwq) {
20791 @@ -2821,10 +2859,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
20792         else
20793                 lock_map_acquire_read(&pwq->wq->lockdep_map);
20794         lock_map_release(&pwq->wq->lockdep_map);
20795 -
20796 +       rcu_read_unlock();
20797         return true;
20798  already_gone:
20799         spin_unlock_irq(&pool->lock);
20800 +       rcu_read_unlock();
20801         return false;
20802  }
20803  
20804 @@ -2911,7 +2950,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
20805  
20806         /* tell other tasks trying to grab @work to back off */
20807         mark_work_canceling(work);
20808 -       local_irq_restore(flags);
20809 +       local_unlock_irqrestore(pendingb_lock, flags);
20810  
20811         flush_work(work);
20812         clear_work_data(work);
20813 @@ -2966,10 +3005,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
20814   */
20815  bool flush_delayed_work(struct delayed_work *dwork)
20816  {
20817 -       local_irq_disable();
20818 +       local_lock_irq(pendingb_lock);
20819         if (del_timer_sync(&dwork->timer))
20820                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
20821 -       local_irq_enable();
20822 +       local_unlock_irq(pendingb_lock);
20823         return flush_work(&dwork->work);
20824  }
20825  EXPORT_SYMBOL(flush_delayed_work);
20826 @@ -2987,7 +3026,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
20827                 return false;
20828  
20829         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
20830 -       local_irq_restore(flags);
20831 +       local_unlock_irqrestore(pendingb_lock, flags);
20832         return ret;
20833  }
20834  
20835 @@ -3245,7 +3284,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
20836   * put_unbound_pool - put a worker_pool
20837   * @pool: worker_pool to put
20838   *
20839 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
20840 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
20841   * safe manner.  get_unbound_pool() calls this function on its failure path
20842   * and this function should be able to release pools which went through,
20843   * successfully or not, init_worker_pool().
20844 @@ -3299,8 +3338,8 @@ static void put_unbound_pool(struct worker_pool *pool)
20845         del_timer_sync(&pool->idle_timer);
20846         del_timer_sync(&pool->mayday_timer);
20847  
20848 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
20849 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
20850 +       /* RCU protected to allow dereferences from get_work_pool() */
20851 +       call_rcu(&pool->rcu, rcu_free_pool);
20852  }
20853  
20854  /**
20855 @@ -3407,14 +3446,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
20856         put_unbound_pool(pool);
20857         mutex_unlock(&wq_pool_mutex);
20858  
20859 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
20860 +       call_rcu(&pwq->rcu, rcu_free_pwq);
20861  
20862         /*
20863          * If we're the last pwq going away, @wq is already dead and no one
20864          * is gonna access it anymore.  Schedule RCU free.
20865          */
20866         if (is_last)
20867 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
20868 +               call_rcu(&wq->rcu, rcu_free_wq);
20869  }
20870  
20871  /**
20872 @@ -4064,7 +4103,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
20873                  * The base ref is never dropped on per-cpu pwqs.  Directly
20874                  * schedule RCU free.
20875                  */
20876 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
20877 +               call_rcu(&wq->rcu, rcu_free_wq);
20878         } else {
20879                 /*
20880                  * We're the sole accessor of @wq at this point.  Directly
20881 @@ -4157,7 +4196,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
20882         struct pool_workqueue *pwq;
20883         bool ret;
20884  
20885 -       rcu_read_lock_sched();
20886 +       rcu_read_lock();
20887 +       preempt_disable();
20888  
20889         if (cpu == WORK_CPU_UNBOUND)
20890                 cpu = smp_processor_id();
20891 @@ -4168,7 +4208,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
20892                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
20893  
20894         ret = !list_empty(&pwq->delayed_works);
20895 -       rcu_read_unlock_sched();
20896 +       preempt_enable();
20897 +       rcu_read_unlock();
20898  
20899         return ret;
20900  }
20901 @@ -4194,15 +4235,15 @@ unsigned int work_busy(struct work_struct *work)
20902         if (work_pending(work))
20903                 ret |= WORK_BUSY_PENDING;
20904  
20905 -       local_irq_save(flags);
20906 +       rcu_read_lock();
20907         pool = get_work_pool(work);
20908         if (pool) {
20909 -               spin_lock(&pool->lock);
20910 +               spin_lock_irqsave(&pool->lock, flags);
20911                 if (find_worker_executing_work(pool, work))
20912                         ret |= WORK_BUSY_RUNNING;
20913 -               spin_unlock(&pool->lock);
20914 +               spin_unlock_irqrestore(&pool->lock, flags);
20915         }
20916 -       local_irq_restore(flags);
20917 +       rcu_read_unlock();
20918  
20919         return ret;
20920  }
20921 @@ -4391,7 +4432,7 @@ void show_workqueue_state(void)
20922         unsigned long flags;
20923         int pi;
20924  
20925 -       rcu_read_lock_sched();
20926 +       rcu_read_lock();
20927  
20928         pr_info("Showing busy workqueues and worker pools:\n");
20929  
20930 @@ -4444,7 +4485,7 @@ void show_workqueue_state(void)
20931                 spin_unlock_irqrestore(&pool->lock, flags);
20932         }
20933  
20934 -       rcu_read_unlock_sched();
20935 +       rcu_read_unlock();
20936  }
20937  
20938  /*
20939 @@ -4782,16 +4823,16 @@ bool freeze_workqueues_busy(void)
20940                  * nr_active is monotonically decreasing.  It's safe
20941                  * to peek without lock.
20942                  */
20943 -               rcu_read_lock_sched();
20944 +               rcu_read_lock();
20945                 for_each_pwq(pwq, wq) {
20946                         WARN_ON_ONCE(pwq->nr_active < 0);
20947                         if (pwq->nr_active) {
20948                                 busy = true;
20949 -                               rcu_read_unlock_sched();
20950 +                               rcu_read_unlock();
20951                                 goto out_unlock;
20952                         }
20953                 }
20954 -               rcu_read_unlock_sched();
20955 +               rcu_read_unlock();
20956         }
20957  out_unlock:
20958         mutex_unlock(&wq_pool_mutex);
20959 @@ -4981,7 +5022,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
20960         const char *delim = "";
20961         int node, written = 0;
20962  
20963 -       rcu_read_lock_sched();
20964 +       get_online_cpus();
20965 +       rcu_read_lock();
20966         for_each_node(node) {
20967                 written += scnprintf(buf + written, PAGE_SIZE - written,
20968                                      "%s%d:%d", delim, node,
20969 @@ -4989,7 +5031,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
20970                 delim = " ";
20971         }
20972         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
20973 -       rcu_read_unlock_sched();
20974 +       rcu_read_unlock();
20975 +       put_online_cpus();
20976  
20977         return written;
20978  }
20979 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
20980 index 8635417c587b..f000c4d6917e 100644
20981 --- a/kernel/workqueue_internal.h
20982 +++ b/kernel/workqueue_internal.h
20983 @@ -43,6 +43,7 @@ struct worker {
20984         unsigned long           last_active;    /* L: last active timestamp */
20985         unsigned int            flags;          /* X: flags */
20986         int                     id;             /* I: worker id */
20987 +       int                     sleeping;       /* None */
20988  
20989         /*
20990          * Opaque string set with work_set_desc().  Printed out with task
20991 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
20992   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
20993   * sched/core.c and workqueue.c.
20994   */
20995 -void wq_worker_waking_up(struct task_struct *task, int cpu);
20996 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
20997 +void wq_worker_running(struct task_struct *task);
20998 +void wq_worker_sleeping(struct task_struct *task);
20999  
21000  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
21001 diff --git a/lib/Kconfig b/lib/Kconfig
21002 index 260a80e313b9..b06becb3f477 100644
21003 --- a/lib/Kconfig
21004 +++ b/lib/Kconfig
21005 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
21006  
21007  config CPUMASK_OFFSTACK
21008         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
21009 +       depends on !PREEMPT_RT_FULL
21010         help
21011           Use dynamic allocation for cpumask_var_t, instead of putting
21012           them on the stack.  This is a bit more expensive, but avoids
21013 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
21014 index 056052dc8e91..d8494e126de8 100644
21015 --- a/lib/debugobjects.c
21016 +++ b/lib/debugobjects.c
21017 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
21018         struct debug_obj *obj;
21019         unsigned long flags;
21020  
21021 -       fill_pool();
21022 +#ifdef CONFIG_PREEMPT_RT_FULL
21023 +       if (preempt_count() == 0 && !irqs_disabled())
21024 +#endif
21025 +               fill_pool();
21026  
21027         db = get_bucket((unsigned long) addr);
21028  
21029 diff --git a/lib/idr.c b/lib/idr.c
21030 index 6098336df267..9decbe914595 100644
21031 --- a/lib/idr.c
21032 +++ b/lib/idr.c
21033 @@ -30,6 +30,7 @@
21034  #include <linux/idr.h>
21035  #include <linux/spinlock.h>
21036  #include <linux/percpu.h>
21037 +#include <linux/locallock.h>
21038  
21039  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
21040  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
21041 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
21042  static DEFINE_PER_CPU(int, idr_preload_cnt);
21043  static DEFINE_SPINLOCK(simple_ida_lock);
21044  
21045 +#ifdef CONFIG_PREEMPT_RT_FULL
21046 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
21047 +
21048 +static inline void idr_preload_lock(void)
21049 +{
21050 +       local_lock(idr_lock);
21051 +}
21052 +
21053 +static inline void idr_preload_unlock(void)
21054 +{
21055 +       local_unlock(idr_lock);
21056 +}
21057 +
21058 +void idr_preload_end(void)
21059 +{
21060 +       idr_preload_unlock();
21061 +}
21062 +EXPORT_SYMBOL(idr_preload_end);
21063 +#else
21064 +static inline void idr_preload_lock(void)
21065 +{
21066 +       preempt_disable();
21067 +}
21068 +
21069 +static inline void idr_preload_unlock(void)
21070 +{
21071 +       preempt_enable();
21072 +}
21073 +#endif
21074 +
21075 +
21076  /* the maximum ID which can be allocated given idr->layers */
21077  static int idr_max(int layers)
21078  {
21079 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
21080          * context.  See idr_preload() for details.
21081          */
21082         if (!in_interrupt()) {
21083 -               preempt_disable();
21084 +               idr_preload_lock();
21085                 new = __this_cpu_read(idr_preload_head);
21086                 if (new) {
21087                         __this_cpu_write(idr_preload_head, new->ary[0]);
21088                         __this_cpu_dec(idr_preload_cnt);
21089                         new->ary[0] = NULL;
21090                 }
21091 -               preempt_enable();
21092 +               idr_preload_unlock();
21093                 if (new)
21094                         return new;
21095         }
21096 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
21097         idr_mark_full(pa, id);
21098  }
21099  
21100 -
21101  /**
21102   * idr_preload - preload for idr_alloc()
21103   * @gfp_mask: allocation mask to use for preloading
21104 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
21105         WARN_ON_ONCE(in_interrupt());
21106         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
21107  
21108 -       preempt_disable();
21109 +       idr_preload_lock();
21110  
21111         /*
21112          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
21113 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
21114         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
21115                 struct idr_layer *new;
21116  
21117 -               preempt_enable();
21118 +               idr_preload_unlock();
21119                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
21120 -               preempt_disable();
21121 +               idr_preload_lock();
21122                 if (!new)
21123                         break;
21124  
21125 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
21126 index 1d6565e81030..b23a79761df7 100644
21127 --- a/lib/irq_poll.c
21128 +++ b/lib/irq_poll.c
21129 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
21130         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
21131         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21132         local_irq_restore(flags);
21133 +       preempt_check_resched_rt();
21134  }
21135  EXPORT_SYMBOL(irq_poll_sched);
21136  
21137 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
21138         local_irq_save(flags);
21139         __irq_poll_complete(iop);
21140         local_irq_restore(flags);
21141 +       preempt_check_resched_rt();
21142  }
21143  EXPORT_SYMBOL(irq_poll_complete);
21144  
21145 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
21146                 }
21147  
21148                 local_irq_enable();
21149 +               preempt_check_resched_rt();
21150  
21151                 /* Even though interrupts have been re-enabled, this
21152                  * access is safe because interrupts can only add new
21153 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
21154                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21155  
21156         local_irq_enable();
21157 +       preempt_check_resched_rt();
21158  }
21159  
21160  /**
21161 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
21162                          this_cpu_ptr(&blk_cpu_iopoll));
21163         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21164         local_irq_enable();
21165 +       preempt_check_resched_rt();
21166  
21167         return 0;
21168  }
21169 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
21170 index f3a217ea0388..4611b156ef79 100644
21171 --- a/lib/locking-selftest.c
21172 +++ b/lib/locking-selftest.c
21173 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
21174  #include "locking-selftest-spin-hardirq.h"
21175  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
21176  
21177 +#ifndef CONFIG_PREEMPT_RT_FULL
21178 +
21179  #include "locking-selftest-rlock-hardirq.h"
21180  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
21181  
21182 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
21183  #include "locking-selftest-wlock-softirq.h"
21184  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
21185  
21186 +#endif
21187 +
21188  #undef E1
21189  #undef E2
21190  
21191 +#ifndef CONFIG_PREEMPT_RT_FULL
21192  /*
21193   * Enabling hardirqs with a softirq-safe lock held:
21194   */
21195 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
21196  #undef E1
21197  #undef E2
21198  
21199 +#endif
21200 +
21201  /*
21202   * Enabling irqs with an irq-safe lock held:
21203   */
21204 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
21205  #include "locking-selftest-spin-hardirq.h"
21206  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
21207  
21208 +#ifndef CONFIG_PREEMPT_RT_FULL
21209 +
21210  #include "locking-selftest-rlock-hardirq.h"
21211  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
21212  
21213 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
21214  #include "locking-selftest-wlock-softirq.h"
21215  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
21216  
21217 +#endif
21218 +
21219  #undef E1
21220  #undef E2
21221  
21222 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
21223  #include "locking-selftest-spin-hardirq.h"
21224  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
21225  
21226 +#ifndef CONFIG_PREEMPT_RT_FULL
21227 +
21228  #include "locking-selftest-rlock-hardirq.h"
21229  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
21230  
21231 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
21232  #include "locking-selftest-wlock-softirq.h"
21233  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
21234  
21235 +#endif
21236 +
21237  #undef E1
21238  #undef E2
21239  #undef E3
21240 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
21241  #include "locking-selftest-spin-hardirq.h"
21242  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
21243  
21244 +#ifndef CONFIG_PREEMPT_RT_FULL
21245 +
21246  #include "locking-selftest-rlock-hardirq.h"
21247  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
21248  
21249 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
21250  #include "locking-selftest-wlock-softirq.h"
21251  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
21252  
21253 +#endif
21254 +
21255  #undef E1
21256  #undef E2
21257  #undef E3
21258  
21259 +#ifndef CONFIG_PREEMPT_RT_FULL
21260 +
21261  /*
21262   * read-lock / write-lock irq inversion.
21263   *
21264 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
21265  #undef E2
21266  #undef E3
21267  
21268 +#endif
21269 +
21270 +#ifndef CONFIG_PREEMPT_RT_FULL
21271 +
21272  /*
21273   * read-lock / write-lock recursion that is actually safe.
21274   */
21275 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
21276  #undef E2
21277  #undef E3
21278  
21279 +#endif
21280 +
21281  /*
21282   * read-lock / write-lock recursion that is unsafe.
21283   */
21284 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
21285  
21286         printk("  --------------------------------------------------------------------------\n");
21287  
21288 +#ifndef CONFIG_PREEMPT_RT_FULL
21289         /*
21290          * irq-context testcases:
21291          */
21292 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
21293  
21294         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
21295  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
21296 +#else
21297 +       /* On -rt, we only do hardirq context test for raw spinlock */
21298 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
21299 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
21300 +
21301 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
21302 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
21303 +
21304 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
21305 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
21306 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
21307 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
21308 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
21309 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
21310 +
21311 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
21312 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
21313 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
21314 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
21315 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
21316 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
21317 +#endif
21318  
21319         ww_tests();
21320  
21321 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
21322 index 6d40944960de..822a2c027e72 100644
21323 --- a/lib/percpu_ida.c
21324 +++ b/lib/percpu_ida.c
21325 @@ -26,6 +26,9 @@
21326  #include <linux/string.h>
21327  #include <linux/spinlock.h>
21328  #include <linux/percpu_ida.h>
21329 +#include <linux/locallock.h>
21330 +
21331 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
21332  
21333  struct percpu_ida_cpu {
21334         /*
21335 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21336         unsigned long flags;
21337         int tag;
21338  
21339 -       local_irq_save(flags);
21340 +       local_lock_irqsave(irq_off_lock, flags);
21341         tags = this_cpu_ptr(pool->tag_cpu);
21342  
21343         /* Fastpath */
21344         tag = alloc_local_tag(tags);
21345         if (likely(tag >= 0)) {
21346 -               local_irq_restore(flags);
21347 +               local_unlock_irqrestore(irq_off_lock, flags);
21348                 return tag;
21349         }
21350  
21351 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21352  
21353                 if (!tags->nr_free)
21354                         alloc_global_tags(pool, tags);
21355 +
21356                 if (!tags->nr_free)
21357                         steal_tags(pool, tags);
21358  
21359 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21360                 }
21361  
21362                 spin_unlock(&pool->lock);
21363 -               local_irq_restore(flags);
21364 +               local_unlock_irqrestore(irq_off_lock, flags);
21365  
21366                 if (tag >= 0 || state == TASK_RUNNING)
21367                         break;
21368 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21369  
21370                 schedule();
21371  
21372 -               local_irq_save(flags);
21373 +               local_lock_irqsave(irq_off_lock, flags);
21374                 tags = this_cpu_ptr(pool->tag_cpu);
21375         }
21376         if (state != TASK_RUNNING)
21377 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
21378  
21379         BUG_ON(tag >= pool->nr_tags);
21380  
21381 -       local_irq_save(flags);
21382 +       local_lock_irqsave(irq_off_lock, flags);
21383         tags = this_cpu_ptr(pool->tag_cpu);
21384  
21385         spin_lock(&tags->lock);
21386 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
21387                 spin_unlock(&pool->lock);
21388         }
21389  
21390 -       local_irq_restore(flags);
21391 +       local_unlock_irqrestore(irq_off_lock, flags);
21392  }
21393  EXPORT_SYMBOL_GPL(percpu_ida_free);
21394  
21395 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
21396         struct percpu_ida_cpu *remote;
21397         unsigned cpu, i, err = 0;
21398  
21399 -       local_irq_save(flags);
21400 +       local_lock_irqsave(irq_off_lock, flags);
21401         for_each_possible_cpu(cpu) {
21402                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
21403                 spin_lock(&remote->lock);
21404 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
21405         }
21406         spin_unlock(&pool->lock);
21407  out:
21408 -       local_irq_restore(flags);
21409 +       local_unlock_irqrestore(irq_off_lock, flags);
21410         return err;
21411  }
21412  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
21413 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
21414 index 8e6d552c40dd..881cc195d85f 100644
21415 --- a/lib/radix-tree.c
21416 +++ b/lib/radix-tree.c
21417 @@ -290,13 +290,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
21418                  * succeed in getting a node here (and never reach
21419                  * kmem_cache_alloc)
21420                  */
21421 -               rtp = this_cpu_ptr(&radix_tree_preloads);
21422 +               rtp = &get_cpu_var(radix_tree_preloads);
21423                 if (rtp->nr) {
21424                         ret = rtp->nodes;
21425                         rtp->nodes = ret->private_data;
21426                         ret->private_data = NULL;
21427                         rtp->nr--;
21428                 }
21429 +               put_cpu_var(radix_tree_preloads);
21430                 /*
21431                  * Update the allocation stack trace as this is more useful
21432                  * for debugging.
21433 @@ -336,6 +337,7 @@ radix_tree_node_free(struct radix_tree_node *node)
21434         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
21435  }
21436  
21437 +#ifndef CONFIG_PREEMPT_RT_FULL
21438  /*
21439   * Load up this CPU's radix_tree_node buffer with sufficient objects to
21440   * ensure that the addition of a single element in the tree cannot fail.  On
21441 @@ -455,6 +457,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
21442  
21443         return __radix_tree_preload(gfp_mask, nr_nodes);
21444  }
21445 +#endif
21446  
21447  /*
21448   * The maximum index which can be stored in a radix tree
21449 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
21450 index 004fc70fc56a..ccc46992a517 100644
21451 --- a/lib/scatterlist.c
21452 +++ b/lib/scatterlist.c
21453 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
21454                         flush_kernel_dcache_page(miter->page);
21455  
21456                 if (miter->__flags & SG_MITER_ATOMIC) {
21457 -                       WARN_ON_ONCE(preemptible());
21458 +                       WARN_ON_ONCE(!pagefault_disabled());
21459                         kunmap_atomic(miter->addr);
21460                 } else
21461                         kunmap(miter->page);
21462 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
21463         if (!sg_miter_skip(&miter, skip))
21464                 return false;
21465  
21466 -       local_irq_save(flags);
21467 +       local_irq_save_nort(flags);
21468  
21469         while (sg_miter_next(&miter) && offset < buflen) {
21470                 unsigned int len;
21471 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
21472  
21473         sg_miter_stop(&miter);
21474  
21475 -       local_irq_restore(flags);
21476 +       local_irq_restore_nort(flags);
21477         return offset;
21478  }
21479  EXPORT_SYMBOL(sg_copy_buffer);
21480 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
21481 index 1afec32de6f2..11fa431046a8 100644
21482 --- a/lib/smp_processor_id.c
21483 +++ b/lib/smp_processor_id.c
21484 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
21485         if (!printk_ratelimit())
21486                 goto out_enable;
21487  
21488 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
21489 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
21490 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
21491 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
21492 +               current->comm, current->pid);
21493  
21494         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
21495         dump_stack();
21496 diff --git a/localversion-rt b/localversion-rt
21497 new file mode 100644
21498 index 000000000000..c3054d08a112
21499 --- /dev/null
21500 +++ b/localversion-rt
21501 @@ -0,0 +1 @@
21502 +-rt2
21503 diff --git a/mm/Kconfig b/mm/Kconfig
21504 index 86e3e0e74d20..77e5862a1ed2 100644
21505 --- a/mm/Kconfig
21506 +++ b/mm/Kconfig
21507 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
21508  
21509  config TRANSPARENT_HUGEPAGE
21510         bool "Transparent Hugepage Support"
21511 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
21512 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
21513         select COMPACTION
21514         select RADIX_TREE_MULTIORDER
21515         help
21516 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
21517 index 8fde443f36d7..d7a863b0ec20 100644
21518 --- a/mm/backing-dev.c
21519 +++ b/mm/backing-dev.c
21520 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
21521  {
21522         unsigned long flags;
21523  
21524 -       local_irq_save(flags);
21525 +       local_irq_save_nort(flags);
21526         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
21527 -               local_irq_restore(flags);
21528 +               local_irq_restore_nort(flags);
21529                 return;
21530         }
21531  
21532 diff --git a/mm/compaction.c b/mm/compaction.c
21533 index 70e6bec46dc2..6678ed58b7c6 100644
21534 --- a/mm/compaction.c
21535 +++ b/mm/compaction.c
21536 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
21537                                 block_start_pfn(cc->migrate_pfn, cc->order);
21538  
21539                         if (cc->last_migrated_pfn < current_block_start) {
21540 -                               cpu = get_cpu();
21541 +                               cpu = get_cpu_light();
21542 +                               local_lock_irq(swapvec_lock);
21543                                 lru_add_drain_cpu(cpu);
21544 +                               local_unlock_irq(swapvec_lock);
21545                                 drain_local_pages(zone);
21546 -                               put_cpu();
21547 +                               put_cpu_light();
21548                                 /* No more flushing until we migrate again */
21549                                 cc->last_migrated_pfn = 0;
21550                         }
21551 diff --git a/mm/filemap.c b/mm/filemap.c
21552 index 9a50acecc473..59f749a0b738 100644
21553 --- a/mm/filemap.c
21554 +++ b/mm/filemap.c
21555 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
21556                  * node->private_list is protected by
21557                  * mapping->tree_lock.
21558                  */
21559 -               if (!list_empty(&node->private_list))
21560 -                       list_lru_del(&workingset_shadow_nodes,
21561 +               if (!list_empty(&node->private_list)) {
21562 +                       local_lock(workingset_shadow_lock);
21563 +                       list_lru_del(&__workingset_shadow_nodes,
21564                                      &node->private_list);
21565 +                       local_unlock(workingset_shadow_lock);
21566 +               }
21567         }
21568         return 0;
21569  }
21570 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
21571                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
21572                                 list_empty(&node->private_list)) {
21573                         node->private_data = mapping;
21574 -                       list_lru_add(&workingset_shadow_nodes,
21575 -                                       &node->private_list);
21576 +                       local_lock(workingset_shadow_lock);
21577 +                       list_lru_add(&__workingset_shadow_nodes,
21578 +                                    &node->private_list);
21579 +                       local_unlock(workingset_shadow_lock);
21580                 }
21581         }
21582  
21583 diff --git a/mm/highmem.c b/mm/highmem.c
21584 index 50b4ca6787f0..77518a3b35a1 100644
21585 --- a/mm/highmem.c
21586 +++ b/mm/highmem.c
21587 @@ -29,10 +29,11 @@
21588  #include <linux/kgdb.h>
21589  #include <asm/tlbflush.h>
21590  
21591 -
21592 +#ifndef CONFIG_PREEMPT_RT_FULL
21593  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
21594  DEFINE_PER_CPU(int, __kmap_atomic_idx);
21595  #endif
21596 +#endif
21597  
21598  /*
21599   * Virtual_count is not a pure "count".
21600 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
21601  unsigned long totalhigh_pages __read_mostly;
21602  EXPORT_SYMBOL(totalhigh_pages);
21603  
21604 -
21605 +#ifndef CONFIG_PREEMPT_RT_FULL
21606  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
21607 +#endif
21608  
21609  unsigned int nr_free_highpages (void)
21610  {
21611 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
21612 index 0f870ba43942..f219b4066e6d 100644
21613 --- a/mm/memcontrol.c
21614 +++ b/mm/memcontrol.c
21615 @@ -67,6 +67,7 @@
21616  #include <net/sock.h>
21617  #include <net/ip.h>
21618  #include "slab.h"
21619 +#include <linux/locallock.h>
21620  
21621  #include <asm/uaccess.h>
21622  
21623 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
21624  #define do_swap_account                0
21625  #endif
21626  
21627 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
21628 +
21629  /* Whether legacy memory+swap accounting is active */
21630  static bool do_memsw_account(void)
21631  {
21632 @@ -1694,6 +1697,7 @@ struct memcg_stock_pcp {
21633  #define FLUSHING_CACHED_CHARGE 0
21634  };
21635  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
21636 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
21637  static DEFINE_MUTEX(percpu_charge_mutex);
21638  
21639  /**
21640 @@ -1716,7 +1720,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
21641         if (nr_pages > CHARGE_BATCH)
21642                 return ret;
21643  
21644 -       local_irq_save(flags);
21645 +       local_lock_irqsave(memcg_stock_ll, flags);
21646  
21647         stock = this_cpu_ptr(&memcg_stock);
21648         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
21649 @@ -1724,7 +1728,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
21650                 ret = true;
21651         }
21652  
21653 -       local_irq_restore(flags);
21654 +       local_unlock_irqrestore(memcg_stock_ll, flags);
21655  
21656         return ret;
21657  }
21658 @@ -1751,13 +1755,13 @@ static void drain_local_stock(struct work_struct *dummy)
21659         struct memcg_stock_pcp *stock;
21660         unsigned long flags;
21661  
21662 -       local_irq_save(flags);
21663 +       local_lock_irqsave(memcg_stock_ll, flags);
21664  
21665         stock = this_cpu_ptr(&memcg_stock);
21666         drain_stock(stock);
21667         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
21668  
21669 -       local_irq_restore(flags);
21670 +       local_unlock_irqrestore(memcg_stock_ll, flags);
21671  }
21672  
21673  /*
21674 @@ -1769,7 +1773,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
21675         struct memcg_stock_pcp *stock;
21676         unsigned long flags;
21677  
21678 -       local_irq_save(flags);
21679 +       local_lock_irqsave(memcg_stock_ll, flags);
21680  
21681         stock = this_cpu_ptr(&memcg_stock);
21682         if (stock->cached != memcg) { /* reset if necessary */
21683 @@ -1778,7 +1782,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
21684         }
21685         stock->nr_pages += nr_pages;
21686  
21687 -       local_irq_restore(flags);
21688 +       local_unlock_irqrestore(memcg_stock_ll, flags);
21689  }
21690  
21691  /*
21692 @@ -1794,7 +1798,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
21693                 return;
21694         /* Notify other cpus that system-wide "drain" is running */
21695         get_online_cpus();
21696 -       curcpu = get_cpu();
21697 +       curcpu = get_cpu_light();
21698         for_each_online_cpu(cpu) {
21699                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
21700                 struct mem_cgroup *memcg;
21701 @@ -1811,7 +1815,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
21702                                 schedule_work_on(cpu, &stock->work);
21703                 }
21704         }
21705 -       put_cpu();
21706 +       put_cpu_light();
21707         put_online_cpus();
21708         mutex_unlock(&percpu_charge_mutex);
21709  }
21710 @@ -4550,12 +4554,12 @@ static int mem_cgroup_move_account(struct page *page,
21711  
21712         ret = 0;
21713  
21714 -       local_irq_disable();
21715 +       local_lock_irq(event_lock);
21716         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
21717         memcg_check_events(to, page);
21718         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
21719         memcg_check_events(from, page);
21720 -       local_irq_enable();
21721 +       local_unlock_irq(event_lock);
21722  out_unlock:
21723         unlock_page(page);
21724  out:
21725 @@ -5430,10 +5434,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
21726  
21727         commit_charge(page, memcg, lrucare);
21728  
21729 -       local_irq_disable();
21730 +       local_lock_irq(event_lock);
21731         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
21732         memcg_check_events(memcg, page);
21733 -       local_irq_enable();
21734 +       local_unlock_irq(event_lock);
21735  
21736         if (do_memsw_account() && PageSwapCache(page)) {
21737                 swp_entry_t entry = { .val = page_private(page) };
21738 @@ -5489,14 +5493,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
21739                 memcg_oom_recover(memcg);
21740         }
21741  
21742 -       local_irq_save(flags);
21743 +       local_lock_irqsave(event_lock, flags);
21744         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
21745         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
21746         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
21747         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
21748         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
21749         memcg_check_events(memcg, dummy_page);
21750 -       local_irq_restore(flags);
21751 +       local_unlock_irqrestore(event_lock, flags);
21752  
21753         if (!mem_cgroup_is_root(memcg))
21754                 css_put_many(&memcg->css, nr_pages);
21755 @@ -5651,10 +5655,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
21756  
21757         commit_charge(newpage, memcg, false);
21758  
21759 -       local_irq_save(flags);
21760 +       local_lock_irqsave(event_lock, flags);
21761         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
21762         memcg_check_events(memcg, newpage);
21763 -       local_irq_restore(flags);
21764 +       local_unlock_irqrestore(event_lock, flags);
21765  }
21766  
21767  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
21768 @@ -5834,6 +5838,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
21769  {
21770         struct mem_cgroup *memcg, *swap_memcg;
21771         unsigned short oldid;
21772 +       unsigned long flags;
21773  
21774         VM_BUG_ON_PAGE(PageLRU(page), page);
21775         VM_BUG_ON_PAGE(page_count(page), page);
21776 @@ -5874,12 +5879,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
21777          * important here to have the interrupts disabled because it is the
21778          * only synchronisation we have for udpating the per-CPU variables.
21779          */
21780 +       local_lock_irqsave(event_lock, flags);
21781 +#ifndef CONFIG_PREEMPT_RT_BASE
21782         VM_BUG_ON(!irqs_disabled());
21783 +#endif
21784         mem_cgroup_charge_statistics(memcg, page, false, -1);
21785         memcg_check_events(memcg, page);
21786  
21787         if (!mem_cgroup_is_root(memcg))
21788                 css_put(&memcg->css);
21789 +       local_unlock_irqrestore(event_lock, flags);
21790  }
21791  
21792  /*
21793 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
21794 index 6f4d27c5bb32..5cd25c745a8f 100644
21795 --- a/mm/mmu_context.c
21796 +++ b/mm/mmu_context.c
21797 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
21798         struct task_struct *tsk = current;
21799  
21800         task_lock(tsk);
21801 +       preempt_disable_rt();
21802         active_mm = tsk->active_mm;
21803         if (active_mm != mm) {
21804                 atomic_inc(&mm->mm_count);
21805 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
21806         }
21807         tsk->mm = mm;
21808         switch_mm(active_mm, mm, tsk);
21809 +       preempt_enable_rt();
21810         task_unlock(tsk);
21811  #ifdef finish_arch_post_lock_switch
21812         finish_arch_post_lock_switch();
21813 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
21814 index 34ada718ef47..21f0dc3fe2aa 100644
21815 --- a/mm/page_alloc.c
21816 +++ b/mm/page_alloc.c
21817 @@ -61,6 +61,7 @@
21818  #include <linux/page_ext.h>
21819  #include <linux/hugetlb.h>
21820  #include <linux/sched/rt.h>
21821 +#include <linux/locallock.h>
21822  #include <linux/page_owner.h>
21823  #include <linux/kthread.h>
21824  #include <linux/memcontrol.h>
21825 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
21826  EXPORT_SYMBOL(nr_online_nodes);
21827  #endif
21828  
21829 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
21830 +
21831 +#ifdef CONFIG_PREEMPT_RT_BASE
21832 +# define cpu_lock_irqsave(cpu, flags)          \
21833 +       local_lock_irqsave_on(pa_lock, flags, cpu)
21834 +# define cpu_unlock_irqrestore(cpu, flags)     \
21835 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
21836 +#else
21837 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
21838 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
21839 +#endif
21840 +
21841  int page_group_by_mobility_disabled __read_mostly;
21842  
21843  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
21844 @@ -1072,7 +1085,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
21845  #endif /* CONFIG_DEBUG_VM */
21846  
21847  /*
21848 - * Frees a number of pages from the PCP lists
21849 + * Frees a number of pages which have been collected from the PCP lists.
21850   * Assumes all pages on list are in same zone, and of same order.
21851   * count is the number of pages to free.
21852   *
21853 @@ -1083,19 +1096,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
21854   * pinned" detection logic.
21855   */
21856  static void free_pcppages_bulk(struct zone *zone, int count,
21857 -                                       struct per_cpu_pages *pcp)
21858 +                              struct list_head *list)
21859  {
21860 -       int migratetype = 0;
21861 -       int batch_free = 0;
21862         unsigned long nr_scanned;
21863         bool isolated_pageblocks;
21864 +       unsigned long flags;
21865 +
21866 +       spin_lock_irqsave(&zone->lock, flags);
21867  
21868 -       spin_lock(&zone->lock);
21869         isolated_pageblocks = has_isolate_pageblock(zone);
21870         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21871         if (nr_scanned)
21872                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21873  
21874 +       while (!list_empty(list)) {
21875 +               struct page *page;
21876 +               int mt; /* migratetype of the to-be-freed page */
21877 +
21878 +               page = list_first_entry(list, struct page, lru);
21879 +               /* must delete as __free_one_page list manipulates */
21880 +               list_del(&page->lru);
21881 +
21882 +               mt = get_pcppage_migratetype(page);
21883 +               /* MIGRATE_ISOLATE page should not go to pcplists */
21884 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21885 +               /* Pageblock could have been isolated meanwhile */
21886 +               if (unlikely(isolated_pageblocks))
21887 +                       mt = get_pageblock_migratetype(page);
21888 +
21889 +               if (bulkfree_pcp_prepare(page))
21890 +                       continue;
21891 +
21892 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21893 +               trace_mm_page_pcpu_drain(page, 0, mt);
21894 +               count--;
21895 +       }
21896 +       WARN_ON(count != 0);
21897 +       spin_unlock_irqrestore(&zone->lock, flags);
21898 +}
21899 +
21900 +/*
21901 + * Moves a number of pages from the PCP lists to a list which is
21902 + * then freed outside of the locked region.
21903 + *
21904 + * Assumes all pages on list are in same zone, and of same order.
21905 + * count is the number of pages to free.
21906 + */
21907 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
21908 +                             struct list_head *dst)
21909 +{
21910 +       int migratetype = 0;
21911 +       int batch_free = 0;
21912 +
21913         while (count) {
21914                 struct page *page;
21915                 struct list_head *list;
21916 @@ -1111,7 +1163,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
21917                         batch_free++;
21918                         if (++migratetype == MIGRATE_PCPTYPES)
21919                                 migratetype = 0;
21920 -                       list = &pcp->lists[migratetype];
21921 +                       list = &src->lists[migratetype];
21922                 } while (list_empty(list));
21923  
21924                 /* This is the only non-empty list. Free them all. */
21925 @@ -1119,27 +1171,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
21926                         batch_free = count;
21927  
21928                 do {
21929 -                       int mt; /* migratetype of the to-be-freed page */
21930 -
21931                         page = list_last_entry(list, struct page, lru);
21932 -                       /* must delete as __free_one_page list manipulates */
21933                         list_del(&page->lru);
21934  
21935 -                       mt = get_pcppage_migratetype(page);
21936 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
21937 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21938 -                       /* Pageblock could have been isolated meanwhile */
21939 -                       if (unlikely(isolated_pageblocks))
21940 -                               mt = get_pageblock_migratetype(page);
21941 -
21942 -                       if (bulkfree_pcp_prepare(page))
21943 -                               continue;
21944 -
21945 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21946 -                       trace_mm_page_pcpu_drain(page, 0, mt);
21947 +                       list_add(&page->lru, dst);
21948                 } while (--count && --batch_free && !list_empty(list));
21949         }
21950 -       spin_unlock(&zone->lock);
21951  }
21952  
21953  static void free_one_page(struct zone *zone,
21954 @@ -1148,7 +1185,9 @@ static void free_one_page(struct zone *zone,
21955                                 int migratetype)
21956  {
21957         unsigned long nr_scanned;
21958 -       spin_lock(&zone->lock);
21959 +       unsigned long flags;
21960 +
21961 +       spin_lock_irqsave(&zone->lock, flags);
21962         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21963         if (nr_scanned)
21964                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21965 @@ -1158,7 +1197,7 @@ static void free_one_page(struct zone *zone,
21966                 migratetype = get_pfnblock_migratetype(page, pfn);
21967         }
21968         __free_one_page(page, pfn, zone, order, migratetype);
21969 -       spin_unlock(&zone->lock);
21970 +       spin_unlock_irqrestore(&zone->lock, flags);
21971  }
21972  
21973  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
21974 @@ -1244,10 +1283,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
21975                 return;
21976  
21977         migratetype = get_pfnblock_migratetype(page, pfn);
21978 -       local_irq_save(flags);
21979 +       local_lock_irqsave(pa_lock, flags);
21980         __count_vm_events(PGFREE, 1 << order);
21981         free_one_page(page_zone(page), page, pfn, order, migratetype);
21982 -       local_irq_restore(flags);
21983 +       local_unlock_irqrestore(pa_lock, flags);
21984  }
21985  
21986  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
21987 @@ -2246,16 +2285,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
21988  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
21989  {
21990         unsigned long flags;
21991 +       LIST_HEAD(dst);
21992         int to_drain, batch;
21993  
21994 -       local_irq_save(flags);
21995 +       local_lock_irqsave(pa_lock, flags);
21996         batch = READ_ONCE(pcp->batch);
21997         to_drain = min(pcp->count, batch);
21998         if (to_drain > 0) {
21999 -               free_pcppages_bulk(zone, to_drain, pcp);
22000 +               isolate_pcp_pages(to_drain, pcp, &dst);
22001                 pcp->count -= to_drain;
22002         }
22003 -       local_irq_restore(flags);
22004 +       local_unlock_irqrestore(pa_lock, flags);
22005 +       free_pcppages_bulk(zone, to_drain, &dst);
22006  }
22007  #endif
22008  
22009 @@ -2271,16 +2312,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
22010         unsigned long flags;
22011         struct per_cpu_pageset *pset;
22012         struct per_cpu_pages *pcp;
22013 +       LIST_HEAD(dst);
22014 +       int count;
22015  
22016 -       local_irq_save(flags);
22017 +       cpu_lock_irqsave(cpu, flags);
22018         pset = per_cpu_ptr(zone->pageset, cpu);
22019  
22020         pcp = &pset->pcp;
22021 -       if (pcp->count) {
22022 -               free_pcppages_bulk(zone, pcp->count, pcp);
22023 +       count = pcp->count;
22024 +       if (count) {
22025 +               isolate_pcp_pages(count, pcp, &dst);
22026                 pcp->count = 0;
22027         }
22028 -       local_irq_restore(flags);
22029 +       cpu_unlock_irqrestore(cpu, flags);
22030 +       if (count)
22031 +               free_pcppages_bulk(zone, count, &dst);
22032  }
22033  
22034  /*
22035 @@ -2366,8 +2412,17 @@ void drain_all_pages(struct zone *zone)
22036                 else
22037                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
22038         }
22039 +#ifndef CONFIG_PREEMPT_RT_BASE
22040         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
22041                                                                 zone, 1);
22042 +#else
22043 +       for_each_cpu(cpu, &cpus_with_pcps) {
22044 +               if (zone)
22045 +                       drain_pages_zone(cpu, zone);
22046 +               else
22047 +                       drain_pages(cpu);
22048 +       }
22049 +#endif
22050  }
22051  
22052  #ifdef CONFIG_HIBERNATION
22053 @@ -2427,7 +2482,7 @@ void free_hot_cold_page(struct page *page, bool cold)
22054  
22055         migratetype = get_pfnblock_migratetype(page, pfn);
22056         set_pcppage_migratetype(page, migratetype);
22057 -       local_irq_save(flags);
22058 +       local_lock_irqsave(pa_lock, flags);
22059         __count_vm_event(PGFREE);
22060  
22061         /*
22062 @@ -2453,12 +2508,17 @@ void free_hot_cold_page(struct page *page, bool cold)
22063         pcp->count++;
22064         if (pcp->count >= pcp->high) {
22065                 unsigned long batch = READ_ONCE(pcp->batch);
22066 -               free_pcppages_bulk(zone, batch, pcp);
22067 +               LIST_HEAD(dst);
22068 +
22069 +               isolate_pcp_pages(batch, pcp, &dst);
22070                 pcp->count -= batch;
22071 +               local_unlock_irqrestore(pa_lock, flags);
22072 +               free_pcppages_bulk(zone, batch, &dst);
22073 +               return;
22074         }
22075  
22076  out:
22077 -       local_irq_restore(flags);
22078 +       local_unlock_irqrestore(pa_lock, flags);
22079  }
22080  
22081  /*
22082 @@ -2600,7 +2660,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22083                 struct per_cpu_pages *pcp;
22084                 struct list_head *list;
22085  
22086 -               local_irq_save(flags);
22087 +               local_lock_irqsave(pa_lock, flags);
22088                 do {
22089                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
22090                         list = &pcp->lists[migratetype];
22091 @@ -2627,7 +2687,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22092                  * allocate greater than order-1 page units with __GFP_NOFAIL.
22093                  */
22094                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
22095 -               spin_lock_irqsave(&zone->lock, flags);
22096 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
22097  
22098                 do {
22099                         page = NULL;
22100 @@ -2639,22 +2699,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22101                         if (!page)
22102                                 page = __rmqueue(zone, order, migratetype);
22103                 } while (page && check_new_pages(page, order));
22104 -               spin_unlock(&zone->lock);
22105 -               if (!page)
22106 +               if (!page) {
22107 +                       spin_unlock(&zone->lock);
22108                         goto failed;
22109 +               }
22110                 __mod_zone_freepage_state(zone, -(1 << order),
22111                                           get_pcppage_migratetype(page));
22112 +               spin_unlock(&zone->lock);
22113         }
22114  
22115         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
22116         zone_statistics(preferred_zone, zone, gfp_flags);
22117 -       local_irq_restore(flags);
22118 +       local_unlock_irqrestore(pa_lock, flags);
22119  
22120         VM_BUG_ON_PAGE(bad_range(zone, page), page);
22121         return page;
22122  
22123  failed:
22124 -       local_irq_restore(flags);
22125 +       local_unlock_irqrestore(pa_lock, flags);
22126         return NULL;
22127  }
22128  
22129 @@ -6505,7 +6567,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
22130         int cpu = (unsigned long)hcpu;
22131  
22132         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
22133 +               local_lock_irq_on(swapvec_lock, cpu);
22134                 lru_add_drain_cpu(cpu);
22135 +               local_unlock_irq_on(swapvec_lock, cpu);
22136                 drain_pages(cpu);
22137  
22138                 /*
22139 @@ -6531,6 +6595,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
22140  void __init page_alloc_init(void)
22141  {
22142         hotcpu_notifier(page_alloc_cpu_notify, 0);
22143 +       local_irq_lock_init(pa_lock);
22144  }
22145  
22146  /*
22147 @@ -7359,7 +7424,7 @@ void zone_pcp_reset(struct zone *zone)
22148         struct per_cpu_pageset *pset;
22149  
22150         /* avoid races with drain_pages()  */
22151 -       local_irq_save(flags);
22152 +       local_lock_irqsave(pa_lock, flags);
22153         if (zone->pageset != &boot_pageset) {
22154                 for_each_online_cpu(cpu) {
22155                         pset = per_cpu_ptr(zone->pageset, cpu);
22156 @@ -7368,7 +7433,7 @@ void zone_pcp_reset(struct zone *zone)
22157                 free_percpu(zone->pageset);
22158                 zone->pageset = &boot_pageset;
22159         }
22160 -       local_irq_restore(flags);
22161 +       local_unlock_irqrestore(pa_lock, flags);
22162  }
22163  
22164  #ifdef CONFIG_MEMORY_HOTREMOVE
22165 diff --git a/mm/slab.h b/mm/slab.h
22166 index bc05fdc3edce..610cf61634f0 100644
22167 --- a/mm/slab.h
22168 +++ b/mm/slab.h
22169 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
22170   * The slab lists for all objects.
22171   */
22172  struct kmem_cache_node {
22173 +#ifdef CONFIG_SLUB
22174 +       raw_spinlock_t list_lock;
22175 +#else
22176         spinlock_t list_lock;
22177 +#endif
22178  
22179  #ifdef CONFIG_SLAB
22180         struct list_head slabs_partial; /* partial list first, better asm code */
22181 diff --git a/mm/slub.c b/mm/slub.c
22182 index 2b3e740609e9..1732f9c5d31f 100644
22183 --- a/mm/slub.c
22184 +++ b/mm/slub.c
22185 @@ -1141,7 +1141,7 @@ static noinline int free_debug_processing(
22186         unsigned long uninitialized_var(flags);
22187         int ret = 0;
22188  
22189 -       spin_lock_irqsave(&n->list_lock, flags);
22190 +       raw_spin_lock_irqsave(&n->list_lock, flags);
22191         slab_lock(page);
22192  
22193         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
22194 @@ -1176,7 +1176,7 @@ static noinline int free_debug_processing(
22195                          bulk_cnt, cnt);
22196  
22197         slab_unlock(page);
22198 -       spin_unlock_irqrestore(&n->list_lock, flags);
22199 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22200         if (!ret)
22201                 slab_fix(s, "Object at 0x%p not freed", object);
22202         return ret;
22203 @@ -1304,6 +1304,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
22204  
22205  #endif /* CONFIG_SLUB_DEBUG */
22206  
22207 +struct slub_free_list {
22208 +       raw_spinlock_t          lock;
22209 +       struct list_head        list;
22210 +};
22211 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
22212 +
22213  /*
22214   * Hooks for other subsystems that check memory allocations. In a typical
22215   * production configuration these hooks all should produce no code at all.
22216 @@ -1523,10 +1529,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
22217         void *start, *p;
22218         int idx, order;
22219         bool shuffle;
22220 +       bool enableirqs = false;
22221  
22222         flags &= gfp_allowed_mask;
22223  
22224         if (gfpflags_allow_blocking(flags))
22225 +               enableirqs = true;
22226 +#ifdef CONFIG_PREEMPT_RT_FULL
22227 +       if (system_state == SYSTEM_RUNNING)
22228 +               enableirqs = true;
22229 +#endif
22230 +       if (enableirqs)
22231                 local_irq_enable();
22232  
22233         flags |= s->allocflags;
22234 @@ -1601,7 +1614,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
22235         page->frozen = 1;
22236  
22237  out:
22238 -       if (gfpflags_allow_blocking(flags))
22239 +       if (enableirqs)
22240                 local_irq_disable();
22241         if (!page)
22242                 return NULL;
22243 @@ -1660,6 +1673,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
22244         __free_pages(page, order);
22245  }
22246  
22247 +static void free_delayed(struct list_head *h)
22248 +{
22249 +       while(!list_empty(h)) {
22250 +               struct page *page = list_first_entry(h, struct page, lru);
22251 +
22252 +               list_del(&page->lru);
22253 +               __free_slab(page->slab_cache, page);
22254 +       }
22255 +}
22256 +
22257  #define need_reserve_slab_rcu                                          \
22258         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
22259  
22260 @@ -1691,6 +1714,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
22261                 }
22262  
22263                 call_rcu(head, rcu_free_slab);
22264 +       } else if (irqs_disabled()) {
22265 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
22266 +
22267 +               raw_spin_lock(&f->lock);
22268 +               list_add(&page->lru, &f->list);
22269 +               raw_spin_unlock(&f->lock);
22270         } else
22271                 __free_slab(s, page);
22272  }
22273 @@ -1798,7 +1827,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
22274         if (!n || !n->nr_partial)
22275                 return NULL;
22276  
22277 -       spin_lock(&n->list_lock);
22278 +       raw_spin_lock(&n->list_lock);
22279         list_for_each_entry_safe(page, page2, &n->partial, lru) {
22280                 void *t;
22281  
22282 @@ -1823,7 +1852,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
22283                         break;
22284  
22285         }
22286 -       spin_unlock(&n->list_lock);
22287 +       raw_spin_unlock(&n->list_lock);
22288         return object;
22289  }
22290  
22291 @@ -2069,7 +2098,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22292                          * that acquire_slab() will see a slab page that
22293                          * is frozen
22294                          */
22295 -                       spin_lock(&n->list_lock);
22296 +                       raw_spin_lock(&n->list_lock);
22297                 }
22298         } else {
22299                 m = M_FULL;
22300 @@ -2080,7 +2109,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22301                          * slabs from diagnostic functions will not see
22302                          * any frozen slabs.
22303                          */
22304 -                       spin_lock(&n->list_lock);
22305 +                       raw_spin_lock(&n->list_lock);
22306                 }
22307         }
22308  
22309 @@ -2115,7 +2144,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22310                 goto redo;
22311  
22312         if (lock)
22313 -               spin_unlock(&n->list_lock);
22314 +               raw_spin_unlock(&n->list_lock);
22315  
22316         if (m == M_FREE) {
22317                 stat(s, DEACTIVATE_EMPTY);
22318 @@ -2147,10 +2176,10 @@ static void unfreeze_partials(struct kmem_cache *s,
22319                 n2 = get_node(s, page_to_nid(page));
22320                 if (n != n2) {
22321                         if (n)
22322 -                               spin_unlock(&n->list_lock);
22323 +                               raw_spin_unlock(&n->list_lock);
22324  
22325                         n = n2;
22326 -                       spin_lock(&n->list_lock);
22327 +                       raw_spin_lock(&n->list_lock);
22328                 }
22329  
22330                 do {
22331 @@ -2179,7 +2208,7 @@ static void unfreeze_partials(struct kmem_cache *s,
22332         }
22333  
22334         if (n)
22335 -               spin_unlock(&n->list_lock);
22336 +               raw_spin_unlock(&n->list_lock);
22337  
22338         while (discard_page) {
22339                 page = discard_page;
22340 @@ -2218,14 +2247,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
22341                         pobjects = oldpage->pobjects;
22342                         pages = oldpage->pages;
22343                         if (drain && pobjects > s->cpu_partial) {
22344 +                               struct slub_free_list *f;
22345                                 unsigned long flags;
22346 +                               LIST_HEAD(tofree);
22347                                 /*
22348                                  * partial array is full. Move the existing
22349                                  * set to the per node partial list.
22350                                  */
22351                                 local_irq_save(flags);
22352                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
22353 +                               f = this_cpu_ptr(&slub_free_list);
22354 +                               raw_spin_lock(&f->lock);
22355 +                               list_splice_init(&f->list, &tofree);
22356 +                               raw_spin_unlock(&f->lock);
22357                                 local_irq_restore(flags);
22358 +                               free_delayed(&tofree);
22359                                 oldpage = NULL;
22360                                 pobjects = 0;
22361                                 pages = 0;
22362 @@ -2297,7 +2333,22 @@ static bool has_cpu_slab(int cpu, void *info)
22363  
22364  static void flush_all(struct kmem_cache *s)
22365  {
22366 +       LIST_HEAD(tofree);
22367 +       int cpu;
22368 +
22369         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
22370 +       for_each_online_cpu(cpu) {
22371 +               struct slub_free_list *f;
22372 +
22373 +               if (!has_cpu_slab(cpu, s))
22374 +                       continue;
22375 +
22376 +               f = &per_cpu(slub_free_list, cpu);
22377 +               raw_spin_lock_irq(&f->lock);
22378 +               list_splice_init(&f->list, &tofree);
22379 +               raw_spin_unlock_irq(&f->lock);
22380 +               free_delayed(&tofree);
22381 +       }
22382  }
22383  
22384  /*
22385 @@ -2352,10 +2403,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
22386         unsigned long x = 0;
22387         struct page *page;
22388  
22389 -       spin_lock_irqsave(&n->list_lock, flags);
22390 +       raw_spin_lock_irqsave(&n->list_lock, flags);
22391         list_for_each_entry(page, &n->partial, lru)
22392                 x += get_count(page);
22393 -       spin_unlock_irqrestore(&n->list_lock, flags);
22394 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22395         return x;
22396  }
22397  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
22398 @@ -2493,8 +2544,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
22399   * already disabled (which is the case for bulk allocation).
22400   */
22401  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22402 -                         unsigned long addr, struct kmem_cache_cpu *c)
22403 +                         unsigned long addr, struct kmem_cache_cpu *c,
22404 +                         struct list_head *to_free)
22405  {
22406 +       struct slub_free_list *f;
22407         void *freelist;
22408         struct page *page;
22409  
22410 @@ -2554,6 +2607,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22411         VM_BUG_ON(!c->page->frozen);
22412         c->freelist = get_freepointer(s, freelist);
22413         c->tid = next_tid(c->tid);
22414 +
22415 +out:
22416 +       f = this_cpu_ptr(&slub_free_list);
22417 +       raw_spin_lock(&f->lock);
22418 +       list_splice_init(&f->list, to_free);
22419 +       raw_spin_unlock(&f->lock);
22420 +
22421         return freelist;
22422  
22423  new_slab:
22424 @@ -2585,7 +2645,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22425         deactivate_slab(s, page, get_freepointer(s, freelist));
22426         c->page = NULL;
22427         c->freelist = NULL;
22428 -       return freelist;
22429 +       goto out;
22430  }
22431  
22432  /*
22433 @@ -2597,6 +2657,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22434  {
22435         void *p;
22436         unsigned long flags;
22437 +       LIST_HEAD(tofree);
22438  
22439         local_irq_save(flags);
22440  #ifdef CONFIG_PREEMPT
22441 @@ -2608,8 +2669,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22442         c = this_cpu_ptr(s->cpu_slab);
22443  #endif
22444  
22445 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
22446 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
22447         local_irq_restore(flags);
22448 +       free_delayed(&tofree);
22449         return p;
22450  }
22451  
22452 @@ -2795,7 +2857,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22453  
22454         do {
22455                 if (unlikely(n)) {
22456 -                       spin_unlock_irqrestore(&n->list_lock, flags);
22457 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22458                         n = NULL;
22459                 }
22460                 prior = page->freelist;
22461 @@ -2827,7 +2889,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22462                                  * Otherwise the list_lock will synchronize with
22463                                  * other processors updating the list of slabs.
22464                                  */
22465 -                               spin_lock_irqsave(&n->list_lock, flags);
22466 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
22467  
22468                         }
22469                 }
22470 @@ -2869,7 +2931,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22471                 add_partial(n, page, DEACTIVATE_TO_TAIL);
22472                 stat(s, FREE_ADD_PARTIAL);
22473         }
22474 -       spin_unlock_irqrestore(&n->list_lock, flags);
22475 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22476         return;
22477  
22478  slab_empty:
22479 @@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22480                 remove_full(s, n, page);
22481         }
22482  
22483 -       spin_unlock_irqrestore(&n->list_lock, flags);
22484 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22485         stat(s, FREE_SLAB);
22486         discard_slab(s, page);
22487  }
22488 @@ -3089,6 +3151,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22489                           void **p)
22490  {
22491         struct kmem_cache_cpu *c;
22492 +       LIST_HEAD(to_free);
22493         int i;
22494  
22495         /* memcg and kmem_cache debug support */
22496 @@ -3112,7 +3175,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22497                          * of re-populating per CPU c->freelist
22498                          */
22499                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
22500 -                                           _RET_IP_, c);
22501 +                                           _RET_IP_, c, &to_free);
22502                         if (unlikely(!p[i]))
22503                                 goto error;
22504  
22505 @@ -3124,6 +3187,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22506         }
22507         c->tid = next_tid(c->tid);
22508         local_irq_enable();
22509 +       free_delayed(&to_free);
22510  
22511         /* Clear memory outside IRQ disabled fastpath loop */
22512         if (unlikely(flags & __GFP_ZERO)) {
22513 @@ -3271,7 +3335,7 @@ static void
22514  init_kmem_cache_node(struct kmem_cache_node *n)
22515  {
22516         n->nr_partial = 0;
22517 -       spin_lock_init(&n->list_lock);
22518 +       raw_spin_lock_init(&n->list_lock);
22519         INIT_LIST_HEAD(&n->partial);
22520  #ifdef CONFIG_SLUB_DEBUG
22521         atomic_long_set(&n->nr_slabs, 0);
22522 @@ -3615,6 +3679,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
22523                                                         const char *text)
22524  {
22525  #ifdef CONFIG_SLUB_DEBUG
22526 +#ifdef CONFIG_PREEMPT_RT_BASE
22527 +       /* XXX move out of irq-off section */
22528 +       slab_err(s, page, text, s->name);
22529 +#else
22530         void *addr = page_address(page);
22531         void *p;
22532         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
22533 @@ -3635,6 +3703,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
22534         slab_unlock(page);
22535         kfree(map);
22536  #endif
22537 +#endif
22538  }
22539  
22540  /*
22541 @@ -3648,7 +3717,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
22542         struct page *page, *h;
22543  
22544         BUG_ON(irqs_disabled());
22545 -       spin_lock_irq(&n->list_lock);
22546 +       raw_spin_lock_irq(&n->list_lock);
22547         list_for_each_entry_safe(page, h, &n->partial, lru) {
22548                 if (!page->inuse) {
22549                         remove_partial(n, page);
22550 @@ -3658,7 +3727,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
22551                         "Objects remaining in %s on __kmem_cache_shutdown()");
22552                 }
22553         }
22554 -       spin_unlock_irq(&n->list_lock);
22555 +       raw_spin_unlock_irq(&n->list_lock);
22556  
22557         list_for_each_entry_safe(page, h, &discard, lru)
22558                 discard_slab(s, page);
22559 @@ -3916,7 +3985,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
22560                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
22561                         INIT_LIST_HEAD(promote + i);
22562  
22563 -               spin_lock_irqsave(&n->list_lock, flags);
22564 +               raw_spin_lock_irqsave(&n->list_lock, flags);
22565  
22566                 /*
22567                  * Build lists of slabs to discard or promote.
22568 @@ -3947,7 +4016,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
22569                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
22570                         list_splice(promote + i, &n->partial);
22571  
22572 -               spin_unlock_irqrestore(&n->list_lock, flags);
22573 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
22574  
22575                 /* Release empty slabs */
22576                 list_for_each_entry_safe(page, t, &discard, lru)
22577 @@ -4123,6 +4192,12 @@ void __init kmem_cache_init(void)
22578  {
22579         static __initdata struct kmem_cache boot_kmem_cache,
22580                 boot_kmem_cache_node;
22581 +       int cpu;
22582 +
22583 +       for_each_possible_cpu(cpu) {
22584 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
22585 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
22586 +       }
22587  
22588         if (debug_guardpage_minorder())
22589                 slub_max_order = 0;
22590 @@ -4331,7 +4406,7 @@ static int validate_slab_node(struct kmem_cache *s,
22591         struct page *page;
22592         unsigned long flags;
22593  
22594 -       spin_lock_irqsave(&n->list_lock, flags);
22595 +       raw_spin_lock_irqsave(&n->list_lock, flags);
22596  
22597         list_for_each_entry(page, &n->partial, lru) {
22598                 validate_slab_slab(s, page, map);
22599 @@ -4353,7 +4428,7 @@ static int validate_slab_node(struct kmem_cache *s,
22600                        s->name, count, atomic_long_read(&n->nr_slabs));
22601  
22602  out:
22603 -       spin_unlock_irqrestore(&n->list_lock, flags);
22604 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22605         return count;
22606  }
22607  
22608 @@ -4541,12 +4616,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
22609                 if (!atomic_long_read(&n->nr_slabs))
22610                         continue;
22611  
22612 -               spin_lock_irqsave(&n->list_lock, flags);
22613 +               raw_spin_lock_irqsave(&n->list_lock, flags);
22614                 list_for_each_entry(page, &n->partial, lru)
22615                         process_slab(&t, s, page, alloc, map);
22616                 list_for_each_entry(page, &n->full, lru)
22617                         process_slab(&t, s, page, alloc, map);
22618 -               spin_unlock_irqrestore(&n->list_lock, flags);
22619 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
22620         }
22621  
22622         for (i = 0; i < t.count; i++) {
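
The SLUB hunks above follow two PREEMPT_RT conventions: the per-node list_lock becomes a raw_spinlock_t so it can never turn into a sleeping lock, and pages that become free while interrupts are off are parked on the per-CPU slub_free_list, spliced out under its raw lock, and only handed back via free_delayed() once local_irq_restore() has run. Purely as an illustration of that deferral idea, here is a minimal userspace C sketch; the pthread spinlock, the obj type and this free_delayed() are stand-ins invented for the example, not the kernel implementation.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        struct obj *next;
        char data[64];
};

static pthread_spinlock_t list_lock;    /* stand-in for the raw per-CPU lock */
static struct obj *pending;             /* stand-in for slub_free_list */

/* Runs with list_lock held and "interrupts off": only chain the object. */
static void free_later(struct obj *o)
{
        o->next = pending;
        pending = o;
}

/* Runs after the lock is dropped: safe to do the real (possibly slow) free. */
static void free_delayed(struct obj *head)
{
        while (head) {
                struct obj *next = head->next;

                free(head);
                head = next;
        }
}

int main(void)
{
        struct obj *a = malloc(sizeof(*a));
        struct obj *b = malloc(sizeof(*b));
        struct obj *tofree;

        pthread_spin_init(&list_lock, PTHREAD_PROCESS_PRIVATE);

        /* critical section: the objects are retired but not freed yet */
        pthread_spin_lock(&list_lock);
        free_later(a);
        free_later(b);
        tofree = pending;               /* splice out, like list_splice_init() */
        pending = NULL;
        pthread_spin_unlock(&list_lock);

        free_delayed(tofree);           /* outside the lock: free for real */
        printf("deferred objects released\n");
        return 0;
}
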
22623 diff --git a/mm/swap.c b/mm/swap.c
22624 index 4dcf852e1e6d..69c3a5b24060 100644
22625 --- a/mm/swap.c
22626 +++ b/mm/swap.c
22627 @@ -32,6 +32,7 @@
22628  #include <linux/memcontrol.h>
22629  #include <linux/gfp.h>
22630  #include <linux/uio.h>
22631 +#include <linux/locallock.h>
22632  #include <linux/hugetlb.h>
22633  #include <linux/page_idle.h>
22634  
22635 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
22636  #ifdef CONFIG_SMP
22637  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
22638  #endif
22639 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
22640 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
22641  
22642  /*
22643   * This path almost never happens for VM activity - pages are normally
22644 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
22645                 unsigned long flags;
22646  
22647                 get_page(page);
22648 -               local_irq_save(flags);
22649 +               local_lock_irqsave(rotate_lock, flags);
22650                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
22651                 if (!pagevec_add(pvec, page) || PageCompound(page))
22652                         pagevec_move_tail(pvec);
22653 -               local_irq_restore(flags);
22654 +               local_unlock_irqrestore(rotate_lock, flags);
22655         }
22656  }
22657  
22658 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
22659  {
22660         page = compound_head(page);
22661         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
22662 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
22663 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
22664 +                                                      activate_page_pvecs);
22665  
22666                 get_page(page);
22667                 if (!pagevec_add(pvec, page) || PageCompound(page))
22668                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
22669 -               put_cpu_var(activate_page_pvecs);
22670 +               put_locked_var(swapvec_lock, activate_page_pvecs);
22671         }
22672  }
22673  
22674 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
22675  
22676  static void __lru_cache_activate_page(struct page *page)
22677  {
22678 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22679 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22680         int i;
22681  
22682         /*
22683 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
22684                 }
22685         }
22686  
22687 -       put_cpu_var(lru_add_pvec);
22688 +       put_locked_var(swapvec_lock, lru_add_pvec);
22689  }
22690  
22691  /*
22692 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
22693  
22694  static void __lru_cache_add(struct page *page)
22695  {
22696 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22697 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22698  
22699         get_page(page);
22700         if (!pagevec_add(pvec, page) || PageCompound(page))
22701                 __pagevec_lru_add(pvec);
22702 -       put_cpu_var(lru_add_pvec);
22703 +       put_locked_var(swapvec_lock, lru_add_pvec);
22704  }
22705  
22706  /**
22707 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
22708                 unsigned long flags;
22709  
22710                 /* No harm done if a racing interrupt already did this */
22711 -               local_irq_save(flags);
22712 +#ifdef CONFIG_PREEMPT_RT_BASE
22713 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
22714                 pagevec_move_tail(pvec);
22715 -               local_irq_restore(flags);
22716 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
22717 +#else
22718 +               local_lock_irqsave(rotate_lock, flags);
22719 +               pagevec_move_tail(pvec);
22720 +               local_unlock_irqrestore(rotate_lock, flags);
22721 +#endif
22722         }
22723  
22724         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
22725 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
22726                 return;
22727  
22728         if (likely(get_page_unless_zero(page))) {
22729 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
22730 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
22731 +                                                      lru_deactivate_file_pvecs);
22732  
22733                 if (!pagevec_add(pvec, page) || PageCompound(page))
22734                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
22735 -               put_cpu_var(lru_deactivate_file_pvecs);
22736 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
22737         }
22738  }
22739  
22740 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
22741  void deactivate_page(struct page *page)
22742  {
22743         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
22744 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
22745 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
22746 +                                                      lru_deactivate_pvecs);
22747  
22748                 get_page(page);
22749                 if (!pagevec_add(pvec, page) || PageCompound(page))
22750                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
22751 -               put_cpu_var(lru_deactivate_pvecs);
22752 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
22753         }
22754  }
22755  
22756  void lru_add_drain(void)
22757  {
22758 -       lru_add_drain_cpu(get_cpu());
22759 -       put_cpu();
22760 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
22761 +       local_unlock_cpu(swapvec_lock);
22762  }
22763  
22764 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
22765 +#ifdef CONFIG_PREEMPT_RT_BASE
22766 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22767  {
22768 -       lru_add_drain();
22769 +       local_lock_on(swapvec_lock, cpu);
22770 +       lru_add_drain_cpu(cpu);
22771 +       local_unlock_on(swapvec_lock, cpu);
22772  }
22773  
22774 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22775 +#else
22776  
22777  /*
22778   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
22779 @@ -686,6 +701,22 @@ static int __init lru_init(void)
22780  }
22781  early_initcall(lru_init);
22782  
22783 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
22784 +{
22785 +       lru_add_drain();
22786 +}
22787 +
22788 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22789 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22790 +{
22791 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22792 +
22793 +       INIT_WORK(work, lru_add_drain_per_cpu);
22794 +       queue_work_on(cpu, lru_add_drain_wq, work);
22795 +       cpumask_set_cpu(cpu, has_work);
22796 +}
22797 +#endif
22798 +
22799  void lru_add_drain_all(void)
22800  {
22801         static DEFINE_MUTEX(lock);
22802 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
22803         cpumask_clear(&has_work);
22804  
22805         for_each_online_cpu(cpu) {
22806 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22807 -
22808                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
22809                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
22810                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
22811                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
22812 -                   need_activate_page_drain(cpu)) {
22813 -                       INIT_WORK(work, lru_add_drain_per_cpu);
22814 -                       queue_work_on(cpu, lru_add_drain_wq, work);
22815 -                       cpumask_set_cpu(cpu, &has_work);
22816 -               }
22817 +                   need_activate_page_drain(cpu))
22818 +                       remote_lru_add_drain(cpu, &has_work);
22819         }
22820  
22821 +#ifndef CONFIG_PREEMPT_RT_BASE
22822         for_each_cpu(cpu, &has_work)
22823                 flush_work(&per_cpu(lru_add_drain_work, cpu));
22824 +#endif
22825  
22826         put_online_cpus();
22827         mutex_unlock(&lock);
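
The swap.c changes replace get_cpu_var()/local_irq_save() around the per-CPU pagevecs with local locks (swapvec_lock, rotate_lock). On PREEMPT_RT those sections stay preemptible, and lru_add_drain_all() can drain another CPU's pagevecs by taking that CPU's lock (remote_lru_add_drain()) instead of queueing work on it. A rough userspace C analogy of the per-CPU-lock idea, with invented names and a plain mutex standing in for the local lock:

#include <pthread.h>
#include <stdio.h>

#define NCPU   4
#define BATCH  8

static pthread_mutex_t cpu_lock[NCPU];  /* one "local lock" per CPU */
static int batch[NCPU][BATCH];          /* per-CPU pending pages */
static int batch_len[NCPU];

/* Fast path: the owning "CPU" queues a page under its own lock. */
static void local_add(int cpu, int page)
{
        pthread_mutex_lock(&cpu_lock[cpu]);
        if (batch_len[cpu] < BATCH)
                batch[cpu][batch_len[cpu]++] = page;
        pthread_mutex_unlock(&cpu_lock[cpu]);
}

/* Remote drain: any context may flush CPU @cpu by taking the same lock. */
static void remote_drain(int cpu)
{
        pthread_mutex_lock(&cpu_lock[cpu]);
        if (batch_len[cpu])
                printf("drained %d pages from cpu %d\n", batch_len[cpu], cpu);
        batch_len[cpu] = 0;
        pthread_mutex_unlock(&cpu_lock[cpu]);
}

int main(void)
{
        for (int i = 0; i < NCPU; i++)
                pthread_mutex_init(&cpu_lock[i], NULL);

        local_add(1, 42);
        local_add(1, 43);
        for (int i = 0; i < NCPU; i++)
                remote_drain(i);
        return 0;
}
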
22828 diff --git a/mm/truncate.c b/mm/truncate.c
22829 index 8d8c62d89e6d..5bf1bd25d077 100644
22830 --- a/mm/truncate.c
22831 +++ b/mm/truncate.c
22832 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
22833          * protected by mapping->tree_lock.
22834          */
22835         if (!workingset_node_shadows(node) &&
22836 -           !list_empty(&node->private_list))
22837 -               list_lru_del(&workingset_shadow_nodes,
22838 +           !list_empty(&node->private_list)) {
22839 +               local_lock(workingset_shadow_lock);
22840 +               list_lru_del(&__workingset_shadow_nodes,
22841                                 &node->private_list);
22842 +               local_unlock(workingset_shadow_lock);
22843 +       }
22844         __radix_tree_delete_node(&mapping->page_tree, node);
22845  unlock:
22846         spin_unlock_irq(&mapping->tree_lock);
22847 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
22848 index f2481cb4e6b2..db4de08fa97c 100644
22849 --- a/mm/vmalloc.c
22850 +++ b/mm/vmalloc.c
22851 @@ -845,7 +845,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
22852         struct vmap_block *vb;
22853         struct vmap_area *va;
22854         unsigned long vb_idx;
22855 -       int node, err;
22856 +       int node, err, cpu;
22857         void *vaddr;
22858  
22859         node = numa_node_id();
22860 @@ -888,11 +888,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
22861         BUG_ON(err);
22862         radix_tree_preload_end();
22863  
22864 -       vbq = &get_cpu_var(vmap_block_queue);
22865 +       cpu = get_cpu_light();
22866 +       vbq = this_cpu_ptr(&vmap_block_queue);
22867         spin_lock(&vbq->lock);
22868         list_add_tail_rcu(&vb->free_list, &vbq->free);
22869         spin_unlock(&vbq->lock);
22870 -       put_cpu_var(vmap_block_queue);
22871 +       put_cpu_light();
22872  
22873         return vaddr;
22874  }
22875 @@ -961,6 +962,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
22876         struct vmap_block *vb;
22877         void *vaddr = NULL;
22878         unsigned int order;
22879 +       int cpu;
22880  
22881         BUG_ON(offset_in_page(size));
22882         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
22883 @@ -975,7 +977,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
22884         order = get_order(size);
22885  
22886         rcu_read_lock();
22887 -       vbq = &get_cpu_var(vmap_block_queue);
22888 +       cpu = get_cpu_light();
22889 +       vbq = this_cpu_ptr(&vmap_block_queue);
22890         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
22891                 unsigned long pages_off;
22892  
22893 @@ -998,7 +1001,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
22894                 break;
22895         }
22896  
22897 -       put_cpu_var(vmap_block_queue);
22898 +       put_cpu_light();
22899         rcu_read_unlock();
22900  
22901         /* Allocate new block if nothing was found */
22902 diff --git a/mm/vmstat.c b/mm/vmstat.c
22903 index 604f26a4f696..312006d2db50 100644
22904 --- a/mm/vmstat.c
22905 +++ b/mm/vmstat.c
22906 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
22907         long x;
22908         long t;
22909  
22910 +       preempt_disable_rt();
22911         x = delta + __this_cpu_read(*p);
22912  
22913         t = __this_cpu_read(pcp->stat_threshold);
22914 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
22915                 x = 0;
22916         }
22917         __this_cpu_write(*p, x);
22918 +       preempt_enable_rt();
22919  }
22920  EXPORT_SYMBOL(__mod_zone_page_state);
22921  
22922 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
22923         long x;
22924         long t;
22925  
22926 +       preempt_disable_rt();
22927         x = delta + __this_cpu_read(*p);
22928  
22929         t = __this_cpu_read(pcp->stat_threshold);
22930 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
22931                 x = 0;
22932         }
22933         __this_cpu_write(*p, x);
22934 +       preempt_enable_rt();
22935  }
22936  EXPORT_SYMBOL(__mod_node_page_state);
22937  
22938 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
22939         s8 __percpu *p = pcp->vm_stat_diff + item;
22940         s8 v, t;
22941  
22942 +       preempt_disable_rt();
22943         v = __this_cpu_inc_return(*p);
22944         t = __this_cpu_read(pcp->stat_threshold);
22945         if (unlikely(v > t)) {
22946 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
22947                 zone_page_state_add(v + overstep, zone, item);
22948                 __this_cpu_write(*p, -overstep);
22949         }
22950 +       preempt_enable_rt();
22951  }
22952  
22953  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22954 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22955         s8 __percpu *p = pcp->vm_node_stat_diff + item;
22956         s8 v, t;
22957  
22958 +       preempt_disable_rt();
22959         v = __this_cpu_inc_return(*p);
22960         t = __this_cpu_read(pcp->stat_threshold);
22961         if (unlikely(v > t)) {
22962 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22963                 node_page_state_add(v + overstep, pgdat, item);
22964                 __this_cpu_write(*p, -overstep);
22965         }
22966 +       preempt_enable_rt();
22967  }
22968  
22969  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
22970 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
22971         s8 __percpu *p = pcp->vm_stat_diff + item;
22972         s8 v, t;
22973  
22974 +       preempt_disable_rt();
22975         v = __this_cpu_dec_return(*p);
22976         t = __this_cpu_read(pcp->stat_threshold);
22977         if (unlikely(v < - t)) {
22978 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
22979                 zone_page_state_add(v - overstep, zone, item);
22980                 __this_cpu_write(*p, overstep);
22981         }
22982 +       preempt_enable_rt();
22983  }
22984  
22985  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22986 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22987         s8 __percpu *p = pcp->vm_node_stat_diff + item;
22988         s8 v, t;
22989  
22990 +       preempt_disable_rt();
22991         v = __this_cpu_dec_return(*p);
22992         t = __this_cpu_read(pcp->stat_threshold);
22993         if (unlikely(v < - t)) {
22994 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22995                 node_page_state_add(v - overstep, pgdat, item);
22996                 __this_cpu_write(*p, overstep);
22997         }
22998 +       preempt_enable_rt();
22999  }
23000  
23001  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
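
The vmstat.c counter updates are non-atomic read-modify-write sequences that mainline callers perform with preemption implicitly disabled; the added preempt_disable_rt()/preempt_enable_rt() pairs (defined elsewhere in this patch) are no-ops on a stock kernel and only disable preemption on PREEMPT_RT, where the caller could otherwise be migrated mid-update. A compact userspace mock-up of that shape, with the RT switch, threshold and counters invented for the example:

#include <stdio.h>

#ifdef SIMULATE_RT
static int preempt_count_mock;
# define preempt_disable_rt()   (preempt_count_mock++)
# define preempt_enable_rt()    (preempt_count_mock--)
#else
# define preempt_disable_rt()   do { } while (0)
# define preempt_enable_rt()    do { } while (0)
#endif

static long stat_diff;                  /* stands in for per-CPU vm_stat_diff */
static long global_counter;             /* stands in for the zone/node total */
static const long threshold = 32;

/* Non-atomic read-modify-write: the task must not change CPUs mid-update. */
static void mod_state(long delta)
{
        preempt_disable_rt();
        stat_diff += delta;
        if (stat_diff > threshold || stat_diff < -threshold) {
                global_counter += stat_diff;    /* fold into the global count */
                stat_diff = 0;
        }
        preempt_enable_rt();
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                mod_state(1);
        printf("global=%ld pending=%ld\n", global_counter, stat_diff);
        return 0;
}
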
23002 diff --git a/mm/workingset.c b/mm/workingset.c
23003 index fb1f9183d89a..7e6ef1a48cd3 100644
23004 --- a/mm/workingset.c
23005 +++ b/mm/workingset.c
23006 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
23007   * point where they would still be useful.
23008   */
23009  
23010 -struct list_lru workingset_shadow_nodes;
23011 +struct list_lru __workingset_shadow_nodes;
23012 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
23013  
23014  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
23015                                         struct shrink_control *sc)
23016 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
23017         unsigned long pages;
23018  
23019         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
23020 -       local_irq_disable();
23021 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
23022 -       local_irq_enable();
23023 +       local_lock_irq(workingset_shadow_lock);
23024 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
23025 +       local_unlock_irq(workingset_shadow_lock);
23026  
23027         if (sc->memcg) {
23028                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
23029 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
23030         spin_unlock(&mapping->tree_lock);
23031         ret = LRU_REMOVED_RETRY;
23032  out:
23033 -       local_irq_enable();
23034 +       local_unlock_irq(workingset_shadow_lock);
23035         cond_resched();
23036 -       local_irq_disable();
23037 +       local_lock_irq(workingset_shadow_lock);
23038         spin_lock(lru_lock);
23039         return ret;
23040  }
23041 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
23042         unsigned long ret;
23043  
23044         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
23045 -       local_irq_disable();
23046 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
23047 +       local_lock_irq(workingset_shadow_lock);
23048 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
23049                                     shadow_lru_isolate, NULL);
23050 -       local_irq_enable();
23051 +       local_unlock_irq(workingset_shadow_lock);
23052         return ret;
23053  }
23054  
23055 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
23056         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
23057                timestamp_bits, max_order, bucket_order);
23058  
23059 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
23060 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
23061         if (ret)
23062                 goto err;
23063         ret = register_shrinker(&workingset_shadow_shrinker);
23064 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
23065                 goto err_list_lru;
23066         return 0;
23067  err_list_lru:
23068 -       list_lru_destroy(&workingset_shadow_nodes);
23069 +       list_lru_destroy(&__workingset_shadow_nodes);
23070  err:
23071         return ret;
23072  }
23073 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
23074 index b0bc023d25c5..5af6426fbcbe 100644
23075 --- a/mm/zsmalloc.c
23076 +++ b/mm/zsmalloc.c
23077 @@ -53,6 +53,7 @@
23078  #include <linux/mount.h>
23079  #include <linux/migrate.h>
23080  #include <linux/pagemap.h>
23081 +#include <linux/locallock.h>
23082  
23083  #define ZSPAGE_MAGIC   0x58
23084  
23085 @@ -70,9 +71,22 @@
23086   */
23087  #define ZS_MAX_ZSPAGE_ORDER 2
23088  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
23089 -
23090  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
23091  
23092 +#ifdef CONFIG_PREEMPT_RT_FULL
23093 +
23094 +struct zsmalloc_handle {
23095 +       unsigned long addr;
23096 +       struct mutex lock;
23097 +};
23098 +
23099 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
23100 +
23101 +#else
23102 +
23103 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
23104 +#endif
23105 +
23106  /*
23107   * Object location (<PFN>, <obj_idx>) is encoded as
23108   * as single (unsigned long) handle value.
23109 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
23110  
23111  static int create_cache(struct zs_pool *pool)
23112  {
23113 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
23114 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
23115                                         0, 0, NULL);
23116         if (!pool->handle_cachep)
23117                 return 1;
23118 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
23119  
23120  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
23121  {
23122 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
23123 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
23124 +       void *p;
23125 +
23126 +       p = kmem_cache_alloc(pool->handle_cachep,
23127 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
23128 +#ifdef CONFIG_PREEMPT_RT_FULL
23129 +       if (p) {
23130 +               struct zsmalloc_handle *zh = p;
23131 +
23132 +               mutex_init(&zh->lock);
23133 +       }
23134 +#endif
23135 +       return (unsigned long)p;
23136  }
23137  
23138 +#ifdef CONFIG_PREEMPT_RT_FULL
23139 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
23140 +{
23141 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
23142 +}
23143 +#endif
23144 +
23145  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
23146  {
23147         kmem_cache_free(pool->handle_cachep, (void *)handle);
23148 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
23149  
23150  static void record_obj(unsigned long handle, unsigned long obj)
23151  {
23152 +#ifdef CONFIG_PREEMPT_RT_FULL
23153 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23154 +
23155 +       WRITE_ONCE(zh->addr, obj);
23156 +#else
23157         /*
23158          * lsb of @obj represents handle lock while other bits
23159          * represent object value the handle is pointing so
23160          * updating shouldn't do store tearing.
23161          */
23162         WRITE_ONCE(*(unsigned long *)handle, obj);
23163 +#endif
23164  }
23165  
23166  /* zpool driver */
23167 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
23168  
23169  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
23170  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
23171 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
23172  
23173  static bool is_zspage_isolated(struct zspage *zspage)
23174  {
23175 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
23176  
23177  static unsigned long handle_to_obj(unsigned long handle)
23178  {
23179 +#ifdef CONFIG_PREEMPT_RT_FULL
23180 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23181 +
23182 +       return zh->addr;
23183 +#else
23184         return *(unsigned long *)handle;
23185 +#endif
23186  }
23187  
23188  static unsigned long obj_to_head(struct page *page, void *obj)
23189 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
23190  
23191  static inline int testpin_tag(unsigned long handle)
23192  {
23193 +#ifdef CONFIG_PREEMPT_RT_FULL
23194 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23195 +
23196 +       return mutex_is_locked(&zh->lock);
23197 +#else
23198         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
23199 +#endif
23200  }
23201  
23202  static inline int trypin_tag(unsigned long handle)
23203  {
23204 +#ifdef CONFIG_PREEMPT_RT_FULL
23205 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23206 +
23207 +       return mutex_trylock(&zh->lock);
23208 +#else
23209         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
23210 +#endif
23211  }
23212  
23213  static void pin_tag(unsigned long handle)
23214  {
23215 +#ifdef CONFIG_PREEMPT_RT_FULL
23216 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23217 +
23218 +       return mutex_lock(&zh->lock);
23219 +#else
23220         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
23221 +#endif
23222  }
23223  
23224  static void unpin_tag(unsigned long handle)
23225  {
23226 +#ifdef CONFIG_PREEMPT_RT_FULL
23227 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23228 +
23229 +       return mutex_unlock(&zh->lock);
23230 +#else
23231         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
23232 +#endif
23233  }
23234  
23235  static void reset_page(struct page *page)
23236 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
23237         class = pool->size_class[class_idx];
23238         off = (class->size * obj_idx) & ~PAGE_MASK;
23239  
23240 -       area = &get_cpu_var(zs_map_area);
23241 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
23242         area->vm_mm = mm;
23243         if (off + class->size <= PAGE_SIZE) {
23244                 /* this object is contained entirely within a page */
23245 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
23246  
23247                 __zs_unmap_object(area, pages, off, class->size);
23248         }
23249 -       put_cpu_var(zs_map_area);
23250 +       put_locked_var(zs_map_area_lock, zs_map_area);
23251  
23252         migrate_read_unlock(zspage);
23253         unpin_tag(handle);
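
The zsmalloc hunks trade the bit spinlock packed into the handle word (HANDLE_PIN_BIT) for a mutex embedded in a larger zsmalloc_handle when PREEMPT_RT_FULL is set, so pinning an object uses a sleeping lock instead of a busy-wait. The sketch below contrasts the two schemes in userspace C; SIMULATE_RT, the handle layout and the atomics are illustrative only:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define PIN_BIT 1UL

#ifdef SIMULATE_RT
struct handle {
        unsigned long addr;
        pthread_mutex_t lock;           /* sleeping lock, allowed under RT rules */
};

static void pin(struct handle *h)   { pthread_mutex_lock(&h->lock); }
static void unpin(struct handle *h) { pthread_mutex_unlock(&h->lock); }
#else
struct handle {
        _Atomic unsigned long addr;     /* value in the high bits, lock in bit 0 */
};

static void pin(struct handle *h)
{
        /* bit spinlock: busy-wait until bit 0 was previously clear */
        while (atomic_fetch_or(&h->addr, PIN_BIT) & PIN_BIT)
                ;
}

static void unpin(struct handle *h)
{
        atomic_fetch_and(&h->addr, ~PIN_BIT);
}
#endif

int main(void)
{
        struct handle h = { .addr = 0x1000 };

#ifdef SIMULATE_RT
        pthread_mutex_init(&h.lock, NULL);
#endif
        pin(&h);
        printf("object pinned, value still reachable\n");
        unpin(&h);
        return 0;
}
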
23254 diff --git a/net/core/dev.c b/net/core/dev.c
23255 index e1d731fdc72c..6ab4b7863755 100644
23256 --- a/net/core/dev.c
23257 +++ b/net/core/dev.c
23258 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
23259  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
23260  
23261  static seqcount_t devnet_rename_seq;
23262 +static DEFINE_MUTEX(devnet_rename_mutex);
23263  
23264  static inline void dev_base_seq_inc(struct net *net)
23265  {
23266 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
23267  static inline void rps_lock(struct softnet_data *sd)
23268  {
23269  #ifdef CONFIG_RPS
23270 -       spin_lock(&sd->input_pkt_queue.lock);
23271 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
23272  #endif
23273  }
23274  
23275  static inline void rps_unlock(struct softnet_data *sd)
23276  {
23277  #ifdef CONFIG_RPS
23278 -       spin_unlock(&sd->input_pkt_queue.lock);
23279 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
23280  #endif
23281  }
23282  
23283 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
23284         strcpy(name, dev->name);
23285         rcu_read_unlock();
23286         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
23287 -               cond_resched();
23288 +               mutex_lock(&devnet_rename_mutex);
23289 +               mutex_unlock(&devnet_rename_mutex);
23290                 goto retry;
23291         }
23292  
23293 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
23294         if (dev->flags & IFF_UP)
23295                 return -EBUSY;
23296  
23297 -       write_seqcount_begin(&devnet_rename_seq);
23298 +       mutex_lock(&devnet_rename_mutex);
23299 +       __raw_write_seqcount_begin(&devnet_rename_seq);
23300  
23301 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
23302 -               write_seqcount_end(&devnet_rename_seq);
23303 -               return 0;
23304 -       }
23305 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
23306 +               goto outunlock;
23307  
23308         memcpy(oldname, dev->name, IFNAMSIZ);
23309  
23310         err = dev_get_valid_name(net, dev, newname);
23311 -       if (err < 0) {
23312 -               write_seqcount_end(&devnet_rename_seq);
23313 -               return err;
23314 -       }
23315 +       if (err < 0)
23316 +               goto outunlock;
23317  
23318         if (oldname[0] && !strchr(oldname, '%'))
23319                 netdev_info(dev, "renamed from %s\n", oldname);
23320 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
23321         if (ret) {
23322                 memcpy(dev->name, oldname, IFNAMSIZ);
23323                 dev->name_assign_type = old_assign_type;
23324 -               write_seqcount_end(&devnet_rename_seq);
23325 -               return ret;
23326 +               err = ret;
23327 +               goto outunlock;
23328         }
23329  
23330 -       write_seqcount_end(&devnet_rename_seq);
23331 +       __raw_write_seqcount_end(&devnet_rename_seq);
23332 +       mutex_unlock(&devnet_rename_mutex);
23333  
23334         netdev_adjacent_rename_links(dev, oldname);
23335  
23336 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
23337                 /* err >= 0 after dev_alloc_name() or stores the first errno */
23338                 if (err >= 0) {
23339                         err = ret;
23340 -                       write_seqcount_begin(&devnet_rename_seq);
23341 +                       mutex_lock(&devnet_rename_mutex);
23342 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
23343                         memcpy(dev->name, oldname, IFNAMSIZ);
23344                         memcpy(oldname, newname, IFNAMSIZ);
23345                         dev->name_assign_type = old_assign_type;
23346 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
23347         }
23348  
23349         return err;
23350 +
23351 +outunlock:
23352 +       __raw_write_seqcount_end(&devnet_rename_seq);
23353 +       mutex_unlock(&devnet_rename_mutex);
23354 +       return err;
23355  }
23356  
23357  /**
23358 @@ -2263,6 +2269,7 @@ static void __netif_reschedule(struct Qdisc *q)
23359         sd->output_queue_tailp = &q->next_sched;
23360         raise_softirq_irqoff(NET_TX_SOFTIRQ);
23361         local_irq_restore(flags);
23362 +       preempt_check_resched_rt();
23363  }
23364  
23365  void __netif_schedule(struct Qdisc *q)
23366 @@ -2344,6 +2351,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
23367         __this_cpu_write(softnet_data.completion_queue, skb);
23368         raise_softirq_irqoff(NET_TX_SOFTIRQ);
23369         local_irq_restore(flags);
23370 +       preempt_check_resched_rt();
23371  }
23372  EXPORT_SYMBOL(__dev_kfree_skb_irq);
23373  
23374 @@ -3078,7 +3086,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
23375          * This permits qdisc->running owner to get the lock more
23376          * often and dequeue packets faster.
23377          */
23378 +#ifdef CONFIG_PREEMPT_RT_FULL
23379 +       contended = true;
23380 +#else
23381         contended = qdisc_is_running(q);
23382 +#endif
23383         if (unlikely(contended))
23384                 spin_lock(&q->busylock);
23385  
23386 @@ -3141,8 +3153,10 @@ static void skb_update_prio(struct sk_buff *skb)
23387  #define skb_update_prio(skb)
23388  #endif
23389  
23390 +#ifndef CONFIG_PREEMPT_RT_FULL
23391  DEFINE_PER_CPU(int, xmit_recursion);
23392  EXPORT_SYMBOL(xmit_recursion);
23393 +#endif
23394  
23395  /**
23396   *     dev_loopback_xmit - loop back @skb
23397 @@ -3376,8 +3390,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
23398                 int cpu = smp_processor_id(); /* ok because BHs are off */
23399  
23400                 if (txq->xmit_lock_owner != cpu) {
23401 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
23402 -                                    XMIT_RECURSION_LIMIT))
23403 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
23404                                 goto recursion_alert;
23405  
23406                         skb = validate_xmit_skb(skb, dev);
23407 @@ -3387,9 +3400,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
23408                         HARD_TX_LOCK(dev, txq, cpu);
23409  
23410                         if (!netif_xmit_stopped(txq)) {
23411 -                               __this_cpu_inc(xmit_recursion);
23412 +                               xmit_rec_inc();
23413                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
23414 -                               __this_cpu_dec(xmit_recursion);
23415 +                               xmit_rec_dec();
23416                                 if (dev_xmit_complete(rc)) {
23417                                         HARD_TX_UNLOCK(dev, txq);
23418                                         goto out;
23419 @@ -3763,6 +3776,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
23420         rps_unlock(sd);
23421  
23422         local_irq_restore(flags);
23423 +       preempt_check_resched_rt();
23424  
23425         atomic_long_inc(&skb->dev->rx_dropped);
23426         kfree_skb(skb);
23427 @@ -3781,7 +3795,7 @@ static int netif_rx_internal(struct sk_buff *skb)
23428                 struct rps_dev_flow voidflow, *rflow = &voidflow;
23429                 int cpu;
23430  
23431 -               preempt_disable();
23432 +               migrate_disable();
23433                 rcu_read_lock();
23434  
23435                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
23436 @@ -3791,13 +3805,13 @@ static int netif_rx_internal(struct sk_buff *skb)
23437                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
23438  
23439                 rcu_read_unlock();
23440 -               preempt_enable();
23441 +               migrate_enable();
23442         } else
23443  #endif
23444         {
23445                 unsigned int qtail;
23446 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
23447 -               put_cpu();
23448 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
23449 +               put_cpu_light();
23450         }
23451         return ret;
23452  }
23453 @@ -3831,11 +3845,9 @@ int netif_rx_ni(struct sk_buff *skb)
23454  
23455         trace_netif_rx_ni_entry(skb);
23456  
23457 -       preempt_disable();
23458 +       local_bh_disable();
23459         err = netif_rx_internal(skb);
23460 -       if (local_softirq_pending())
23461 -               do_softirq();
23462 -       preempt_enable();
23463 +       local_bh_enable();
23464  
23465         return err;
23466  }
23467 @@ -4314,7 +4326,7 @@ static void flush_backlog(struct work_struct *work)
23468         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
23469                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23470                         __skb_unlink(skb, &sd->input_pkt_queue);
23471 -                       kfree_skb(skb);
23472 +                       __skb_queue_tail(&sd->tofree_queue, skb);
23473                         input_queue_head_incr(sd);
23474                 }
23475         }
23476 @@ -4324,11 +4336,14 @@ static void flush_backlog(struct work_struct *work)
23477         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
23478                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23479                         __skb_unlink(skb, &sd->process_queue);
23480 -                       kfree_skb(skb);
23481 +                       __skb_queue_tail(&sd->tofree_queue, skb);
23482                         input_queue_head_incr(sd);
23483                 }
23484         }
23485 +       if (!skb_queue_empty(&sd->tofree_queue))
23486 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
23487         local_bh_enable();
23488 +
23489  }
23490  
23491  static void flush_all_backlogs(void)
23492 @@ -4809,6 +4824,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
23493                 sd->rps_ipi_list = NULL;
23494  
23495                 local_irq_enable();
23496 +               preempt_check_resched_rt();
23497  
23498                 /* Send pending IPI's to kick RPS processing on remote cpus. */
23499                 while (remsd) {
23500 @@ -4822,6 +4838,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
23501         } else
23502  #endif
23503                 local_irq_enable();
23504 +       preempt_check_resched_rt();
23505  }
23506  
23507  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
23508 @@ -4851,7 +4868,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
23509         while (again) {
23510                 struct sk_buff *skb;
23511  
23512 +               local_irq_disable();
23513                 while ((skb = __skb_dequeue(&sd->process_queue))) {
23514 +                       local_irq_enable();
23515                         rcu_read_lock();
23516                         __netif_receive_skb(skb);
23517                         rcu_read_unlock();
23518 @@ -4859,9 +4878,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
23519                         if (++work >= quota)
23520                                 return work;
23521  
23522 +                       local_irq_disable();
23523                 }
23524  
23525 -               local_irq_disable();
23526                 rps_lock(sd);
23527                 if (skb_queue_empty(&sd->input_pkt_queue)) {
23528                         /*
23529 @@ -4899,9 +4918,11 @@ void __napi_schedule(struct napi_struct *n)
23530         local_irq_save(flags);
23531         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23532         local_irq_restore(flags);
23533 +       preempt_check_resched_rt();
23534  }
23535  EXPORT_SYMBOL(__napi_schedule);
23536  
23537 +#ifndef CONFIG_PREEMPT_RT_FULL
23538  /**
23539   * __napi_schedule_irqoff - schedule for receive
23540   * @n: entry to schedule
23541 @@ -4913,6 +4934,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
23542         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23543  }
23544  EXPORT_SYMBOL(__napi_schedule_irqoff);
23545 +#endif
23546  
23547  void __napi_complete(struct napi_struct *n)
23548  {
23549 @@ -5202,13 +5224,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
23550         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
23551         unsigned long time_limit = jiffies + 2;
23552         int budget = netdev_budget;
23553 +       struct sk_buff_head tofree_q;
23554 +       struct sk_buff *skb;
23555         LIST_HEAD(list);
23556         LIST_HEAD(repoll);
23557  
23558 +       __skb_queue_head_init(&tofree_q);
23559 +
23560         local_irq_disable();
23561 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
23562         list_splice_init(&sd->poll_list, &list);
23563         local_irq_enable();
23564  
23565 +       while ((skb = __skb_dequeue(&tofree_q)))
23566 +               kfree_skb(skb);
23567 +
23568         for (;;) {
23569                 struct napi_struct *n;
23570  
23571 @@ -5239,7 +5269,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
23572         list_splice_tail(&repoll, &list);
23573         list_splice(&list, &sd->poll_list);
23574         if (!list_empty(&sd->poll_list))
23575 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
23576 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
23577  
23578         net_rps_action_and_irq_enable(sd);
23579  }
23580 @@ -8000,16 +8030,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
23581  
23582         raise_softirq_irqoff(NET_TX_SOFTIRQ);
23583         local_irq_enable();
23584 +       preempt_check_resched_rt();
23585  
23586         /* Process offline CPU's input_pkt_queue */
23587         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
23588                 netif_rx_ni(skb);
23589                 input_queue_head_incr(oldsd);
23590         }
23591 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
23592 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
23593                 netif_rx_ni(skb);
23594                 input_queue_head_incr(oldsd);
23595         }
23596 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
23597 +               kfree_skb(skb);
23598 +       }
23599  
23600         return NOTIFY_OK;
23601  }
23602 @@ -8314,8 +8348,9 @@ static int __init net_dev_init(void)
23603  
23604                 INIT_WORK(flush, flush_backlog);
23605  
23606 -               skb_queue_head_init(&sd->input_pkt_queue);
23607 -               skb_queue_head_init(&sd->process_queue);
23608 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
23609 +               skb_queue_head_init_raw(&sd->process_queue);
23610 +               skb_queue_head_init_raw(&sd->tofree_queue);
23611                 INIT_LIST_HEAD(&sd->poll_list);
23612                 sd->output_queue_tailp = &sd->output_queue;
23613  #ifdef CONFIG_RPS
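
Several of the dev.c hunks are about execution context rather than data: migrate_disable() replaces preempt_disable() around the RPS CPU lookup, preempt_check_resched_rt() follows local_irq_restore(), and the transmit path counts recursion through xmit_rec_read()/xmit_rec_inc()/xmit_rec_dec() helpers, because a bare per-CPU counter is only meaningful while the transmitting task cannot move. A userspace analogy of that last point using a thread-local counter; the helper bodies and the toy transmit path are made up, only the helper names and XMIT_RECURSION_LIMIT come from the patch:

#include <stdio.h>

#define XMIT_RECURSION_LIMIT 8

static _Thread_local int xmit_recursion;   /* follows the "task", not the CPU */

static int  xmit_rec_read(void) { return xmit_recursion; }
static void xmit_rec_inc(void)  { xmit_recursion++; }
static void xmit_rec_dec(void)  { xmit_recursion--; }

/* Toy transmit path that may re-enter itself (e.g. a tunnel over a tunnel). */
static int queue_xmit(int nested)
{
        int ret;

        if (xmit_rec_read() > XMIT_RECURSION_LIMIT) {
                fprintf(stderr, "recursion limit hit, dropping packet\n");
                return -1;
        }

        xmit_rec_inc();
        ret = nested ? queue_xmit(nested - 1) : 0;      /* nested transmit */
        xmit_rec_dec();
        return ret;
}

int main(void)
{
        printf("shallow stack: %d\n", queue_xmit(3));
        printf("deep stack:    %d\n", queue_xmit(32));
        return 0;
}
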
23614 diff --git a/net/core/filter.c b/net/core/filter.c
23615 index b391209838ef..b86e9681a88e 100644
23616 --- a/net/core/filter.c
23617 +++ b/net/core/filter.c
23618 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
23619  {
23620         int ret;
23621  
23622 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
23623 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
23624                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
23625                 kfree_skb(skb);
23626                 return -ENETDOWN;
23627 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
23628  
23629         skb->dev = dev;
23630  
23631 -       __this_cpu_inc(xmit_recursion);
23632 +       xmit_rec_inc();
23633         ret = dev_queue_xmit(skb);
23634 -       __this_cpu_dec(xmit_recursion);
23635 +       xmit_rec_dec();
23636  
23637         return ret;
23638  }
23639 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
23640 index cad8e791f28e..2a9364fe62a5 100644
23641 --- a/net/core/gen_estimator.c
23642 +++ b/net/core/gen_estimator.c
23643 @@ -84,7 +84,7 @@ struct gen_estimator
23644         struct gnet_stats_basic_packed  *bstats;
23645         struct gnet_stats_rate_est64    *rate_est;
23646         spinlock_t              *stats_lock;
23647 -       seqcount_t              *running;
23648 +       net_seqlock_t           *running;
23649         int                     ewma_log;
23650         u32                     last_packets;
23651         unsigned long           avpps;
23652 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
23653                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23654                       struct gnet_stats_rate_est64 *rate_est,
23655                       spinlock_t *stats_lock,
23656 -                     seqcount_t *running,
23657 +                     net_seqlock_t *running,
23658                       struct nlattr *opt)
23659  {
23660         struct gen_estimator *est;
23661 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
23662                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23663                           struct gnet_stats_rate_est64 *rate_est,
23664                           spinlock_t *stats_lock,
23665 -                         seqcount_t *running, struct nlattr *opt)
23666 +                         net_seqlock_t *running, struct nlattr *opt)
23667  {
23668         gen_kill_estimator(bstats, rate_est);
23669         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
23670 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
23671 index 508e051304fb..bc3b17b78c94 100644
23672 --- a/net/core/gen_stats.c
23673 +++ b/net/core/gen_stats.c
23674 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
23675  }
23676  
23677  void
23678 -__gnet_stats_copy_basic(const seqcount_t *running,
23679 +__gnet_stats_copy_basic(net_seqlock_t *running,
23680                         struct gnet_stats_basic_packed *bstats,
23681                         struct gnet_stats_basic_cpu __percpu *cpu,
23682                         struct gnet_stats_basic_packed *b)
23683 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
23684         }
23685         do {
23686                 if (running)
23687 -                       seq = read_seqcount_begin(running);
23688 +                       seq = net_seq_begin(running);
23689                 bstats->bytes = b->bytes;
23690                 bstats->packets = b->packets;
23691 -       } while (running && read_seqcount_retry(running, seq));
23692 +       } while (running && net_seq_retry(running, seq));
23693  }
23694  EXPORT_SYMBOL(__gnet_stats_copy_basic);
23695  
23696 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
23697   * if the room in the socket buffer was not sufficient.
23698   */
23699  int
23700 -gnet_stats_copy_basic(const seqcount_t *running,
23701 +gnet_stats_copy_basic(net_seqlock_t *running,
23702                       struct gnet_dump *d,
23703                       struct gnet_stats_basic_cpu __percpu *cpu,
23704                       struct gnet_stats_basic_packed *b)
23705 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
23706 index 1e3e0087245b..1077b39db717 100644
23707 --- a/net/core/skbuff.c
23708 +++ b/net/core/skbuff.c
23709 @@ -64,6 +64,7 @@
23710  #include <linux/errqueue.h>
23711  #include <linux/prefetch.h>
23712  #include <linux/if_vlan.h>
23713 +#include <linux/locallock.h>
23714  
23715  #include <net/protocol.h>
23716  #include <net/dst.h>
23717 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
23718  
23719  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
23720  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
23721 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
23722 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
23723  
23724  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23725  {
23726 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23727         unsigned long flags;
23728         void *data;
23729  
23730 -       local_irq_save(flags);
23731 +       local_lock_irqsave(netdev_alloc_lock, flags);
23732         nc = this_cpu_ptr(&netdev_alloc_cache);
23733         data = __alloc_page_frag(nc, fragsz, gfp_mask);
23734 -       local_irq_restore(flags);
23735 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
23736         return data;
23737  }
23738  
23739 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
23740  
23741  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23742  {
23743 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23744 +       struct napi_alloc_cache *nc;
23745 +       void *data;
23746  
23747 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23748 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23749 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23750 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23751 +       return data;
23752  }
23753  
23754  void *napi_alloc_frag(unsigned int fragsz)
23755 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
23756         if (sk_memalloc_socks())
23757                 gfp_mask |= __GFP_MEMALLOC;
23758  
23759 -       local_irq_save(flags);
23760 +       local_lock_irqsave(netdev_alloc_lock, flags);
23761  
23762         nc = this_cpu_ptr(&netdev_alloc_cache);
23763         data = __alloc_page_frag(nc, len, gfp_mask);
23764         pfmemalloc = nc->pfmemalloc;
23765  
23766 -       local_irq_restore(flags);
23767 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
23768  
23769         if (unlikely(!data))
23770                 return NULL;
23771 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
23772  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
23773                                  gfp_t gfp_mask)
23774  {
23775 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23776 +       struct napi_alloc_cache *nc;
23777         struct sk_buff *skb;
23778         void *data;
23779 +       bool pfmemalloc;
23780  
23781         len += NET_SKB_PAD + NET_IP_ALIGN;
23782  
23783 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
23784         if (sk_memalloc_socks())
23785                 gfp_mask |= __GFP_MEMALLOC;
23786  
23787 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23788         data = __alloc_page_frag(&nc->page, len, gfp_mask);
23789 +       pfmemalloc = nc->page.pfmemalloc;
23790 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23791         if (unlikely(!data))
23792                 return NULL;
23793  
23794 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
23795         }
23796  
23797         /* use OR instead of assignment to avoid clearing of bits in mask */
23798 -       if (nc->page.pfmemalloc)
23799 +       if (pfmemalloc)
23800                 skb->pfmemalloc = 1;
23801         skb->head_frag = 1;
23802  
23803 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
23804  
23805  void __kfree_skb_flush(void)
23806  {
23807 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23808 +       struct napi_alloc_cache *nc;
23809  
23810 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23811         /* flush skb_cache if containing objects */
23812         if (nc->skb_count) {
23813                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
23814                                      nc->skb_cache);
23815                 nc->skb_count = 0;
23816         }
23817 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23818  }
23819  
23820  static inline void _kfree_skb_defer(struct sk_buff *skb)
23821  {
23822 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23823 +       struct napi_alloc_cache *nc;
23824  
23825         /* drop skb->head and call any destructors for packet */
23826         skb_release_all(skb);
23827  
23828 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23829         /* record skb to CPU local list */
23830         nc->skb_cache[nc->skb_count++] = skb;
23831  
23832 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
23833                                      nc->skb_cache);
23834                 nc->skb_count = 0;
23835         }
23836 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23837  }
23838  void __kfree_skb_defer(struct sk_buff *skb)
23839  {
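
Editor's note on the skbuff conversion above: local_irq_save() around the per-CPU frag caches would make the section non-preemptible, which RT avoids by using local locks declared with DEFINE_LOCAL_IRQ_LOCK(). A minimal sketch of the pattern, assuming the <linux/locallock.h> API this patch adds; example_lock, example_cache and both functions are hypothetical, not code from this diff:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_cache);
static DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_update(int v)
{
	unsigned long flags;
	int *p;

	/* Was: local_irq_save(flags). On !RT this still disables IRQs;
	 * on RT it takes a per-CPU sleeping lock instead. */
	local_lock_irqsave(example_lock, flags);
	p = this_cpu_ptr(&example_cache);
	*p += v;
	local_unlock_irqrestore(example_lock, flags);
}

static int example_read(void)
{
	/* get_locked_var() takes the local lock and returns this CPU's copy */
	int *p = &get_locked_var(example_lock, example_cache);
	int v = *p;

	put_locked_var(example_lock, example_cache);
	return v;
}

The same pattern explains the icmp_sk_lock and tcp_sk_lock additions in the ipv4 hunks below.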
23840 diff --git a/net/core/sock.c b/net/core/sock.c
23841 index bc6543f7de36..2c32ee79620f 100644
23842 --- a/net/core/sock.c
23843 +++ b/net/core/sock.c
23844 @@ -2488,12 +2488,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
23845         if (sk->sk_lock.owned)
23846                 __lock_sock(sk);
23847         sk->sk_lock.owned = 1;
23848 -       spin_unlock(&sk->sk_lock.slock);
23849 +       spin_unlock_bh(&sk->sk_lock.slock);
23850         /*
23851          * The sk_lock has mutex_lock() semantics here:
23852          */
23853         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
23854 -       local_bh_enable();
23855  }
23856  EXPORT_SYMBOL(lock_sock_nested);
23857  
23858 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
23859 index 48734ee6293f..e6864ff11352 100644
23860 --- a/net/ipv4/icmp.c
23861 +++ b/net/ipv4/icmp.c
23862 @@ -69,6 +69,7 @@
23863  #include <linux/jiffies.h>
23864  #include <linux/kernel.h>
23865  #include <linux/fcntl.h>
23866 +#include <linux/sysrq.h>
23867  #include <linux/socket.h>
23868  #include <linux/in.h>
23869  #include <linux/inet.h>
23870 @@ -77,6 +78,7 @@
23871  #include <linux/string.h>
23872  #include <linux/netfilter_ipv4.h>
23873  #include <linux/slab.h>
23874 +#include <linux/locallock.h>
23875  #include <net/snmp.h>
23876  #include <net/ip.h>
23877  #include <net/route.h>
23878 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
23879   *
23880   *     On SMP we have one ICMP socket per-cpu.
23881   */
23882 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
23883 +
23884  static struct sock *icmp_sk(struct net *net)
23885  {
23886         return *this_cpu_ptr(net->ipv4.icmp_sk);
23887 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
23888  
23889         local_bh_disable();
23890  
23891 +       local_lock(icmp_sk_lock);
23892         sk = icmp_sk(net);
23893  
23894         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
23895                 /* This can happen if the output path signals a
23896                  * dst_link_failure() for an outgoing ICMP packet.
23897                  */
23898 +               local_unlock(icmp_sk_lock);
23899                 local_bh_enable();
23900                 return NULL;
23901         }
23902 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
23903  static inline void icmp_xmit_unlock(struct sock *sk)
23904  {
23905         spin_unlock_bh(&sk->sk_lock.slock);
23906 +       local_unlock(icmp_sk_lock);
23907  }
23908  
23909  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
23910 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
23911         struct sock *sk;
23912         struct sk_buff *skb;
23913  
23914 +       local_lock(icmp_sk_lock);
23915         sk = icmp_sk(dev_net((*rt)->dst.dev));
23916         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
23917                            icmp_param->data_len+icmp_param->head_len,
23918 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
23919                 skb->ip_summed = CHECKSUM_NONE;
23920                 ip_push_pending_frames(sk, fl4);
23921         }
23922 +       local_unlock(icmp_sk_lock);
23923  }
23924  
23925  /*
23926 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
23927  }
23928  
23929  /*
23930 + * 32bit and 64bit have different timestamp lengths, so we check for
23931 + * the cookie at offset 20 and verify it is repeated at offset 50
23932 + */
23933 +#define CO_POS0                20
23934 +#define CO_POS1                50
23935 +#define CO_SIZE                sizeof(int)
23936 +#define ICMP_SYSRQ_SIZE        57
23937 +
23938 +/*
23939 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
23940 + * pattern and if it matches send the next byte as a trigger to sysrq.
23941 + */
23942 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
23943 +{
23944 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
23945 +       char *p = skb->data;
23946 +
23947 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
23948 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
23949 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
23950 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
23951 +}
23952 +
23953 +/*
23954   *     Handle ICMP_ECHO ("ping") requests.
23955   *
23956   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
23957 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
23958                 icmp_param.data_len        = skb->len;
23959                 icmp_param.head_len        = sizeof(struct icmphdr);
23960                 icmp_reply(&icmp_param, skb);
23961 +
23962 +               if (skb->len == ICMP_SYSRQ_SIZE &&
23963 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
23964 +                       icmp_check_sysrq(net, skb);
23965 +               }
23966         }
23967         /* should there be an ICMP stat for ignored echos? */
23968         return true;
23969 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
23970 index 80bc36b25de2..215b90adfb05 100644
23971 --- a/net/ipv4/sysctl_net_ipv4.c
23972 +++ b/net/ipv4/sysctl_net_ipv4.c
23973 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
23974                 .proc_handler   = proc_dointvec
23975         },
23976         {
23977 +               .procname       = "icmp_echo_sysrq",
23978 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
23979 +               .maxlen         = sizeof(int),
23980 +               .mode           = 0644,
23981 +               .proc_handler   = proc_dointvec
23982 +       },
23983 +       {
23984                 .procname       = "icmp_ignore_bogus_error_responses",
23985                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
23986                 .maxlen         = sizeof(int),
23987 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
23988 index 2259114c7242..829e60985a81 100644
23989 --- a/net/ipv4/tcp_ipv4.c
23990 +++ b/net/ipv4/tcp_ipv4.c
23991 @@ -62,6 +62,7 @@
23992  #include <linux/init.h>
23993  #include <linux/times.h>
23994  #include <linux/slab.h>
23995 +#include <linux/locallock.h>
23996  
23997  #include <net/net_namespace.h>
23998  #include <net/icmp.h>
23999 @@ -564,6 +565,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
24000  }
24001  EXPORT_SYMBOL(tcp_v4_send_check);
24002  
24003 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
24004  /*
24005   *     This routine will send an RST to the other tcp.
24006   *
24007 @@ -691,6 +693,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
24008                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
24009  
24010         arg.tos = ip_hdr(skb)->tos;
24011 +
24012 +       local_lock(tcp_sk_lock);
24013         local_bh_disable();
24014         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
24015                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
24016 @@ -700,6 +704,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
24017         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
24018         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
24019         local_bh_enable();
24020 +       local_unlock(tcp_sk_lock);
24021  
24022  #ifdef CONFIG_TCP_MD5SIG
24023  out:
24024 @@ -775,6 +780,7 @@ static void tcp_v4_send_ack(struct net *net,
24025         if (oif)
24026                 arg.bound_dev_if = oif;
24027         arg.tos = tos;
24028 +       local_lock(tcp_sk_lock);
24029         local_bh_disable();
24030         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
24031                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
24032 @@ -783,6 +789,7 @@ static void tcp_v4_send_ack(struct net *net,
24033  
24034         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
24035         local_bh_enable();
24036 +       local_unlock(tcp_sk_lock);
24037  }
24038  
24039  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
24040 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
24041 index a47bbc973f2d..c1c1c64589d9 100644
24042 --- a/net/mac80211/rx.c
24043 +++ b/net/mac80211/rx.c
24044 @@ -4156,7 +4156,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
24045         struct ieee80211_supported_band *sband;
24046         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
24047  
24048 -       WARN_ON_ONCE(softirq_count() == 0);
24049 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
24050  
24051         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
24052                 goto drop;
24053 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
24054 index 004af030ef1a..b64f751bda45 100644
24055 --- a/net/netfilter/core.c
24056 +++ b/net/netfilter/core.c
24057 @@ -22,12 +22,18 @@
24058  #include <linux/proc_fs.h>
24059  #include <linux/mutex.h>
24060  #include <linux/slab.h>
24061 +#include <linux/locallock.h>
24062  #include <linux/rcupdate.h>
24063  #include <net/net_namespace.h>
24064  #include <net/sock.h>
24065  
24066  #include "nf_internals.h"
24067  
24068 +#ifdef CONFIG_PREEMPT_RT_BASE
24069 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
24070 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
24071 +#endif
24072 +
24073  static DEFINE_MUTEX(afinfo_mutex);
24074  
24075  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
24076 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
24077 index dd2332390c45..f6a703b25b6c 100644
24078 --- a/net/packet/af_packet.c
24079 +++ b/net/packet/af_packet.c
24080 @@ -63,6 +63,7 @@
24081  #include <linux/if_packet.h>
24082  #include <linux/wireless.h>
24083  #include <linux/kernel.h>
24084 +#include <linux/delay.h>
24085  #include <linux/kmod.h>
24086  #include <linux/slab.h>
24087  #include <linux/vmalloc.h>
24088 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
24089         if (BLOCK_NUM_PKTS(pbd)) {
24090                 while (atomic_read(&pkc->blk_fill_in_prog)) {
24091                         /* Waiting for skb_copy_bits to finish... */
24092 -                       cpu_relax();
24093 +                       cpu_chill();
24094                 }
24095         }
24096  
24097 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
24098                 if (!(status & TP_STATUS_BLK_TMO)) {
24099                         while (atomic_read(&pkc->blk_fill_in_prog)) {
24100                                 /* Waiting for skb_copy_bits to finish... */
24101 -                               cpu_relax();
24102 +                               cpu_chill();
24103                         }
24104                 }
24105                 prb_close_block(pkc, pbd, po, status);
24106 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
24107 index 977f69886c00..f3e7a36b0396 100644
24108 --- a/net/rds/ib_rdma.c
24109 +++ b/net/rds/ib_rdma.c
24110 @@ -34,6 +34,7 @@
24111  #include <linux/slab.h>
24112  #include <linux/rculist.h>
24113  #include <linux/llist.h>
24114 +#include <linux/delay.h>
24115  
24116  #include "rds_single_path.h"
24117  #include "ib_mr.h"
24118 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
24119         for_each_online_cpu(cpu) {
24120                 flag = &per_cpu(clean_list_grace, cpu);
24121                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
24122 -                       cpu_relax();
24123 +                       cpu_chill();
24124         }
24125  }
24126  
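
Editor's note on the af_packet and RDS hunks above: both replace cpu_relax() busy-waits with cpu_chill() and add <linux/delay.h>, where the helper is declared. Spinning until a flag clears is unsafe on RT when the task that must clear it can be preempted by the spinner; cpu_chill() is expected to sleep briefly on RT and degrade to cpu_relax() on !RT. A hypothetical caller showing the converted loop shape (wait_for_flag() is illustrative, not from this diff):

#include <linux/atomic.h>
#include <linux/delay.h>

static void wait_for_flag(const atomic_t *busy)
{
	while (atomic_read(busy)) {
		/* Was: cpu_relax() -- could starve a preempted owner on RT. */
		cpu_chill();
	}
}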
24127 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
24128 index 7d921e56e715..13df56a738e5 100644
24129 --- a/net/rxrpc/security.c
24130 +++ b/net/rxrpc/security.c
24131 @@ -19,9 +19,6 @@
24132  #include <keys/rxrpc-type.h>
24133  #include "ar-internal.h"
24134  
24135 -static LIST_HEAD(rxrpc_security_methods);
24136 -static DECLARE_RWSEM(rxrpc_security_sem);
24137 -
24138  static const struct rxrpc_security *rxrpc_security_types[] = {
24139         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
24140  #ifdef CONFIG_RXKAD
24141 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
24142 index 206dc24add3a..00ea9bde5bb3 100644
24143 --- a/net/sched/sch_api.c
24144 +++ b/net/sched/sch_api.c
24145 @@ -981,7 +981,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
24146                         rcu_assign_pointer(sch->stab, stab);
24147                 }
24148                 if (tca[TCA_RATE]) {
24149 -                       seqcount_t *running;
24150 +                       net_seqlock_t *running;
24151  
24152                         err = -EOPNOTSUPP;
24153                         if (sch->flags & TCQ_F_MQROOT)
24154 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
24155 index 6cfb6e9038c2..20727e1347de 100644
24156 --- a/net/sched/sch_generic.c
24157 +++ b/net/sched/sch_generic.c
24158 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
24159         .ops            =       &noop_qdisc_ops,
24160         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
24161         .dev_queue      =       &noop_netdev_queue,
24162 +#ifdef CONFIG_PREEMPT_RT_BASE
24163 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
24164 +#else
24165         .running        =       SEQCNT_ZERO(noop_qdisc.running),
24166 +#endif
24167         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
24168  };
24169  EXPORT_SYMBOL(noop_qdisc);
24170 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
24171         lockdep_set_class(&sch->busylock,
24172                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
24173  
24174 +#ifdef CONFIG_PREEMPT_RT_BASE
24175 +       seqlock_init(&sch->running);
24176 +       lockdep_set_class(&sch->running.seqcount,
24177 +                         dev->qdisc_running_key ?: &qdisc_running_key);
24178 +       lockdep_set_class(&sch->running.lock,
24179 +                         dev->qdisc_running_key ?: &qdisc_running_key);
24180 +#else
24181         seqcount_init(&sch->running);
24182         lockdep_set_class(&sch->running,
24183                           dev->qdisc_running_key ?: &qdisc_running_key);
24184 +#endif
24185  
24186         sch->ops = ops;
24187         sch->enqueue = ops->enqueue;
24188 @@ -925,7 +937,7 @@ void dev_deactivate_many(struct list_head *head)
24189         /* Wait for outstanding qdisc_run calls. */
24190         list_for_each_entry(dev, head, close_list)
24191                 while (some_qdisc_is_busy(dev))
24192 -                       yield();
24193 +                       msleep(1);
24194  }
24195  
24196  void dev_deactivate(struct net_device *dev)
24197 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
24198 index 3bc1d61694cb..480141d45f49 100644
24199 --- a/net/sunrpc/svc_xprt.c
24200 +++ b/net/sunrpc/svc_xprt.c
24201 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24202                 goto out;
24203         }
24204  
24205 -       cpu = get_cpu();
24206 +       cpu = get_cpu_light();
24207         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
24208  
24209         atomic_long_inc(&pool->sp_stats.packets);
24210 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24211  
24212                 atomic_long_inc(&pool->sp_stats.threads_woken);
24213                 wake_up_process(rqstp->rq_task);
24214 -               put_cpu();
24215 +               put_cpu_light();
24216                 goto out;
24217         }
24218         rcu_read_unlock();
24219 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24220                 goto redo_search;
24221         }
24222         rqstp = NULL;
24223 -       put_cpu();
24224 +       put_cpu_light();
24225  out:
24226         trace_svc_xprt_do_enqueue(xprt, rqstp);
24227  }
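
Editor's note on the sunrpc change above: get_cpu()/put_cpu() disable preemption outright, but svc_pool_for_cpu() only needs a stable CPU number. On RT, get_cpu_light()/put_cpu_light() are commonly built on migrate_disable()/migrate_enable(); the definition below is a hedged sketch, not the literal one from this patch:

#include <linux/preempt.h>
#include <linux/smp.h>

#ifdef CONFIG_PREEMPT_RT_FULL
/* Keep the task on this CPU but leave the section preemptible. */
# define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()	migrate_enable()
#else
/* Without RT, fall back to the stock helpers. */
# define get_cpu_light()	get_cpu()
# define put_cpu_light()	put_cpu()
#endif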
24228 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
24229 index 6fdc97ef6023..523e0420d7f0 100755
24230 --- a/scripts/mkcompile_h
24231 +++ b/scripts/mkcompile_h
24232 @@ -4,7 +4,8 @@ TARGET=$1
24233  ARCH=$2
24234  SMP=$3
24235  PREEMPT=$4
24236 -CC=$5
24237 +RT=$5
24238 +CC=$6
24239  
24240  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
24241  
24242 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
24243  CONFIG_FLAGS=""
24244  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
24245  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
24246 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
24247  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
24248  
24249  # Truncate to maximum length
24250 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
24251 index 9d33c1e85c79..3d307bda86f9 100644
24252 --- a/sound/core/pcm_native.c
24253 +++ b/sound/core/pcm_native.c
24254 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
24255  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
24256  {
24257         if (!substream->pcm->nonatomic)
24258 -               local_irq_disable();
24259 +               local_irq_disable_nort();
24260         snd_pcm_stream_lock(substream);
24261  }
24262  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
24263 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
24264  {
24265         snd_pcm_stream_unlock(substream);
24266         if (!substream->pcm->nonatomic)
24267 -               local_irq_enable();
24268 +               local_irq_enable_nort();
24269  }
24270  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
24271  
24272 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
24273  {
24274         unsigned long flags = 0;
24275         if (!substream->pcm->nonatomic)
24276 -               local_irq_save(flags);
24277 +               local_irq_save_nort(flags);
24278         snd_pcm_stream_lock(substream);
24279         return flags;
24280  }
24281 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
24282  {
24283         snd_pcm_stream_unlock(substream);
24284         if (!substream->pcm->nonatomic)
24285 -               local_irq_restore(flags);
24286 +               local_irq_restore_nort(flags);
24287  }
24288  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
24289  
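
Editor's note on the pcm_native hunks above: the *_nort() helpers leave interrupts untouched on RT, where the PCM stream lock is a sleeping lock anyway, and behave exactly like the plain local_irq_*() calls on !RT. A rough sketch of what such wrappers look like, assuming the irqflags additions made earlier in this patch (exact definitions may differ):

#ifdef CONFIG_PREEMPT_RT_FULL
/* RT: do not disable interrupts; keep the flags plumbing intact. */
# define local_irq_disable_nort()	do { } while (0)
# define local_irq_enable_nort()	do { } while (0)
# define local_irq_save_nort(flags)	local_save_flags(flags)
# define local_irq_restore_nort(flags)	(void)(flags)
#else
/* !RT: identical to the unsuffixed variants. */
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif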