git.pld-linux.org Git - packages/kernel.git / kernel-rt.patch (RT variant added)
1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
9 +
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems,
14 +however there is nothing x86 specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
17 +
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
25 +
26 +The hardware latency detector works by hogging all of the CPUs for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted; since the machine is stopped and
30 +interrupts are turned off, the only thing that could do that would be an SMI.
31 +
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
35 +
36 +Usage:
37 +------
38 +
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or by
40 +toggling on the "enable" entry in the "hwlat_detector" debugfs directory) is the only
41 +step required to start the hwlat_detector. It is possible to redefine the
42 +threshold in microseconds (us) above which latency spikes will be taken
43 +into account (parameter "threshold=").
44 +
45 +Example:
46 +
47 +       # modprobe hwlat_detector enabled=1 threshold=100
48 +
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary
51 +to have debugfs mounted, which might be on /sys/debug on your system.
52 +
53 +The /debug/hwlat_detector interface contains the following files:
54 +
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
65 +
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
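A minimal sketch of a complete session with the interface described above, assuming debugfs is mounted at /sys/kernel/debug (adjust the path if it is mounted at /debug):

  modprobe hwlat_detector                  # load without sampling yet
  D=/sys/kernel/debug/hwlat_detector
  echo 100     > $D/threshold              # ignore spikes below 100 usecs
  echo 500000  > $D/width                  # hold the CPUs for 0.5s ...
  echo 1000000 > $D/window                 # ... out of every 1.0s
  echo 1       > $D/enable                 # start sampling (resets count)
  cat $D/sample                            # blocks; prints "<timestamp> <latency usecs>"
  # later, from another shell:
  cat $D/count $D/max                      # spikes seen and worst latency so far
  echo 0 > $D/enable                       # stop sampling
  rmmod hwlat_detector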
71 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
72 index 0e4102ae1a61..26b5f39d57a8 100644
73 --- a/Documentation/kernel-parameters.txt
74 +++ b/Documentation/kernel-parameters.txt
75 @@ -1629,6 +1629,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
76         ip=             [IP_PNP]
77                         See Documentation/filesystems/nfs/nfsroot.txt.
78  
79 +       irqaffinity=    [SMP] Set the default irq affinity mask
80 +                       Format:
81 +                       <cpu number>,...,<cpu number>
82 +                       or
83 +                       <cpu number>-<cpu number>
84 +                       (must be a positive range in ascending order)
85 +                       or a mixture
86 +                       <cpu number>,...,<cpu number>-<cpu number>
87 +
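As an illustration of the irqaffinity= parameter added above (a sketch only; grub file locations and the regeneration command differ between distributions), restricting the default IRQ affinity to CPUs 0-2 and 4 might look like:

  # /etc/default/grub -- append the parameter to the kernel command line
  GRUB_CMDLINE_LINUX="... irqaffinity=0-2,4"

  # regenerate the grub configuration, then reboot
  grub-mkconfig -o /boot/grub/grub.cfg

  # after the reboot, the resulting default mask is visible here:
  cat /proc/irq/default_smp_affinity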
88         irqfixup        [HW]
89                         When an interrupt is not handled search all handlers
90                         for it. Intended to get systems with badly broken
91 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
92 index 13f5619b2203..f64d075ba647 100644
93 --- a/Documentation/sysrq.txt
94 +++ b/Documentation/sysrq.txt
95 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
96  On other - If you know of the key combos for other architectures, please
97             let me know so I can add them to this section.
98  
99 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
100 -
101 +On all -  write a character to /proc/sysrq-trigger, e.g.:
102                 echo t > /proc/sysrq-trigger
103  
104 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
105 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
106 +        Send an ICMP echo request with this pattern plus the particular
107 +        SysRq command key. Example:
108 +               # ping -c1 -s57 -p0102030468
109 +        will trigger the SysRq-H (help) command.
110 +
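A hedged sketch of the flow described above: the last byte of the ping pattern is the ASCII code of the SysRq command key (0x68 = 'h' in the example, 0x74 = 't' below), and <target> stands in for the machine on which the cookie was set:

  # on the target machine: arm the cookie
  echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq

  # from another host: cookie 01 02 03 04 followed by 0x74 ('t')
  # triggers SysRq-T (dump task states) on the target
  ping -c1 -s57 -p0102030474 <target>

  # the SysRq output lands in the target's kernel log
  dmesg | tail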
111 +
112  *  What are the 'command' keys?
113  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114  'b'     - Will immediately reboot the system without syncing or unmounting
115 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
116 new file mode 100644
117 index 000000000000..6f2aeabf7faa
118 --- /dev/null
119 +++ b/Documentation/trace/histograms.txt
120 @@ -0,0 +1,186 @@
121 +               Using the Linux Kernel Latency Histograms
122 +
123 +
124 +This document gives a short explanation of how to enable, configure and use
125 +latency histograms. Latency histograms are primarily relevant in the
126 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
127 +and are used in the quality management of the Linux real-time
128 +capabilities.
129 +
130 +
131 +* Purpose of latency histograms
132 +
133 +A latency histogram continuously accumulates the frequencies of latency
134 +data. There are two types of histograms:
135 +- potential sources of latencies
136 +- effective latencies
137 +
138 +
139 +* Potential sources of latencies
140 +
141 +Potential sources of latencies are code segments where interrupts,
142 +preemption or both are disabled (aka critical sections). To create
143 +histograms of potential sources of latency, the kernel stores the time
144 +stamp at the start of a critical section, determines the time elapsed
145 +when the end of the section is reached, and increments the frequency
146 +counter of that latency value - irrespective of whether any concurrently
147 +running process is affected by latency or not.
148 +- Configuration items (in the Kernel hacking/Tracers submenu)
149 +  CONFIG_INTERRUPT_OFF_LATENCY
150 +  CONFIG_PREEMPT_OFF_LATENCY
151 +
152 +
153 +* Effective latencies
154 +
155 +Effective latencies are those actually occurring during wakeup of a process. To
156 +determine effective latencies, the kernel stores the time stamp when a
157 +process is scheduled to be woken up, and determines the duration of the
158 +wakeup time shortly before control is passed over to this process. Note
159 +that the apparent latency in user space may be somewhat longer, since the
160 +process may be interrupted after control is passed over to it but before
161 +the execution in user space takes place. Simply measuring the interval
162 +between enqueuing and wakeup may also not be appropriate in cases when a
163 +process is scheduled as a result of a timer expiration. The timer may have
164 +missed its deadline, e.g. due to disabled interrupts, but this latency
165 +would not be registered. Therefore, the offsets of missed timers are
166 +recorded in a separate histogram. If both wakeup latency and missed timer
167 +offsets are configured and enabled, a third histogram may be enabled that
168 +records the overall latency as a sum of the timer latency, if any, and the
169 +wakeup latency. This histogram is called "timerandwakeup".
170 +- Configuration items (in the Kernel hacking/Tracers submenu)
171 +  CONFIG_WAKEUP_LATENCY
172 +  CONFIG_MISSED_TIMER_OFFSETS
173 +
174 +
175 +* Usage
176 +
177 +The interface to the administration of the latency histograms is located
178 +in the debugfs file system. To mount it, either enter
179 +
180 +mount -t sysfs nodev /sys
181 +mount -t debugfs nodev /sys/kernel/debug
182 +
183 +from shell command line level, or add
184 +
185 +nodev  /sys                    sysfs   defaults        0 0
186 +nodev  /sys/kernel/debug       debugfs defaults        0 0
187 +
188 +to the file /etc/fstab. All latency histogram related files are then
189 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
190 +particular histogram type is enabled by writing non-zero to the related
191 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
192 +Select "preemptirqsoff" for the histograms of potential sources of
193 +latencies and "wakeup" for histograms of effective latencies etc. The
194 +histogram data - one per CPU - are available in the files
195 +
196 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
197 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
198 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
199 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
200 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
201 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
202 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
203 +
204 +The histograms are reset by writing non-zero to the file "reset" in a
205 +particular latency directory. To reset all latency data, use
206 +
207 +#!/bin/sh
208 +
209 +TRACINGDIR=/sys/kernel/debug/tracing
210 +HISTDIR=$TRACINGDIR/latency_hist
211 +
212 +if test -d $HISTDIR
213 +then
214 +  cd $HISTDIR
215 +  for i in `find . | grep /reset$`
216 +  do
217 +    echo 1 >$i
218 +  done
219 +fi
220 +
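Putting the above together, a minimal session might look like the following sketch, assuming the corresponding CONFIG_* options are built in and debugfs is mounted as described:

  LH=/sys/kernel/debug/tracing/latency_hist
  echo 1 > $LH/enable/wakeup         # start accumulating wakeup latencies
  # ... run the workload of interest for a while ...
  head -20 $LH/wakeup/CPU0           # inspect the histogram of CPU0
  echo 0 > $LH/enable/wakeup         # stop accumulating
  echo 1 > $LH/wakeup/reset          # discard the collected data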
221 +
222 +* Data format
223 +
224 +Latency data are stored with a resolution of one microsecond. The
225 +maximum latency is 10,240 microseconds. The data are only valid if the
226 +overflow register is empty. Every output line contains the latency in
227 +microseconds in the first column and the number of samples in the second
228 +column. To display only lines with a positive latency count, use, for
229 +example,
230 +
231 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
232 +
233 +#Minimum latency: 0 microseconds.
234 +#Average latency: 0 microseconds.
235 +#Maximum latency: 25 microseconds.
236 +#Total samples: 3104770694
237 +#There are 0 samples greater or equal than 10240 microseconds
238 +#usecs          samples
239 +    0        2984486876
240 +    1          49843506
241 +    2          58219047
242 +    3           5348126
243 +    4           2187960
244 +    5           3388262
245 +    6            959289
246 +    7            208294
247 +    8             40420
248 +    9              4485
249 +   10             14918
250 +   11             18340
251 +   12             25052
252 +   13             19455
253 +   14              5602
254 +   15               969
255 +   16                47
256 +   17                18
257 +   18                14
258 +   19                 1
259 +   20                 3
260 +   21                 2
261 +   22                 5
262 +   23                 2
263 +   25                 1
264 +
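Building on this format, the per-CPU files lend themselves to simple post-processing. A sketch that reports the fraction of samples at or above a given latency (the awk fields assume exactly the two-column layout shown above):

  # percentage of samples with a latency of 10 usecs or more on CPU0
  awk '!/^#/ { total += $2; if ($1 >= 10) high += $2 }
       END   { if (total) printf "%.6f%%\n", 100 * high / total }' \
      /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0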
265 +
266 +* Wakeup latency of a selected process
267 +
268 +To only collect wakeup latency data of a particular process, write the
269 +PID of the requested process to
270 +
271 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
272 +
273 +PIDs are not considered if this variable is set to 0.
274 +
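For example, to account only the wakeups of a single task (the application name below is purely a placeholder):

  PIDFILE=/sys/kernel/debug/tracing/latency_hist/wakeup/pid

  pidof -s my_rt_app > $PIDFILE      # hypothetical application; any PID works
  cat /sys/kernel/debug/tracing/latency_hist/wakeup/CPU*
  echo 0 > $PIDFILE                  # 0 = consider all PIDs again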
275 +
276 +* Details of the process with the highest wakeup latency so far
277 +
278 +Selected data of the process that suffered from the highest wakeup
279 +latency that occurred on a particular CPU are available in the file
280 +
281 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
282 +
283 +In addition, other relevant system data at the time when the
284 +latency occurred are given.
285 +
286 +The format of the data is (all in one line):
287 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
288 +<- <PID> <Priority> <Command> <Timestamp>
289 +
290 +The value of <Timeroffset> is only relevant in the combined timer
291 +and wakeup latency recording. In the wakeup recording, it is
292 +always 0; in the missed_timer_offsets recording, it is the same
293 +as <Latency>.
294 +
295 +When retrospectively searching for the origin of a latency and
296 +tracing was not enabled, it may be helpful to know the name and
297 +some basic data of the task that (finally) switched to the
298 +late real-time task. In addition to the victim's data, the
299 +data of the possible culprit are therefore also displayed after the
300 +"<-" symbol.
301 +
302 +Finally, the timestamp of the time when the latency occurred
303 +in <seconds>.<microseconds> after the most recent system boot
304 +is provided.
305 +
306 +These data are also reset when the wakeup histogram is reset.
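Since one such file exists per CPU, the worst offender across the whole machine can be picked out with a short pipeline; with the filename prefix added by grep, <Latency> ends up in the third column:

  cd /sys/kernel/debug/tracing/latency_hist/wakeup
  grep -H . max_latency-CPU*                           # one record per CPU
  grep -H . max_latency-CPU* | sort -k3 -n | tail -1   # highest latency seen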
307 diff --git a/Makefile b/Makefile
308 index 95421b688f23..336590d1c969 100644
309 --- a/Makefile
310 +++ b/Makefile
311 @@ -783,6 +783,9 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=strict-prototypes)
312  # Prohibit date/time macros, which would make the build non-deterministic
313  KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
314  
315 +# enforce correct pointer usage
316 +KBUILD_CFLAGS   += $(call cc-option,-Werror=incompatible-pointer-types)
317 +
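For context, the class of mistake that this flag turns from a warning into a build failure can be reproduced stand-alone (a sketch, not taken from the kernel tree; cc-option simply drops the flag on compilers that do not support it):

  printf 'void takes_int(int *p);\nvoid caller(long *q) { takes_int(q); }\n' > ptr-demo.c
  gcc -c -Werror=incompatible-pointer-types ptr-demo.c   # fails: incompatible pointer type is now an error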
318  # use the deterministic mode of AR if available
319  KBUILD_ARFLAGS := $(call ar-option,D)
320  
321 diff --git a/arch/Kconfig b/arch/Kconfig
322 index 4e949e58b192..3b26d76933fb 100644
323 --- a/arch/Kconfig
324 +++ b/arch/Kconfig
325 @@ -9,6 +9,7 @@ config OPROFILE
326         tristate "OProfile system profiling"
327         depends on PROFILING
328         depends on HAVE_OPROFILE
329 +       depends on !PREEMPT_RT_FULL
330         select RING_BUFFER
331         select RING_BUFFER_ALLOW_SWAP
332         help
333 @@ -52,6 +53,7 @@ config KPROBES
334  config JUMP_LABEL
335         bool "Optimize very unlikely/likely branches"
336         depends on HAVE_ARCH_JUMP_LABEL
337 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
338         help
339           This option enables a transparent branch optimization that
340          makes certain almost-always-true or almost-always-false branch
341 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
342 index 34e1569a11ee..79c4603e9453 100644
343 --- a/arch/arm/Kconfig
344 +++ b/arch/arm/Kconfig
345 @@ -33,7 +33,7 @@ config ARM
346         select HARDIRQS_SW_RESEND
347         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
348         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
349 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
350 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !PREEMPT_RT_BASE
351         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
352         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
353         select HAVE_ARCH_TRACEHOOK
354 @@ -68,6 +68,7 @@ config ARM
355         select HAVE_PERF_EVENTS
356         select HAVE_PERF_REGS
357         select HAVE_PERF_USER_STACK_DUMP
358 +       select HAVE_PREEMPT_LAZY
359         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
360         select HAVE_REGS_AND_STACK_ACCESS_API
361         select HAVE_SYSCALL_TRACEPOINTS
362 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
363 index 12ebfcc1d539..c962084605bc 100644
364 --- a/arch/arm/include/asm/switch_to.h
365 +++ b/arch/arm/include/asm/switch_to.h
366 @@ -3,6 +3,13 @@
367  
368  #include <linux/thread_info.h>
369  
370 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
371 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
372 +#else
373 +static inline void
374 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
375 +#endif
376 +
377  /*
378   * For v7 SMP cores running a preemptible kernel we may be pre-empted
379   * during a TLB maintenance operation, so execute an inner-shareable dsb
380 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
381  #define switch_to(prev,next,last)                                      \
382  do {                                                                   \
383         __complete_pending_tlbi();                                      \
384 +       switch_kmaps(prev, next);                                       \
385         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
386  } while (0)
387  
388 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
389 index 776757d1604a..1f36a4eccc72 100644
390 --- a/arch/arm/include/asm/thread_info.h
391 +++ b/arch/arm/include/asm/thread_info.h
392 @@ -49,6 +49,7 @@ struct cpu_context_save {
393  struct thread_info {
394         unsigned long           flags;          /* low level flags */
395         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
396 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
397         mm_segment_t            addr_limit;     /* address limit */
398         struct task_struct      *task;          /* main task structure */
399         __u32                   cpu;            /* cpu */
400 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
401  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
402  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
403  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
404 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
405 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
406 +#define TIF_NEED_RESCHED_LAZY  7
407  
408  #define TIF_NOHZ               12      /* in adaptive nohz mode */
409  #define TIF_USING_IWMMXT       17
410 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
411  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
412  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
413  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
414 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
415  #define _TIF_UPROBE            (1 << TIF_UPROBE)
416  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
417  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
418 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
419   * Change these and you break ASM code in entry-common.S
420   */
421  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
422 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
423 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
424 +                                _TIF_NEED_RESCHED_LAZY)
425  
426  #endif /* __KERNEL__ */
427  #endif /* __ASM_ARM_THREAD_INFO_H */
428 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
429 index 871b8267d211..4dbe70de7318 100644
430 --- a/arch/arm/kernel/asm-offsets.c
431 +++ b/arch/arm/kernel/asm-offsets.c
432 @@ -65,6 +65,7 @@ int main(void)
433    BLANK();
434    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
435    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
436 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
437    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
438    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
439    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
440 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
441 index 3ce377f7251f..d044cea59f54 100644
442 --- a/arch/arm/kernel/entry-armv.S
443 +++ b/arch/arm/kernel/entry-armv.S
444 @@ -215,11 +215,18 @@ __irq_svc:
445  #ifdef CONFIG_PREEMPT
446         get_thread_info tsk
447         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
448 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
449         teq     r8, #0                          @ if preempt count != 0
450 +       bne     1f                              @ return from exception
451 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
452 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
453 +       blne    svc_preempt                     @ preempt!
454 +
455 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
456 +       teq     r8, #0                          @ if preempt lazy count != 0
457         movne   r0, #0                          @ force flags to 0
458 -       tst     r0, #_TIF_NEED_RESCHED
459 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
460         blne    svc_preempt
461 +1:
462  #endif
463  
464         svc_exit r5, irq = 1                    @ return from exception
465 @@ -234,8 +241,14 @@ svc_preempt:
466  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
467         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
468         tst     r0, #_TIF_NEED_RESCHED
469 +       bne     1b
470 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
471         reteq   r8                              @ go again
472 -       b       1b
473 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
474 +       teq     r0, #0                          @ if preempt lazy count != 0
475 +       beq     1b
476 +       ret     r8                              @ go again
477 +
478  #endif
479  
480  __und_fault:
481 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
482 index 30a7228eaceb..c3bd6cbfce4b 100644
483 --- a/arch/arm/kernel/entry-common.S
484 +++ b/arch/arm/kernel/entry-common.S
485 @@ -36,7 +36,9 @@ ret_fast_syscall:
486   UNWIND(.cantunwind    )
487         disable_irq_notrace                     @ disable interrupts
488         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
489 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
490 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
491 +       bne     fast_work_pending
492 +       tst     r1, #_TIF_SECCOMP
493         bne     fast_work_pending
494  
495         /* perform architecture specific actions before user return */
496 @@ -62,8 +64,11 @@ ret_fast_syscall:
497         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
498         disable_irq_notrace                     @ disable interrupts
499         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
500 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
501 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
502 +       bne     do_slower_path
503 +       tst     r1, #_TIF_SECCOMP
504         beq     no_work_pending
505 +do_slower_path:
506   UNWIND(.fnend         )
507  ENDPROC(ret_fast_syscall)
508  
509 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
510 index 4adfb46e3ee9..15f1d94b47c5 100644
511 --- a/arch/arm/kernel/process.c
512 +++ b/arch/arm/kernel/process.c
513 @@ -319,6 +319,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
514  }
515  
516  #ifdef CONFIG_MMU
517 +/*
518 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
519 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
520 + * fail.
521 + */
522 +static int __init vectors_user_mapping_init_page(void)
523 +{
524 +       struct page *page;
525 +       unsigned long addr = 0xffff0000;
526 +       pgd_t *pgd;
527 +       pud_t *pud;
528 +       pmd_t *pmd;
529 +
530 +       pgd = pgd_offset_k(addr);
531 +       pud = pud_offset(pgd, addr);
532 +       pmd = pmd_offset(pud, addr);
533 +       page = pmd_page(*(pmd));
534 +
535 +       pgtable_page_ctor(page);
536 +
537 +       return 0;
538 +}
539 +late_initcall(vectors_user_mapping_init_page);
540 +
541  #ifdef CONFIG_KUSER_HELPERS
542  /*
543   * The vectors page is always readable from user space for the
544 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
545 index 7b8f2141427b..96541e00b74a 100644
546 --- a/arch/arm/kernel/signal.c
547 +++ b/arch/arm/kernel/signal.c
548 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
549          */
550         trace_hardirqs_off();
551         do {
552 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
553 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
554 +                                          _TIF_NEED_RESCHED_LAZY))) {
555                         schedule();
556                 } else {
557                         if (unlikely(!user_mode(regs)))
558 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
559 index b26361355dae..e5754e3b03c4 100644
560 --- a/arch/arm/kernel/smp.c
561 +++ b/arch/arm/kernel/smp.c
562 @@ -230,8 +230,6 @@ int __cpu_disable(void)
563         flush_cache_louis();
564         local_flush_tlb_all();
565  
566 -       clear_tasks_mm_cpumask(cpu);
567 -
568         return 0;
569  }
570  
571 @@ -247,6 +245,9 @@ void __cpu_die(unsigned int cpu)
572                 pr_err("CPU%u: cpu didn't die\n", cpu);
573                 return;
574         }
575 +
576 +       clear_tasks_mm_cpumask(cpu);
577 +
578         pr_notice("CPU%u: shutdown\n", cpu);
579  
580         /*
581 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
582 index 0bee233fef9a..314cfb232a63 100644
583 --- a/arch/arm/kernel/unwind.c
584 +++ b/arch/arm/kernel/unwind.c
585 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
586  static const struct unwind_idx *__origin_unwind_idx;
587  extern const struct unwind_idx __stop_unwind_idx[];
588  
589 -static DEFINE_SPINLOCK(unwind_lock);
590 +static DEFINE_RAW_SPINLOCK(unwind_lock);
591  static LIST_HEAD(unwind_tables);
592  
593  /* Convert a prel31 symbol to an absolute address */
594 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
595                 /* module unwind tables */
596                 struct unwind_table *table;
597  
598 -               spin_lock_irqsave(&unwind_lock, flags);
599 +               raw_spin_lock_irqsave(&unwind_lock, flags);
600                 list_for_each_entry(table, &unwind_tables, list) {
601                         if (addr >= table->begin_addr &&
602                             addr < table->end_addr) {
603 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
604                                 break;
605                         }
606                 }
607 -               spin_unlock_irqrestore(&unwind_lock, flags);
608 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
609         }
610  
611         pr_debug("%s: idx = %p\n", __func__, idx);
612 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
613         tab->begin_addr = text_addr;
614         tab->end_addr = text_addr + text_size;
615  
616 -       spin_lock_irqsave(&unwind_lock, flags);
617 +       raw_spin_lock_irqsave(&unwind_lock, flags);
618         list_add_tail(&tab->list, &unwind_tables);
619 -       spin_unlock_irqrestore(&unwind_lock, flags);
620 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
621  
622         return tab;
623  }
624 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
625         if (!tab)
626                 return;
627  
628 -       spin_lock_irqsave(&unwind_lock, flags);
629 +       raw_spin_lock_irqsave(&unwind_lock, flags);
630         list_del(&tab->list);
631 -       spin_unlock_irqrestore(&unwind_lock, flags);
632 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
633  
634         kfree(tab);
635  }
636 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
637 index d7bef2144760..36a3e51492f7 100644
638 --- a/arch/arm/kvm/arm.c
639 +++ b/arch/arm/kvm/arm.c
640 @@ -496,18 +496,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
641         struct kvm_vcpu *vcpu;
642  
643         kvm_for_each_vcpu(i, vcpu, kvm) {
644 -               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
645 +               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
646  
647                 vcpu->arch.pause = false;
648 -               wake_up_interruptible(wq);
649 +               swake_up(wq);
650         }
651  }
652  
653  static void vcpu_sleep(struct kvm_vcpu *vcpu)
654  {
655 -       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
656 +       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
657  
658 -       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
659 +       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
660                                        (!vcpu->arch.pause)));
661  }
662  
663 @@ -566,7 +566,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
664                  * involves poking the GIC, which must be done in a
665                  * non-preemptible context.
666                  */
667 -               preempt_disable();
668 +               migrate_disable();
669                 kvm_timer_flush_hwstate(vcpu);
670                 kvm_vgic_flush_hwstate(vcpu);
671  
672 @@ -585,7 +585,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
673                         local_irq_enable();
674                         kvm_timer_sync_hwstate(vcpu);
675                         kvm_vgic_sync_hwstate(vcpu);
676 -                       preempt_enable();
677 +                       migrate_enable();
678                         continue;
679                 }
680  
681 @@ -639,7 +639,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
682  
683                 kvm_vgic_sync_hwstate(vcpu);
684  
685 -               preempt_enable();
686 +               migrate_enable();
687  
688                 ret = handle_exit(vcpu, run, ret);
689         }
690 diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
691 index a9b3b905e661..c2b131527a64 100644
692 --- a/arch/arm/kvm/psci.c
693 +++ b/arch/arm/kvm/psci.c
694 @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
695  {
696         struct kvm *kvm = source_vcpu->kvm;
697         struct kvm_vcpu *vcpu = NULL;
698 -       wait_queue_head_t *wq;
699 +       struct swait_queue_head *wq;
700         unsigned long cpu_id;
701         unsigned long context_id;
702         phys_addr_t target_pc;
703 @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
704         smp_mb();               /* Make sure the above is visible */
705  
706         wq = kvm_arch_vcpu_wq(vcpu);
707 -       wake_up_interruptible(wq);
708 +       swake_up(wq);
709  
710         return PSCI_RET_SUCCESS;
711  }
712 diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
713 index 28656c2b54a0..3f501305ca26 100644
714 --- a/arch/arm/mach-at91/Kconfig
715 +++ b/arch/arm/mach-at91/Kconfig
716 @@ -99,6 +99,7 @@ config HAVE_AT91_USB_CLK
717  config COMMON_CLK_AT91
718         bool
719         select COMMON_CLK
720 +       select MFD_SYSCON
721  
722  config HAVE_AT91_SMD
723         bool
724 diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
725 index c1a7c6cc00e1..63b4fa25b48a 100644
726 --- a/arch/arm/mach-at91/at91rm9200.c
727 +++ b/arch/arm/mach-at91/at91rm9200.c
728 @@ -12,7 +12,6 @@
729  #include <linux/of_platform.h>
730  
731  #include <asm/mach/arch.h>
732 -#include <asm/system_misc.h>
733  
734  #include "generic.h"
735  #include "soc.h"
736 @@ -33,7 +32,6 @@ static void __init at91rm9200_dt_device_init(void)
737  
738         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
739  
740 -       arm_pm_idle = at91rm9200_idle;
741         at91rm9200_pm_init();
742  }
743  
744 diff --git a/arch/arm/mach-at91/at91sam9.c b/arch/arm/mach-at91/at91sam9.c
745 index 7eb64f763034..cada2a6412b3 100644
746 --- a/arch/arm/mach-at91/at91sam9.c
747 +++ b/arch/arm/mach-at91/at91sam9.c
748 @@ -62,8 +62,6 @@ static void __init at91sam9_common_init(void)
749                 soc_dev = soc_device_to_device(soc);
750  
751         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
752 -
753 -       arm_pm_idle = at91sam9_idle;
754  }
755  
756  static void __init at91sam9_dt_device_init(void)
757 diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
758 index b0fa7dc7286d..28ca57a2060f 100644
759 --- a/arch/arm/mach-at91/generic.h
760 +++ b/arch/arm/mach-at91/generic.h
761 @@ -11,27 +11,18 @@
762  #ifndef _AT91_GENERIC_H
763  #define _AT91_GENERIC_H
764  
765 -#include <linux/of.h>
766 -#include <linux/reboot.h>
767 -
768 - /* Map io */
769 -extern void __init at91_map_io(void);
770 -extern void __init at91_alt_map_io(void);
771 -
772 -/* idle */
773 -extern void at91rm9200_idle(void);
774 -extern void at91sam9_idle(void);
775 -
776  #ifdef CONFIG_PM
777  extern void __init at91rm9200_pm_init(void);
778  extern void __init at91sam9260_pm_init(void);
779  extern void __init at91sam9g45_pm_init(void);
780  extern void __init at91sam9x5_pm_init(void);
781 +extern void __init sama5_pm_init(void);
782  #else
783  static inline void __init at91rm9200_pm_init(void) { }
784  static inline void __init at91sam9260_pm_init(void) { }
785  static inline void __init at91sam9g45_pm_init(void) { }
786  static inline void __init at91sam9x5_pm_init(void) { }
787 +static inline void __init sama5_pm_init(void) { }
788  #endif
789  
790  #endif /* _AT91_GENERIC_H */
791 diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
792 index 23726fb31741..f06270198bf1 100644
793 --- a/arch/arm/mach-at91/pm.c
794 +++ b/arch/arm/mach-at91/pm.c
795 @@ -31,10 +31,13 @@
796  #include <asm/mach/irq.h>
797  #include <asm/fncpy.h>
798  #include <asm/cacheflush.h>
799 +#include <asm/system_misc.h>
800  
801  #include "generic.h"
802  #include "pm.h"
803  
804 +static void __iomem *pmc;
805 +
806  /*
807   * FIXME: this is needed to communicate between the pinctrl driver and
808   * the PM implementation in the machine. Possibly part of the PM
809 @@ -87,7 +90,7 @@ static int at91_pm_verify_clocks(void)
810         unsigned long scsr;
811         int i;
812  
813 -       scsr = at91_pmc_read(AT91_PMC_SCSR);
814 +       scsr = readl(pmc + AT91_PMC_SCSR);
815  
816         /* USB must not be using PLLB */
817         if ((scsr & at91_pm_data.uhp_udp_mask) != 0) {
818 @@ -101,8 +104,7 @@ static int at91_pm_verify_clocks(void)
819  
820                 if ((scsr & (AT91_PMC_PCK0 << i)) == 0)
821                         continue;
822 -
823 -               css = at91_pmc_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
824 +               css = readl(pmc + AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
825                 if (css != AT91_PMC_CSS_SLOW) {
826                         pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
827                         return 0;
828 @@ -145,8 +147,8 @@ static void at91_pm_suspend(suspend_state_t state)
829         flush_cache_all();
830         outer_disable();
831  
832 -       at91_suspend_sram_fn(at91_pmc_base, at91_ramc_base[0],
833 -                               at91_ramc_base[1], pm_data);
834 +       at91_suspend_sram_fn(pmc, at91_ramc_base[0],
835 +                            at91_ramc_base[1], pm_data);
836  
837         outer_resume();
838  }
839 @@ -353,6 +355,21 @@ static __init void at91_dt_ramc(void)
840         at91_pm_set_standby(standby);
841  }
842  
843 +void at91rm9200_idle(void)
844 +{
845 +       /*
846 +        * Disable the processor clock.  The processor will be automatically
847 +        * re-enabled by an interrupt or by a reset.
848 +        */
849 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
850 +}
851 +
852 +void at91sam9_idle(void)
853 +{
854 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
855 +       cpu_do_idle();
856 +}
857 +
858  static void __init at91_pm_sram_init(void)
859  {
860         struct gen_pool *sram_pool;
861 @@ -399,13 +416,36 @@ static void __init at91_pm_sram_init(void)
862                         &at91_pm_suspend_in_sram, at91_pm_suspend_in_sram_sz);
863  }
864  
865 -static void __init at91_pm_init(void)
866 +static const struct of_device_id atmel_pmc_ids[] __initconst = {
867 +       { .compatible = "atmel,at91rm9200-pmc"  },
868 +       { .compatible = "atmel,at91sam9260-pmc" },
869 +       { .compatible = "atmel,at91sam9g45-pmc" },
870 +       { .compatible = "atmel,at91sam9n12-pmc" },
871 +       { .compatible = "atmel,at91sam9x5-pmc" },
872 +       { .compatible = "atmel,sama5d3-pmc" },
873 +       { .compatible = "atmel,sama5d2-pmc" },
874 +       { /* sentinel */ },
875 +};
876 +
877 +static void __init at91_pm_init(void (*pm_idle)(void))
878  {
879 -       at91_pm_sram_init();
880 +       struct device_node *pmc_np;
881  
882         if (at91_cpuidle_device.dev.platform_data)
883                 platform_device_register(&at91_cpuidle_device);
884  
885 +       pmc_np = of_find_matching_node(NULL, atmel_pmc_ids);
886 +       pmc = of_iomap(pmc_np, 0);
887 +       if (!pmc) {
888 +               pr_err("AT91: PM not supported, PMC not found\n");
889 +               return;
890 +       }
891 +
892 +       if (pm_idle)
893 +               arm_pm_idle = pm_idle;
894 +
895 +       at91_pm_sram_init();
896 +
897         if (at91_suspend_sram_fn)
898                 suspend_set_ops(&at91_pm_ops);
899         else
900 @@ -424,7 +464,7 @@ void __init at91rm9200_pm_init(void)
901         at91_pm_data.uhp_udp_mask = AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP;
902         at91_pm_data.memctrl = AT91_MEMCTRL_MC;
903  
904 -       at91_pm_init();
905 +       at91_pm_init(at91rm9200_idle);
906  }
907  
908  void __init at91sam9260_pm_init(void)
909 @@ -432,7 +472,7 @@ void __init at91sam9260_pm_init(void)
910         at91_dt_ramc();
911         at91_pm_data.memctrl = AT91_MEMCTRL_SDRAMC;
912         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
913 -       return at91_pm_init();
914 +       at91_pm_init(at91sam9_idle);
915  }
916  
917  void __init at91sam9g45_pm_init(void)
918 @@ -440,7 +480,7 @@ void __init at91sam9g45_pm_init(void)
919         at91_dt_ramc();
920         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP;
921         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
922 -       return at91_pm_init();
923 +       at91_pm_init(at91sam9_idle);
924  }
925  
926  void __init at91sam9x5_pm_init(void)
927 @@ -448,5 +488,13 @@ void __init at91sam9x5_pm_init(void)
928         at91_dt_ramc();
929         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
930         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
931 -       return at91_pm_init();
932 +       at91_pm_init(at91sam9_idle);
933 +}
934 +
935 +void __init sama5_pm_init(void)
936 +{
937 +       at91_dt_ramc();
938 +       at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
939 +       at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
940 +       at91_pm_init(NULL);
941  }
942 diff --git a/arch/arm/mach-at91/sama5.c b/arch/arm/mach-at91/sama5.c
943 index d9cf6799aec0..df8fdf1cf66d 100644
944 --- a/arch/arm/mach-at91/sama5.c
945 +++ b/arch/arm/mach-at91/sama5.c
946 @@ -51,7 +51,7 @@ static void __init sama5_dt_device_init(void)
947                 soc_dev = soc_device_to_device(soc);
948  
949         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
950 -       at91sam9x5_pm_init();
951 +       sama5_pm_init();
952  }
953  
954  static const char *const sama5_dt_board_compat[] __initconst = {
955 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
956 index 98a2c0cbb833..310dce500d3e 100644
957 --- a/arch/arm/mach-exynos/platsmp.c
958 +++ b/arch/arm/mach-exynos/platsmp.c
959 @@ -230,7 +230,7 @@ static void __iomem *scu_base_addr(void)
960         return (void __iomem *)(S5P_VA_SCU);
961  }
962  
963 -static DEFINE_SPINLOCK(boot_lock);
964 +static DEFINE_RAW_SPINLOCK(boot_lock);
965  
966  static void exynos_secondary_init(unsigned int cpu)
967  {
968 @@ -243,8 +243,8 @@ static void exynos_secondary_init(unsigned int cpu)
969         /*
970          * Synchronise with the boot thread.
971          */
972 -       spin_lock(&boot_lock);
973 -       spin_unlock(&boot_lock);
974 +       raw_spin_lock(&boot_lock);
975 +       raw_spin_unlock(&boot_lock);
976  }
977  
978  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
979 @@ -308,7 +308,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
980          * Set synchronisation state between this boot processor
981          * and the secondary one
982          */
983 -       spin_lock(&boot_lock);
984 +       raw_spin_lock(&boot_lock);
985  
986         /*
987          * The secondary processor is waiting to be released from
988 @@ -335,7 +335,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
989  
990                 if (timeout == 0) {
991                         printk(KERN_ERR "cpu1 power enable failed");
992 -                       spin_unlock(&boot_lock);
993 +                       raw_spin_unlock(&boot_lock);
994                         return -ETIMEDOUT;
995                 }
996         }
997 @@ -381,7 +381,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
998          * calibrations, then wait for it to finish
999          */
1000  fail:
1001 -       spin_unlock(&boot_lock);
1002 +       raw_spin_unlock(&boot_lock);
1003  
1004         return pen_release != -1 ? ret : 0;
1005  }
1006 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
1007 index b5f8f5ffda79..9753a84df9c4 100644
1008 --- a/arch/arm/mach-hisi/platmcpm.c
1009 +++ b/arch/arm/mach-hisi/platmcpm.c
1010 @@ -61,7 +61,7 @@
1011  
1012  static void __iomem *sysctrl, *fabric;
1013  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
1014 -static DEFINE_SPINLOCK(boot_lock);
1015 +static DEFINE_RAW_SPINLOCK(boot_lock);
1016  static u32 fabric_phys_addr;
1017  /*
1018   * [0]: bootwrapper physical address
1019 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1020         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
1021                 return -EINVAL;
1022  
1023 -       spin_lock_irq(&boot_lock);
1024 +       raw_spin_lock_irq(&boot_lock);
1025  
1026         if (hip04_cpu_table[cluster][cpu])
1027                 goto out;
1028 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1029  
1030  out:
1031         hip04_cpu_table[cluster][cpu]++;
1032 -       spin_unlock_irq(&boot_lock);
1033 +       raw_spin_unlock_irq(&boot_lock);
1034  
1035         return 0;
1036  }
1037 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
1038         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
1039         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
1040  
1041 -       spin_lock(&boot_lock);
1042 +       raw_spin_lock(&boot_lock);
1043         hip04_cpu_table[cluster][cpu]--;
1044         if (hip04_cpu_table[cluster][cpu] == 1) {
1045                 /* A power_up request went ahead of us. */
1046 -               spin_unlock(&boot_lock);
1047 +               raw_spin_unlock(&boot_lock);
1048                 return;
1049         } else if (hip04_cpu_table[cluster][cpu] > 1) {
1050                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
1051 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
1052         }
1053  
1054         last_man = hip04_cluster_is_down(cluster);
1055 -       spin_unlock(&boot_lock);
1056 +       raw_spin_unlock(&boot_lock);
1057         if (last_man) {
1058                 /* Since it's Cortex A15, disable L2 prefetching. */
1059                 asm volatile(
1060 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1061                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
1062  
1063         count = TIMEOUT_MSEC / POLL_MSEC;
1064 -       spin_lock_irq(&boot_lock);
1065 +       raw_spin_lock_irq(&boot_lock);
1066         for (tries = 0; tries < count; tries++) {
1067                 if (hip04_cpu_table[cluster][cpu])
1068                         goto err;
1069 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1070                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
1071                 if (data & CORE_WFI_STATUS(cpu))
1072                         break;
1073 -               spin_unlock_irq(&boot_lock);
1074 +               raw_spin_unlock_irq(&boot_lock);
1075                 /* Wait for clean L2 when the whole cluster is down. */
1076                 msleep(POLL_MSEC);
1077 -               spin_lock_irq(&boot_lock);
1078 +               raw_spin_lock_irq(&boot_lock);
1079         }
1080         if (tries >= count)
1081                 goto err;
1082 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1083                 goto err;
1084         if (hip04_cluster_is_down(cluster))
1085                 hip04_set_snoop_filter(cluster, 0);
1086 -       spin_unlock_irq(&boot_lock);
1087 +       raw_spin_unlock_irq(&boot_lock);
1088         return 1;
1089  err:
1090 -       spin_unlock_irq(&boot_lock);
1091 +       raw_spin_unlock_irq(&boot_lock);
1092         return 0;
1093  }
1094  #endif
1095 diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
1096 index 8ceda2844c4f..08bcf8fb76f2 100644
1097 --- a/arch/arm/mach-imx/Kconfig
1098 +++ b/arch/arm/mach-imx/Kconfig
1099 @@ -524,7 +524,7 @@ config SOC_IMX6Q
1100         bool "i.MX6 Quad/DualLite support"
1101         select ARM_ERRATA_764369 if SMP
1102         select HAVE_ARM_SCU if SMP
1103 -       select HAVE_ARM_TWD if SMP
1104 +       select HAVE_ARM_TWD
1105         select PCI_DOMAINS if PCI
1106         select PINCTRL_IMX6Q
1107         select SOC_IMX6
1108 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
1109 index 79e1f876d1c9..7e625c17f78e 100644
1110 --- a/arch/arm/mach-omap2/omap-smp.c
1111 +++ b/arch/arm/mach-omap2/omap-smp.c
1112 @@ -43,7 +43,7 @@
1113  /* SCU base address */
1114  static void __iomem *scu_base;
1115  
1116 -static DEFINE_SPINLOCK(boot_lock);
1117 +static DEFINE_RAW_SPINLOCK(boot_lock);
1118  
1119  void __iomem *omap4_get_scu_base(void)
1120  {
1121 @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu)
1122         /*
1123          * Synchronise with the boot thread.
1124          */
1125 -       spin_lock(&boot_lock);
1126 -       spin_unlock(&boot_lock);
1127 +       raw_spin_lock(&boot_lock);
1128 +       raw_spin_unlock(&boot_lock);
1129  }
1130  
1131  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1132 @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1133          * Set synchronisation state between this boot processor
1134          * and the secondary one
1135          */
1136 -       spin_lock(&boot_lock);
1137 +       raw_spin_lock(&boot_lock);
1138  
1139         /*
1140          * Update the AuxCoreBoot0 with boot state for secondary core.
1141 @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1142          * Now the secondary core is starting up let it run its
1143          * calibrations, then wait for it to finish
1144          */
1145 -       spin_unlock(&boot_lock);
1146 +       raw_spin_unlock(&boot_lock);
1147  
1148         return 0;
1149  }
1150 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
1151 index e46c91094dde..dcb3ed0c26da 100644
1152 --- a/arch/arm/mach-prima2/platsmp.c
1153 +++ b/arch/arm/mach-prima2/platsmp.c
1154 @@ -22,7 +22,7 @@
1155  
1156  static void __iomem *clk_base;
1157  
1158 -static DEFINE_SPINLOCK(boot_lock);
1159 +static DEFINE_RAW_SPINLOCK(boot_lock);
1160  
1161  static void sirfsoc_secondary_init(unsigned int cpu)
1162  {
1163 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
1164         /*
1165          * Synchronise with the boot thread.
1166          */
1167 -       spin_lock(&boot_lock);
1168 -       spin_unlock(&boot_lock);
1169 +       raw_spin_lock(&boot_lock);
1170 +       raw_spin_unlock(&boot_lock);
1171  }
1172  
1173  static const struct of_device_id clk_ids[]  = {
1174 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1175         /* make sure write buffer is drained */
1176         mb();
1177  
1178 -       spin_lock(&boot_lock);
1179 +       raw_spin_lock(&boot_lock);
1180  
1181         /*
1182          * The secondary processor is waiting to be released from
1183 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1184          * now the secondary core is starting up let it run its
1185          * calibrations, then wait for it to finish
1186          */
1187 -       spin_unlock(&boot_lock);
1188 +       raw_spin_unlock(&boot_lock);
1189  
1190         return pen_release != -1 ? -ENOSYS : 0;
1191  }
1192 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
1193 index 9b00123a315d..0a49fe1bc8cf 100644
1194 --- a/arch/arm/mach-qcom/platsmp.c
1195 +++ b/arch/arm/mach-qcom/platsmp.c
1196 @@ -46,7 +46,7 @@
1197  
1198  extern void secondary_startup_arm(void);
1199  
1200 -static DEFINE_SPINLOCK(boot_lock);
1201 +static DEFINE_RAW_SPINLOCK(boot_lock);
1202  
1203  #ifdef CONFIG_HOTPLUG_CPU
1204  static void qcom_cpu_die(unsigned int cpu)
1205 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
1206         /*
1207          * Synchronise with the boot thread.
1208          */
1209 -       spin_lock(&boot_lock);
1210 -       spin_unlock(&boot_lock);
1211 +       raw_spin_lock(&boot_lock);
1212 +       raw_spin_unlock(&boot_lock);
1213  }
1214  
1215  static int scss_release_secondary(unsigned int cpu)
1216 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1217          * set synchronisation state between this boot processor
1218          * and the secondary one
1219          */
1220 -       spin_lock(&boot_lock);
1221 +       raw_spin_lock(&boot_lock);
1222  
1223         /*
1224          * Send the secondary CPU a soft interrupt, thereby causing
1225 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1226          * now the secondary core is starting up let it run its
1227          * calibrations, then wait for it to finish
1228          */
1229 -       spin_unlock(&boot_lock);
1230 +       raw_spin_unlock(&boot_lock);
1231  
1232         return ret;
1233  }
1234 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
1235 index fd4297713d67..b0553b2c2d53 100644
1236 --- a/arch/arm/mach-spear/platsmp.c
1237 +++ b/arch/arm/mach-spear/platsmp.c
1238 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1239         sync_cache_w(&pen_release);
1240  }
1241  
1242 -static DEFINE_SPINLOCK(boot_lock);
1243 +static DEFINE_RAW_SPINLOCK(boot_lock);
1244  
1245  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
1246  
1247 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
1248         /*
1249          * Synchronise with the boot thread.
1250          */
1251 -       spin_lock(&boot_lock);
1252 -       spin_unlock(&boot_lock);
1253 +       raw_spin_lock(&boot_lock);
1254 +       raw_spin_unlock(&boot_lock);
1255  }
1256  
1257  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1258 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1259          * set synchronisation state between this boot processor
1260          * and the secondary one
1261          */
1262 -       spin_lock(&boot_lock);
1263 +       raw_spin_lock(&boot_lock);
1264  
1265         /*
1266          * The secondary processor is waiting to be released from
1267 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1268          * now the secondary core is starting up let it run its
1269          * calibrations, then wait for it to finish
1270          */
1271 -       spin_unlock(&boot_lock);
1272 +       raw_spin_unlock(&boot_lock);
1273  
1274         return pen_release != -1 ? -ENOSYS : 0;
1275  }
1276 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
1277 index c4ad6eae67fa..e830b20b212f 100644
1278 --- a/arch/arm/mach-sti/platsmp.c
1279 +++ b/arch/arm/mach-sti/platsmp.c
1280 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
1281         sync_cache_w(&pen_release);
1282  }
1283  
1284 -static DEFINE_SPINLOCK(boot_lock);
1285 +static DEFINE_RAW_SPINLOCK(boot_lock);
1286  
1287  static void sti_secondary_init(unsigned int cpu)
1288  {
1289 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
1290         /*
1291          * Synchronise with the boot thread.
1292          */
1293 -       spin_lock(&boot_lock);
1294 -       spin_unlock(&boot_lock);
1295 +       raw_spin_lock(&boot_lock);
1296 +       raw_spin_unlock(&boot_lock);
1297  }
1298  
1299  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1300 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1301          * set synchronisation state between this boot processor
1302          * and the secondary one
1303          */
1304 -       spin_lock(&boot_lock);
1305 +       raw_spin_lock(&boot_lock);
1306  
1307         /*
1308          * The secondary processor is waiting to be released from
1309 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1310          * now the secondary core is starting up let it run its
1311          * calibrations, then wait for it to finish
1312          */
1313 -       spin_unlock(&boot_lock);
1314 +       raw_spin_unlock(&boot_lock);
1315  
1316         return pen_release != -1 ? -ENOSYS : 0;
1317  }
1318 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1319 index daafcf121ce0..b8aa1e9ee8ee 100644
1320 --- a/arch/arm/mm/fault.c
1321 +++ b/arch/arm/mm/fault.c
1322 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1323         if (addr < TASK_SIZE)
1324                 return do_page_fault(addr, fsr, regs);
1325  
1326 +       if (interrupts_enabled(regs))
1327 +               local_irq_enable();
1328 +
1329         if (user_mode(regs))
1330                 goto bad_area;
1331  
1332 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1333  static int
1334  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1335  {
1336 +       if (interrupts_enabled(regs))
1337 +               local_irq_enable();
1338 +
1339         do_bad_area(addr, fsr, regs);
1340         return 0;
1341  }
1342 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1343 index d02f8187b1cc..542692dbd40a 100644
1344 --- a/arch/arm/mm/highmem.c
1345 +++ b/arch/arm/mm/highmem.c
1346 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1347         return *ptep;
1348  }
1349  
1350 +static unsigned int fixmap_idx(int type)
1351 +{
1352 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1353 +}
1354 +
1355  void *kmap(struct page *page)
1356  {
1357         might_sleep();
1358 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1359  
1360  void *kmap_atomic(struct page *page)
1361  {
1362 +       pte_t pte = mk_pte(page, kmap_prot);
1363         unsigned int idx;
1364         unsigned long vaddr;
1365         void *kmap;
1366         int type;
1367  
1368 -       preempt_disable();
1369 +       preempt_disable_nort();
1370         pagefault_disable();
1371         if (!PageHighMem(page))
1372                 return page_address(page);
1373 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1374  
1375         type = kmap_atomic_idx_push();
1376  
1377 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1378 +       idx = fixmap_idx(type);
1379         vaddr = __fix_to_virt(idx);
1380  #ifdef CONFIG_DEBUG_HIGHMEM
1381         /*
1382 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1383          * in place, so the contained TLB flush ensures the TLB is updated
1384          * with the new mapping.
1385          */
1386 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1387 +#ifdef CONFIG_PREEMPT_RT_FULL
1388 +       current->kmap_pte[type] = pte;
1389 +#endif
1390 +       set_fixmap_pte(idx, pte);
1391  
1392         return (void *)vaddr;
1393  }
1394 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1395  
1396         if (kvaddr >= (void *)FIXADDR_START) {
1397                 type = kmap_atomic_idx();
1398 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1399 +               idx = fixmap_idx(type);
1400  
1401                 if (cache_is_vivt())
1402                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1403 +#ifdef CONFIG_PREEMPT_RT_FULL
1404 +               current->kmap_pte[type] = __pte(0);
1405 +#endif
1406  #ifdef CONFIG_DEBUG_HIGHMEM
1407                 BUG_ON(vaddr != __fix_to_virt(idx));
1408 -               set_fixmap_pte(idx, __pte(0));
1409  #else
1410                 (void) idx;  /* to kill a warning */
1411  #endif
1412 +               set_fixmap_pte(idx, __pte(0));
1413                 kmap_atomic_idx_pop();
1414         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1415                 /* this address was obtained through kmap_high_get() */
1416                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1417         }
1418         pagefault_enable();
1419 -       preempt_enable();
1420 +       preempt_enable_nort();
1421  }
1422  EXPORT_SYMBOL(__kunmap_atomic);
1423  
1424  void *kmap_atomic_pfn(unsigned long pfn)
1425  {
1426 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1427         unsigned long vaddr;
1428         int idx, type;
1429         struct page *page = pfn_to_page(pfn);
1430  
1431 -       preempt_disable();
1432 +       preempt_disable_nort();
1433         pagefault_disable();
1434         if (!PageHighMem(page))
1435                 return page_address(page);
1436  
1437         type = kmap_atomic_idx_push();
1438 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1439 +       idx = fixmap_idx(type);
1440         vaddr = __fix_to_virt(idx);
1441  #ifdef CONFIG_DEBUG_HIGHMEM
1442         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1443  #endif
1444 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1445 +#ifdef CONFIG_PREEMPT_RT_FULL
1446 +       current->kmap_pte[type] = pte;
1447 +#endif
1448 +       set_fixmap_pte(idx, pte);
1449  
1450         return (void *)vaddr;
1451  }
1452 +#if defined CONFIG_PREEMPT_RT_FULL
1453 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1454 +{
1455 +       int i;
1456 +
1457 +       /*
1458 +        * Clear @prev's kmap_atomic mappings
1459 +        */
1460 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1461 +               int idx = fixmap_idx(i);
1462 +
1463 +               set_fixmap_pte(idx, __pte(0));
1464 +       }
1465 +       /*
1466 +        * Restore @next_p's kmap_atomic mappings
1467 +        */
1468 +       for (i = 0; i < next_p->kmap_idx; i++) {
1469 +               int idx = fixmap_idx(i);
1470 +
1471 +               if (!pte_none(next_p->kmap_pte[i]))
1472 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1473 +       }
1474 +}
1475 +#endif
1476 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1477 index 53feb90c840c..b4a8d54fc3f3 100644
1478 --- a/arch/arm/plat-versatile/platsmp.c
1479 +++ b/arch/arm/plat-versatile/platsmp.c
1480 @@ -30,7 +30,7 @@ static void write_pen_release(int val)
1481         sync_cache_w(&pen_release);
1482  }
1483  
1484 -static DEFINE_SPINLOCK(boot_lock);
1485 +static DEFINE_RAW_SPINLOCK(boot_lock);
1486  
1487  void versatile_secondary_init(unsigned int cpu)
1488  {
1489 @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu)
1490         /*
1491          * Synchronise with the boot thread.
1492          */
1493 -       spin_lock(&boot_lock);
1494 -       spin_unlock(&boot_lock);
1495 +       raw_spin_lock(&boot_lock);
1496 +       raw_spin_unlock(&boot_lock);
1497  }
1498  
1499  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1500 @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1501          * Set synchronisation state between this boot processor
1502          * and the secondary one
1503          */
1504 -       spin_lock(&boot_lock);
1505 +       raw_spin_lock(&boot_lock);
1506  
1507         /*
1508          * This is really belt and braces; we hold unintended secondary
1509 @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1510          * now the secondary core is starting up let it run its
1511          * calibrations, then wait for it to finish
1512          */
1513 -       spin_unlock(&boot_lock);
1514 +       raw_spin_unlock(&boot_lock);
1515  
1516         return pen_release != -1 ? -ENOSYS : 0;
1517  }
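
The boot_lock conversions above (spear13xx, sti and versatile all follow the same pattern) exist because on PREEMPT_RT_FULL a plain spinlock_t becomes a sleeping rt_mutex-based lock, which must not be taken on the secondary-CPU bringup path where the pen_release hand-off runs with interrupts off. A raw spinlock always busy-waits, so it remains usable there. A minimal sketch of the pattern, using a hypothetical demo lock rather than anything from the patch:

    #include <linux/spinlock.h>

    /* Sketch: a lock that must stay a busy-wait lock even on PREEMPT_RT_FULL. */
    static DEFINE_RAW_SPINLOCK(demo_boot_lock);

    static void demo_boot_sync(void)
    {
            raw_spin_lock(&demo_boot_lock);         /* never sleeps, RT or not */
            /* ... pen_release style hand-off with the secondary core ... */
            raw_spin_unlock(&demo_boot_lock);
    }
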
1518 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1519 index 14cdc6dea493..9196cf82f7be 100644
1520 --- a/arch/arm64/Kconfig
1521 +++ b/arch/arm64/Kconfig
1522 @@ -76,6 +76,7 @@ config ARM64
1523         select HAVE_PERF_REGS
1524         select HAVE_PERF_USER_STACK_DUMP
1525         select HAVE_RCU_TABLE_FREE
1526 +       select HAVE_PREEMPT_LAZY
1527         select HAVE_SYSCALL_TRACEPOINTS
1528         select IOMMU_DMA if IOMMU_SUPPORT
1529         select IRQ_DOMAIN
1530 @@ -582,7 +583,7 @@ config XEN_DOM0
1531  
1532  config XEN
1533         bool "Xen guest support on ARM64"
1534 -       depends on ARM64 && OF
1535 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1536         select SWIOTLB_XEN
1537         help
1538           Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
1539 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1540 index 90c7ff233735..5f4e89fbc290 100644
1541 --- a/arch/arm64/include/asm/thread_info.h
1542 +++ b/arch/arm64/include/asm/thread_info.h
1543 @@ -49,6 +49,7 @@ struct thread_info {
1544         mm_segment_t            addr_limit;     /* address limit */
1545         struct task_struct      *task;          /* main task structure */
1546         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1547 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1548         int                     cpu;            /* cpu */
1549  };
1550  
1551 @@ -103,6 +104,7 @@ static inline struct thread_info *current_thread_info(void)
1552  #define TIF_NEED_RESCHED       1
1553  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1554  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1555 +#define TIF_NEED_RESCHED_LAZY  4
1556  #define TIF_NOHZ               7
1557  #define TIF_SYSCALL_TRACE      8
1558  #define TIF_SYSCALL_AUDIT      9
1559 @@ -118,6 +120,7 @@ static inline struct thread_info *current_thread_info(void)
1560  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1561  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1562  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1563 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1564  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1565  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1566  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1567 @@ -126,7 +129,8 @@ static inline struct thread_info *current_thread_info(void)
1568  #define _TIF_32BIT             (1 << TIF_32BIT)
1569  
1570  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1571 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1572 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1573 +                                _TIF_NEED_RESCHED_LAZY)
1574  
1575  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1576                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1577 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1578 index 087cf9a65359..d74475928399 100644
1579 --- a/arch/arm64/kernel/asm-offsets.c
1580 +++ b/arch/arm64/kernel/asm-offsets.c
1581 @@ -35,6 +35,7 @@ int main(void)
1582    BLANK();
1583    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1584    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1585 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1586    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1587    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1588    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1589 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1590 index 5a3753d09e20..05d73c4c03f6 100644
1591 --- a/arch/arm64/kernel/entry.S
1592 +++ b/arch/arm64/kernel/entry.S
1593 @@ -376,11 +376,16 @@ el1_irq:
1594  #ifdef CONFIG_PREEMPT
1595         get_thread_info tsk
1596         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1597 -       cbnz    w24, 1f                         // preempt count != 0
1598 +       cbnz    w24, 2f                         // preempt count != 0
1599         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1600 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1601 -       bl      el1_preempt
1602 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1603 +
1604 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1605 +       cbnz    w24, 2f                         // preempt lazy count != 0
1606 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1607  1:
1608 +       bl      el1_preempt
1609 +2:
1610  #endif
1611  #ifdef CONFIG_TRACE_IRQFLAGS
1612         bl      trace_hardirqs_on
1613 @@ -394,6 +399,7 @@ el1_preempt:
1614  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1615         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1616         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1617 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1618         ret     x24
1619  #endif
1620  
1621 @@ -638,6 +644,7 @@ ret_fast_syscall_trace:
1622   */
1623  work_pending:
1624         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1625 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1626         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1627         ldr     x2, [sp, #S_PSTATE]
1628         mov     x0, sp                          // 'regs'
1629 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1630 index db459612de44..bd8be6a0e745 100644
1631 --- a/arch/mips/Kconfig
1632 +++ b/arch/mips/Kconfig
1633 @@ -2410,7 +2410,7 @@ config CPU_R4400_WORKAROUNDS
1634  #
1635  config HIGHMEM
1636         bool "High Memory Support"
1637 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1638 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1639  
1640  config CPU_SUPPORTS_HIGHMEM
1641         bool
1642 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
1643 index e86b7499921a..b2a2f678c5dc 100644
1644 --- a/arch/mips/kvm/mips.c
1645 +++ b/arch/mips/kvm/mips.c
1646 @@ -454,8 +454,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1647  
1648         dvcpu->arch.wait = 0;
1649  
1650 -       if (waitqueue_active(&dvcpu->wq))
1651 -               wake_up_interruptible(&dvcpu->wq);
1652 +       if (swait_active(&dvcpu->wq))
1653 +               swake_up(&dvcpu->wq);
1654  
1655         return 0;
1656  }
1657 @@ -1183,8 +1183,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1658         kvm_mips_callbacks->queue_timer_int(vcpu);
1659  
1660         vcpu->arch.wait = 0;
1661 -       if (waitqueue_active(&vcpu->wq))
1662 -               wake_up_interruptible(&vcpu->wq);
1663 +       if (swait_active(&vcpu->wq))
1664 +               swake_up(&vcpu->wq);
1665  }
1666  
1667  /* low level hrtimer wake routine */
1668 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1669 index db49e0d796b1..1d2be228661c 100644
1670 --- a/arch/powerpc/Kconfig
1671 +++ b/arch/powerpc/Kconfig
1672 @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT
1673  
1674  config RWSEM_GENERIC_SPINLOCK
1675         bool
1676 +       default y if PREEMPT_RT_FULL
1677  
1678  config RWSEM_XCHGADD_ALGORITHM
1679         bool
1680 -       default y
1681 +       default y if !PREEMPT_RT_FULL
1682  
1683  config GENERIC_LOCKBREAK
1684         bool
1685 @@ -141,6 +142,7 @@ config PPC
1686         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1687         select GENERIC_STRNCPY_FROM_USER
1688         select GENERIC_STRNLEN_USER
1689 +       select HAVE_PREEMPT_LAZY
1690         select HAVE_MOD_ARCH_SPECIFIC
1691         select MODULES_USE_ELF_RELA
1692         select CLONE_BACKWARDS
1693 @@ -319,7 +321,7 @@ menu "Kernel options"
1694  
1695  config HIGHMEM
1696         bool "High memory support"
1697 -       depends on PPC32
1698 +       depends on PPC32 && !PREEMPT_RT_FULL
1699  
1700  source kernel/Kconfig.hz
1701  source kernel/Kconfig.preempt
1702 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
1703 index cfa758c6b4f6..f8673ff84b31 100644
1704 --- a/arch/powerpc/include/asm/kvm_host.h
1705 +++ b/arch/powerpc/include/asm/kvm_host.h
1706 @@ -286,7 +286,7 @@ struct kvmppc_vcore {
1707         struct list_head runnable_threads;
1708         struct list_head preempt_list;
1709         spinlock_t lock;
1710 -       wait_queue_head_t wq;
1711 +       struct swait_queue_head wq;
1712         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
1713         u64 stolen_tb;
1714         u64 preempt_tb;
1715 @@ -626,7 +626,7 @@ struct kvm_vcpu_arch {
1716         u8 prodded;
1717         u32 last_inst;
1718  
1719 -       wait_queue_head_t *wqp;
1720 +       struct swait_queue_head *wqp;
1721         struct kvmppc_vcore *vcore;
1722         int ret;
1723         int trap;
1724 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1725 index 7efee4a3240b..40e6fa1b85b2 100644
1726 --- a/arch/powerpc/include/asm/thread_info.h
1727 +++ b/arch/powerpc/include/asm/thread_info.h
1728 @@ -42,6 +42,8 @@ struct thread_info {
1729         int             cpu;                    /* cpu we're on */
1730         int             preempt_count;          /* 0 => preemptable,
1731                                                    <0 => BUG */
1732 +       int             preempt_lazy_count;      /* 0 => preemptable,
1733 +                                                  <0 => BUG */
1734         unsigned long   local_flags;            /* private flags for thread */
1735  
1736         /* low level flags - has atomic operations done on it */
1737 @@ -82,8 +84,7 @@ static inline struct thread_info *current_thread_info(void)
1738  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1739  #define TIF_SIGPENDING         1       /* signal pending */
1740  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1741 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1742 -                                          TIF_NEED_RESCHED */
1743 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1744  #define TIF_32BIT              4       /* 32 bit binary */
1745  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1746  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1747 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
1748  #if defined(CONFIG_PPC64)
1749  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1750  #endif
1751 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1752 +                                          TIF_NEED_RESCHED */
1753  
1754  /* as above, but as bit values */
1755  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1756 @@ -119,14 +122,16 @@ static inline struct thread_info *current_thread_info(void)
1757  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1758  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1759  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1760 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1761  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1762                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1763                                  _TIF_NOHZ)
1764  
1765  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1766                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1767 -                                _TIF_RESTORE_TM)
1768 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1769  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1770 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1771  
1772  /* Bits in local_flags */
1773  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1774 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1775 index 221d584d089f..d6d0c59ef8ae 100644
1776 --- a/arch/powerpc/kernel/asm-offsets.c
1777 +++ b/arch/powerpc/kernel/asm-offsets.c
1778 @@ -160,6 +160,7 @@ int main(void)
1779         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1780         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1781         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1782 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1783         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1784         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1785  
1786 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1787 index 2405631e91a2..c21b4b42eaa0 100644
1788 --- a/arch/powerpc/kernel/entry_32.S
1789 +++ b/arch/powerpc/kernel/entry_32.S
1790 @@ -818,7 +818,14 @@ resume_kernel:
1791         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1792         bne     restore
1793         andi.   r8,r8,_TIF_NEED_RESCHED
1794 +       bne+    1f
1795 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1796 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1797 +       bne     restore
1798 +       lwz     r0,TI_FLAGS(r9)
1799 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1800         beq+    restore
1801 +1:
1802         lwz     r3,_MSR(r1)
1803         andi.   r0,r3,MSR_EE    /* interrupts off? */
1804         beq     restore         /* don't schedule if so */
1805 @@ -829,11 +836,11 @@ resume_kernel:
1806          */
1807         bl      trace_hardirqs_off
1808  #endif
1809 -1:     bl      preempt_schedule_irq
1810 +2:     bl      preempt_schedule_irq
1811         CURRENT_THREAD_INFO(r9, r1)
1812         lwz     r3,TI_FLAGS(r9)
1813 -       andi.   r0,r3,_TIF_NEED_RESCHED
1814 -       bne-    1b
1815 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1816 +       bne-    2b
1817  #ifdef CONFIG_TRACE_IRQFLAGS
1818         /* And now, to properly rebalance the above, we tell lockdep they
1819          * are being turned back on, which will happen when we return
1820 @@ -1154,7 +1161,7 @@ global_dbcr0:
1821  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1822  
1823  do_work:                       /* r10 contains MSR_KERNEL here */
1824 -       andi.   r0,r9,_TIF_NEED_RESCHED
1825 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1826         beq     do_user_signal
1827  
1828  do_resched:                    /* r10 contains MSR_KERNEL here */
1829 @@ -1175,7 +1182,7 @@ recheck:
1830         MTMSRD(r10)             /* disable interrupts */
1831         CURRENT_THREAD_INFO(r9, r1)
1832         lwz     r9,TI_FLAGS(r9)
1833 -       andi.   r0,r9,_TIF_NEED_RESCHED
1834 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1835         bne-    do_resched
1836         andi.   r0,r9,_TIF_USER_WORK_MASK
1837         beq     restore_user
1838 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1839 index edba294620db..1aae3fdb0c2a 100644
1840 --- a/arch/powerpc/kernel/entry_64.S
1841 +++ b/arch/powerpc/kernel/entry_64.S
1842 @@ -683,7 +683,7 @@ _GLOBAL(ret_from_except_lite)
1843  #else
1844         beq     restore
1845  #endif
1846 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1847 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1848         beq     2f
1849         bl      restore_interrupts
1850         SCHEDULE_USER
1851 @@ -745,10 +745,18 @@ resume_kernel:
1852  
1853  #ifdef CONFIG_PREEMPT
1854         /* Check if we need to preempt */
1855 +       lwz     r8,TI_PREEMPT(r9)
1856 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1857 +       bne     restore
1858         andi.   r0,r4,_TIF_NEED_RESCHED
1859 +       bne+    check_count
1860 +
1861 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1862         beq+    restore
1863 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1864 +
1865         /* Check that preempt_count() == 0 and interrupts are enabled */
1866 -       lwz     r8,TI_PREEMPT(r9)
1867 +check_count:
1868         cmpwi   cr1,r8,0
1869         ld      r0,SOFTE(r1)
1870         cmpdi   r0,0
1871 @@ -765,7 +773,7 @@ resume_kernel:
1872         /* Re-test flags and eventually loop */
1873         CURRENT_THREAD_INFO(r9, r1)
1874         ld      r4,TI_FLAGS(r9)
1875 -       andi.   r0,r4,_TIF_NEED_RESCHED
1876 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1877         bne     1b
1878  
1879         /*
1880 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1881 index 290559df1e8b..070afa6da35d 100644
1882 --- a/arch/powerpc/kernel/irq.c
1883 +++ b/arch/powerpc/kernel/irq.c
1884 @@ -614,6 +614,7 @@ void irq_ctx_init(void)
1885         }
1886  }
1887  
1888 +#ifndef CONFIG_PREEMPT_RT_FULL
1889  void do_softirq_own_stack(void)
1890  {
1891         struct thread_info *curtp, *irqtp;
1892 @@ -631,6 +632,7 @@ void do_softirq_own_stack(void)
1893         if (irqtp->flags)
1894                 set_bits(irqtp->flags, &curtp->flags);
1895  }
1896 +#endif
1897  
1898  irq_hw_number_t virq_to_hw(unsigned int virq)
1899  {
1900 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1901 index ed3ab509faca..8b261416c070 100644
1902 --- a/arch/powerpc/kernel/misc_32.S
1903 +++ b/arch/powerpc/kernel/misc_32.S
1904 @@ -40,6 +40,7 @@
1905   * We store the saved ksp_limit in the unused part
1906   * of the STACK_FRAME_OVERHEAD
1907   */
1908 +#ifndef CONFIG_PREEMPT_RT_FULL
1909  _GLOBAL(call_do_softirq)
1910         mflr    r0
1911         stw     r0,4(r1)
1912 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1913         stw     r10,THREAD+KSP_LIMIT(r2)
1914         mtlr    r0
1915         blr
1916 +#endif
1917  
1918  /*
1919   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1920 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1921 index db475d41b57a..96b7ef80e05d 100644
1922 --- a/arch/powerpc/kernel/misc_64.S
1923 +++ b/arch/powerpc/kernel/misc_64.S
1924 @@ -30,6 +30,7 @@
1925  
1926         .text
1927  
1928 +#ifndef CONFIG_PREEMPT_RT_FULL
1929  _GLOBAL(call_do_softirq)
1930         mflr    r0
1931         std     r0,16(r1)
1932 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1933         ld      r0,16(r1)
1934         mtlr    r0
1935         blr
1936 +#endif
1937  
1938  _GLOBAL(call_do_irq)
1939         mflr    r0
1940 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1941 index c2024ac9d4e8..2303788da7e1 100644
1942 --- a/arch/powerpc/kvm/Kconfig
1943 +++ b/arch/powerpc/kvm/Kconfig
1944 @@ -172,6 +172,7 @@ config KVM_E500MC
1945  config KVM_MPIC
1946         bool "KVM in-kernel MPIC emulation"
1947         depends on KVM && E500
1948 +       depends on !PREEMPT_RT_FULL
1949         select HAVE_KVM_IRQCHIP
1950         select HAVE_KVM_IRQFD
1951         select HAVE_KVM_IRQ_ROUTING
1952 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
1953 index a7352b59e6f9..df34a6432873 100644
1954 --- a/arch/powerpc/kvm/book3s_hv.c
1955 +++ b/arch/powerpc/kvm/book3s_hv.c
1956 @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
1957  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
1958  {
1959         int cpu;
1960 -       wait_queue_head_t *wqp;
1961 +       struct swait_queue_head *wqp;
1962  
1963         wqp = kvm_arch_vcpu_wq(vcpu);
1964 -       if (waitqueue_active(wqp)) {
1965 -               wake_up_interruptible(wqp);
1966 +       if (swait_active(wqp)) {
1967 +               swake_up(wqp);
1968                 ++vcpu->stat.halt_wakeup;
1969         }
1970  
1971 @@ -707,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1972                 tvcpu->arch.prodded = 1;
1973                 smp_mb();
1974                 if (vcpu->arch.ceded) {
1975 -                       if (waitqueue_active(&vcpu->wq)) {
1976 -                               wake_up_interruptible(&vcpu->wq);
1977 +                       if (swait_active(&vcpu->wq)) {
1978 +                               swake_up(&vcpu->wq);
1979                                 vcpu->stat.halt_wakeup++;
1980                         }
1981                 }
1982 @@ -1447,7 +1447,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1983         INIT_LIST_HEAD(&vcore->runnable_threads);
1984         spin_lock_init(&vcore->lock);
1985         spin_lock_init(&vcore->stoltb_lock);
1986 -       init_waitqueue_head(&vcore->wq);
1987 +       init_swait_queue_head(&vcore->wq);
1988         vcore->preempt_tb = TB_NIL;
1989         vcore->lpcr = kvm->arch.lpcr;
1990         vcore->first_vcpuid = core * threads_per_subcore;
1991 @@ -2519,10 +2519,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
1992  {
1993         struct kvm_vcpu *vcpu;
1994         int do_sleep = 1;
1995 +       DECLARE_SWAITQUEUE(wait);
1996  
1997 -       DEFINE_WAIT(wait);
1998 -
1999 -       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2000 +       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2001  
2002         /*
2003          * Check one last time for pending exceptions and ceded state after
2004 @@ -2536,7 +2535,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2005         }
2006  
2007         if (!do_sleep) {
2008 -               finish_wait(&vc->wq, &wait);
2009 +               finish_swait(&vc->wq, &wait);
2010                 return;
2011         }
2012  
2013 @@ -2544,7 +2543,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2014         trace_kvmppc_vcore_blocked(vc, 0);
2015         spin_unlock(&vc->lock);
2016         schedule();
2017 -       finish_wait(&vc->wq, &wait);
2018 +       finish_swait(&vc->wq, &wait);
2019         spin_lock(&vc->lock);
2020         vc->vcore_state = VCORE_INACTIVE;
2021         trace_kvmppc_vcore_blocked(vc, 1);
2022 @@ -2600,7 +2599,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2023                         kvmppc_start_thread(vcpu, vc);
2024                         trace_kvm_guest_enter(vcpu);
2025                 } else if (vc->vcore_state == VCORE_SLEEPING) {
2026 -                       wake_up(&vc->wq);
2027 +                       swake_up(&vc->wq);
2028                 }
2029  
2030         }
2031 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
2032 index 3f175e8aedb4..c4c02f91904c 100644
2033 --- a/arch/powerpc/platforms/ps3/device-init.c
2034 +++ b/arch/powerpc/platforms/ps3/device-init.c
2035 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
2036         }
2037         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
2038  
2039 -       res = wait_event_interruptible(dev->done.wait,
2040 +       res = swait_event_interruptible(dev->done.wait,
2041                                        dev->done.done || kthread_should_stop());
2042         if (kthread_should_stop())
2043                 res = -EINTR;
2044 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
2045 index e9a983f40a24..bbdc539fb3c6 100644
2046 --- a/arch/s390/include/asm/kvm_host.h
2047 +++ b/arch/s390/include/asm/kvm_host.h
2048 @@ -427,7 +427,7 @@ struct kvm_s390_irq_payload {
2049  struct kvm_s390_local_interrupt {
2050         spinlock_t lock;
2051         struct kvm_s390_float_interrupt *float_int;
2052 -       wait_queue_head_t *wq;
2053 +       struct swait_queue_head *wq;
2054         atomic_t *cpuflags;
2055         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2056         struct kvm_s390_irq_payload irq;
2057 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
2058 index 6a75352f453c..cc862c486002 100644
2059 --- a/arch/s390/kvm/interrupt.c
2060 +++ b/arch/s390/kvm/interrupt.c
2061 @@ -868,13 +868,13 @@ no_timer:
2062  
2063  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
2064  {
2065 -       if (waitqueue_active(&vcpu->wq)) {
2066 +       if (swait_active(&vcpu->wq)) {
2067                 /*
2068                  * The vcpu gave up the cpu voluntarily, mark it as a good
2069                  * yield-candidate.
2070                  */
2071                 vcpu->preempted = true;
2072 -               wake_up_interruptible(&vcpu->wq);
2073 +               swake_up(&vcpu->wq);
2074                 vcpu->stat.halt_wakeup++;
2075         }
2076  }
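
The wait-queue changes above (MIPS KVM, Book3S HV, the ps3 notification thread and s390 KVM) all move from wait_queue_head_t to the simple wait queue API: swake_up() wakes a single waiter under a raw spinlock and never takes a sleeping lock, which keeps the vcpu kick paths usable from the atomic contexts RT cares about. A minimal usage sketch with hypothetical demo_* names, mirroring the wait/wake pattern in those hunks (memory barriers omitted for brevity):

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_done;

    static void demo_waiter(void)
    {
            /* Sleep until demo_done is set, as the ps3 hunk does for dev->done. */
            swait_event_interruptible(demo_wq, demo_done);
    }

    static void demo_waker(void)
    {
            demo_done = true;
            if (swait_active(&demo_wq))     /* cheap "anyone waiting?" check */
                    swake_up(&demo_wq);     /* wakes one waiter, no sleeping locks */
    }
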
2077 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
2078 index 6c0378c0b8b5..abd58b4dff97 100644
2079 --- a/arch/sh/kernel/irq.c
2080 +++ b/arch/sh/kernel/irq.c
2081 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
2082         hardirq_ctx[cpu] = NULL;
2083  }
2084  
2085 +#ifndef CONFIG_PREEMPT_RT_FULL
2086  void do_softirq_own_stack(void)
2087  {
2088         struct thread_info *curctx;
2089 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
2090                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
2091         );
2092  }
2093 +#endif
2094  #else
2095  static inline void handle_one_irq(unsigned int irq)
2096  {
2097 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
2098 index 56442d2d7bbc..8c9598f534c9 100644
2099 --- a/arch/sparc/Kconfig
2100 +++ b/arch/sparc/Kconfig
2101 @@ -189,12 +189,10 @@ config NR_CPUS
2102  source kernel/Kconfig.hz
2103  
2104  config RWSEM_GENERIC_SPINLOCK
2105 -       bool
2106 -       default y if SPARC32
2107 +       def_bool PREEMPT_RT_FULL
2108  
2109  config RWSEM_XCHGADD_ALGORITHM
2110 -       bool
2111 -       default y if SPARC64
2112 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2113  
2114  config GENERIC_HWEIGHT
2115         bool
2116 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
2117 index e22416ce56ea..d359de71153a 100644
2118 --- a/arch/sparc/kernel/irq_64.c
2119 +++ b/arch/sparc/kernel/irq_64.c
2120 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
2121         set_irq_regs(old_regs);
2122  }
2123  
2124 +#ifndef CONFIG_PREEMPT_RT_FULL
2125  void do_softirq_own_stack(void)
2126  {
2127         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
2128 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
2129         __asm__ __volatile__("mov %0, %%sp"
2130                              : : "r" (orig_sp));
2131  }
2132 +#endif
2133  
2134  #ifdef CONFIG_HOTPLUG_CPU
2135  void fixup_irqs(void)
2136 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2137 index 436639a31624..6ee1dd0deadc 100644
2138 --- a/arch/x86/Kconfig
2139 +++ b/arch/x86/Kconfig
2140 @@ -17,6 +17,7 @@ config X86_64
2141  ### Arch settings
2142  config X86
2143         def_bool y
2144 +       select HAVE_PREEMPT_LAZY
2145         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
2146         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
2147         select ANON_INODES
2148 @@ -212,8 +213,11 @@ config ARCH_MAY_HAVE_PC_FDC
2149         def_bool y
2150         depends on ISA_DMA_API
2151  
2152 +config RWSEM_GENERIC_SPINLOCK
2153 +       def_bool PREEMPT_RT_FULL
2154 +
2155  config RWSEM_XCHGADD_ALGORITHM
2156 -       def_bool y
2157 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2158  
2159  config GENERIC_CALIBRATE_DELAY
2160         def_bool y
2161 @@ -848,7 +852,7 @@ config IOMMU_HELPER
2162  config MAXSMP
2163         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2164         depends on X86_64 && SMP && DEBUG_KERNEL
2165 -       select CPUMASK_OFFSTACK
2166 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2167         ---help---
2168           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2169           If unsure, say N.
2170 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
2171 index 3633ad6145c5..c6d5458ee7f9 100644
2172 --- a/arch/x86/crypto/aesni-intel_glue.c
2173 +++ b/arch/x86/crypto/aesni-intel_glue.c
2174 @@ -383,14 +383,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
2175         err = blkcipher_walk_virt(desc, &walk);
2176         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2177  
2178 -       kernel_fpu_begin();
2179         while ((nbytes = walk.nbytes)) {
2180 +               kernel_fpu_begin();
2181                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2182 -                             nbytes & AES_BLOCK_MASK);
2183 +                               nbytes & AES_BLOCK_MASK);
2184 +               kernel_fpu_end();
2185                 nbytes &= AES_BLOCK_SIZE - 1;
2186                 err = blkcipher_walk_done(desc, &walk, nbytes);
2187         }
2188 -       kernel_fpu_end();
2189  
2190         return err;
2191  }
2192 @@ -407,14 +407,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
2193         err = blkcipher_walk_virt(desc, &walk);
2194         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2195  
2196 -       kernel_fpu_begin();
2197         while ((nbytes = walk.nbytes)) {
2198 +               kernel_fpu_begin();
2199                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2200                               nbytes & AES_BLOCK_MASK);
2201 +               kernel_fpu_end();
2202                 nbytes &= AES_BLOCK_SIZE - 1;
2203                 err = blkcipher_walk_done(desc, &walk, nbytes);
2204         }
2205 -       kernel_fpu_end();
2206  
2207         return err;
2208  }
2209 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
2210         err = blkcipher_walk_virt(desc, &walk);
2211         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2212  
2213 -       kernel_fpu_begin();
2214         while ((nbytes = walk.nbytes)) {
2215 +               kernel_fpu_begin();
2216                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2217                               nbytes & AES_BLOCK_MASK, walk.iv);
2218 +               kernel_fpu_end();
2219                 nbytes &= AES_BLOCK_SIZE - 1;
2220                 err = blkcipher_walk_done(desc, &walk, nbytes);
2221         }
2222 -       kernel_fpu_end();
2223  
2224         return err;
2225  }
2226 @@ -455,14 +455,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
2227         err = blkcipher_walk_virt(desc, &walk);
2228         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2229  
2230 -       kernel_fpu_begin();
2231         while ((nbytes = walk.nbytes)) {
2232 +               kernel_fpu_begin();
2233                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2234                               nbytes & AES_BLOCK_MASK, walk.iv);
2235 +               kernel_fpu_end();
2236                 nbytes &= AES_BLOCK_SIZE - 1;
2237                 err = blkcipher_walk_done(desc, &walk, nbytes);
2238         }
2239 -       kernel_fpu_end();
2240  
2241         return err;
2242  }
2243 @@ -514,18 +514,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
2244         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
2245         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2246  
2247 -       kernel_fpu_begin();
2248         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
2249 +               kernel_fpu_begin();
2250                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2251                                       nbytes & AES_BLOCK_MASK, walk.iv);
2252 +               kernel_fpu_end();
2253                 nbytes &= AES_BLOCK_SIZE - 1;
2254                 err = blkcipher_walk_done(desc, &walk, nbytes);
2255         }
2256         if (walk.nbytes) {
2257 +               kernel_fpu_begin();
2258                 ctr_crypt_final(ctx, &walk);
2259 +               kernel_fpu_end();
2260                 err = blkcipher_walk_done(desc, &walk, 0);
2261         }
2262 -       kernel_fpu_end();
2263  
2264         return err;
2265  }
2266 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
2267 index 8648158f3916..d7699130ee36 100644
2268 --- a/arch/x86/crypto/cast5_avx_glue.c
2269 +++ b/arch/x86/crypto/cast5_avx_glue.c
2270 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
2271  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2272                      bool enc)
2273  {
2274 -       bool fpu_enabled = false;
2275 +       bool fpu_enabled;
2276         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
2277         const unsigned int bsize = CAST5_BLOCK_SIZE;
2278         unsigned int nbytes;
2279 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2280                 u8 *wsrc = walk->src.virt.addr;
2281                 u8 *wdst = walk->dst.virt.addr;
2282  
2283 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2284 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2285  
2286                 /* Process multi-block batch */
2287                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
2288 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2289                 } while (nbytes >= bsize);
2290  
2291  done:
2292 +               cast5_fpu_end(fpu_enabled);
2293                 err = blkcipher_walk_done(desc, walk, nbytes);
2294         }
2295 -
2296 -       cast5_fpu_end(fpu_enabled);
2297         return err;
2298  }
2299  
2300 @@ -227,7 +226,7 @@ done:
2301  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2302                        struct scatterlist *src, unsigned int nbytes)
2303  {
2304 -       bool fpu_enabled = false;
2305 +       bool fpu_enabled;
2306         struct blkcipher_walk walk;
2307         int err;
2308  
2309 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2310         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2311  
2312         while ((nbytes = walk.nbytes)) {
2313 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2314 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2315                 nbytes = __cbc_decrypt(desc, &walk);
2316 +               cast5_fpu_end(fpu_enabled);
2317                 err = blkcipher_walk_done(desc, &walk, nbytes);
2318         }
2319 -
2320 -       cast5_fpu_end(fpu_enabled);
2321         return err;
2322  }
2323  
2324 @@ -311,7 +309,7 @@ done:
2325  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2326                      struct scatterlist *src, unsigned int nbytes)
2327  {
2328 -       bool fpu_enabled = false;
2329 +       bool fpu_enabled;
2330         struct blkcipher_walk walk;
2331         int err;
2332  
2333 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2334         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2335  
2336         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2337 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2338 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2339                 nbytes = __ctr_crypt(desc, &walk);
2340 +               cast5_fpu_end(fpu_enabled);
2341                 err = blkcipher_walk_done(desc, &walk, nbytes);
2342         }
2343  
2344 -       cast5_fpu_end(fpu_enabled);
2345 -
2346         if (walk.nbytes) {
2347                 ctr_crypt_final(desc, &walk);
2348                 err = blkcipher_walk_done(desc, &walk, 0);
2349 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
2350 index 6a85598931b5..3a506ce7ed93 100644
2351 --- a/arch/x86/crypto/glue_helper.c
2352 +++ b/arch/x86/crypto/glue_helper.c
2353 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2354         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2355         const unsigned int bsize = 128 / 8;
2356         unsigned int nbytes, i, func_bytes;
2357 -       bool fpu_enabled = false;
2358 +       bool fpu_enabled;
2359         int err;
2360  
2361         err = blkcipher_walk_virt(desc, walk);
2362 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2363                 u8 *wdst = walk->dst.virt.addr;
2364  
2365                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2366 -                                            desc, fpu_enabled, nbytes);
2367 +                                            desc, false, nbytes);
2368  
2369                 for (i = 0; i < gctx->num_funcs; i++) {
2370                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2371 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2372                 }
2373  
2374  done:
2375 +               glue_fpu_end(fpu_enabled);
2376                 err = blkcipher_walk_done(desc, walk, nbytes);
2377         }
2378  
2379 -       glue_fpu_end(fpu_enabled);
2380         return err;
2381  }
2382  
2383 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2384                             struct scatterlist *src, unsigned int nbytes)
2385  {
2386         const unsigned int bsize = 128 / 8;
2387 -       bool fpu_enabled = false;
2388 +       bool fpu_enabled;
2389         struct blkcipher_walk walk;
2390         int err;
2391  
2392 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2393  
2394         while ((nbytes = walk.nbytes)) {
2395                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2396 -                                            desc, fpu_enabled, nbytes);
2397 +                                            desc, false, nbytes);
2398                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2399 +               glue_fpu_end(fpu_enabled);
2400                 err = blkcipher_walk_done(desc, &walk, nbytes);
2401         }
2402  
2403 -       glue_fpu_end(fpu_enabled);
2404         return err;
2405  }
2406  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2407 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2408                           struct scatterlist *src, unsigned int nbytes)
2409  {
2410         const unsigned int bsize = 128 / 8;
2411 -       bool fpu_enabled = false;
2412 +       bool fpu_enabled;
2413         struct blkcipher_walk walk;
2414         int err;
2415  
2416 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2417  
2418         while ((nbytes = walk.nbytes) >= bsize) {
2419                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2420 -                                            desc, fpu_enabled, nbytes);
2421 +                                            desc, false, nbytes);
2422                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2423 +               glue_fpu_end(fpu_enabled);
2424                 err = blkcipher_walk_done(desc, &walk, nbytes);
2425         }
2426  
2427 -       glue_fpu_end(fpu_enabled);
2428 -
2429         if (walk.nbytes) {
2430                 glue_ctr_crypt_final_128bit(
2431                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2432 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2433                           void *tweak_ctx, void *crypt_ctx)
2434  {
2435         const unsigned int bsize = 128 / 8;
2436 -       bool fpu_enabled = false;
2437 +       bool fpu_enabled;
2438         struct blkcipher_walk walk;
2439         int err;
2440  
2441 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2442  
2443         /* set minimum length to bsize, for tweak_fn */
2444         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2445 -                                    desc, fpu_enabled,
2446 +                                    desc, false,
2447                                      nbytes < bsize ? bsize : nbytes);
2448 -
2449         /* calculate first value of T */
2450         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2451 +       glue_fpu_end(fpu_enabled);
2452  
2453         while (nbytes) {
2454 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2455 +                               desc, false, nbytes);
2456                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2457  
2458 +               glue_fpu_end(fpu_enabled);
2459                 err = blkcipher_walk_done(desc, &walk, nbytes);
2460                 nbytes = walk.nbytes;
2461         }
2462 -
2463 -       glue_fpu_end(fpu_enabled);
2464 -
2465         return err;
2466  }
2467  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
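
The three crypto glue changes above (aesni-intel, cast5-avx, glue_helper) share one idea: kernel_fpu_begin() also disables preemption, so instead of wrapping an entire blkcipher walk it is taken and dropped once per walk step, letting the CPU be preempted between blocks. A minimal sketch of the pattern; demo_have_more_data() and demo_process_one_chunk() are hypothetical stand-ins, not kernel symbols:

    #include <asm/fpu/api.h>

    static bool demo_have_more_data(void);          /* hypothetical: walk not finished */
    static void demo_process_one_chunk(void);       /* hypothetical: SIMD work on one block */

    static void demo_crypt_walk(void)
    {
            while (demo_have_more_data()) {
                    kernel_fpu_begin();             /* FPU usable, preemption off... */
                    demo_process_one_chunk();
                    kernel_fpu_end();               /* ...but only for this one chunk */
            }
    }
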
2468 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2469 index 1a4477cedc49..75a301b6a5b6 100644
2470 --- a/arch/x86/entry/common.c
2471 +++ b/arch/x86/entry/common.c
2472 @@ -220,7 +220,7 @@ long syscall_trace_enter(struct pt_regs *regs)
2473  
2474  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2475         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2476 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2477 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2478  
2479  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2480  {
2481 @@ -236,9 +236,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2482                 /* We have work to do. */
2483                 local_irq_enable();
2484  
2485 -               if (cached_flags & _TIF_NEED_RESCHED)
2486 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2487                         schedule();
2488  
2489 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2490 +               if (unlikely(current->forced_info.si_signo)) {
2491 +                       struct task_struct *t = current;
2492 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2493 +                       t->forced_info.si_signo = 0;
2494 +               }
2495 +#endif
2496                 if (cached_flags & _TIF_UPROBE)
2497                         uprobe_notify_resume(regs);
2498  
2499 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2500 index f3b6d54e0042..2d722ee01fc2 100644
2501 --- a/arch/x86/entry/entry_32.S
2502 +++ b/arch/x86/entry/entry_32.S
2503 @@ -278,8 +278,24 @@ END(ret_from_exception)
2504  ENTRY(resume_kernel)
2505         DISABLE_INTERRUPTS(CLBR_ANY)
2506  need_resched:
2507 +       # preempt count == 0 + NEED_RS set?
2508         cmpl    $0, PER_CPU_VAR(__preempt_count)
2509 +#ifndef CONFIG_PREEMPT_LAZY
2510         jnz     restore_all
2511 +#else
2512 +       jz test_int_off
2513 +
2514 +       # at least preempt count == 0 ?
2515 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2516 +       jne restore_all
2517 +
2518 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2519 +       jnz restore_all
2520 +
2521 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2522 +       jz restore_all
2523 +test_int_off:
2524 +#endif
2525         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2526         jz      restore_all
2527         call    preempt_schedule_irq
2528 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2529 index a55697d19824..316081a2ca85 100644
2530 --- a/arch/x86/entry/entry_64.S
2531 +++ b/arch/x86/entry/entry_64.S
2532 @@ -579,7 +579,23 @@ retint_kernel:
2533         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2534         jnc     1f
2535  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2536 +#ifndef CONFIG_PREEMPT_LAZY
2537         jnz     1f
2538 +#else
2539 +       jz      do_preempt_schedule_irq
2540 +
2541 +       # at least preempt count == 0 ?
2542 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2543 +       jnz     1f
2544 +
2545 +       GET_THREAD_INFO(%rcx)
2546 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2547 +       jnz     1f
2548 +
2549 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2550 +       jnc     1f
2551 +do_preempt_schedule_irq:
2552 +#endif
2553         call    preempt_schedule_irq
2554         jmp     0b
2555  1:
2556 @@ -867,6 +883,7 @@ bad_gs:
2557         jmp     2b
2558         .previous
2559  
2560 +#ifndef CONFIG_PREEMPT_RT_FULL
2561  /* Call softirq on interrupt stack. Interrupts are off. */
2562  ENTRY(do_softirq_own_stack)
2563         pushq   %rbp
2564 @@ -879,6 +896,7 @@ ENTRY(do_softirq_own_stack)
2565         decl    PER_CPU_VAR(irq_count)
2566         ret
2567  END(do_softirq_own_stack)
2568 +#endif
2569  
2570  #ifdef CONFIG_XEN
2571  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2572 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2573 index 01bcde84d3e4..6f432adc55cd 100644
2574 --- a/arch/x86/include/asm/preempt.h
2575 +++ b/arch/x86/include/asm/preempt.h
2576 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2577   * a decrement which hits zero means we have no preempt_count and should
2578   * reschedule.
2579   */
2580 -static __always_inline bool __preempt_count_dec_and_test(void)
2581 +static __always_inline bool ____preempt_count_dec_and_test(void)
2582  {
2583         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
2584  }
2585  
2586 +static __always_inline bool __preempt_count_dec_and_test(void)
2587 +{
2588 +       if (____preempt_count_dec_and_test())
2589 +               return true;
2590 +#ifdef CONFIG_PREEMPT_LAZY
2591 +       if (current_thread_info()->preempt_lazy_count)
2592 +               return false;
2593 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2594 +#else
2595 +       return false;
2596 +#endif
2597 +}
2598 +
2599  /*
2600   * Returns true when we need to resched and can (barring IRQ state).
2601   */
2602  static __always_inline bool should_resched(int preempt_offset)
2603  {
2604 +#ifdef CONFIG_PREEMPT_LAZY
2605 +       u32 tmp;
2606 +
2607 +       tmp = raw_cpu_read_4(__preempt_count);
2608 +       if (tmp == preempt_offset)
2609 +               return true;
2610 +
2611 +       /* preempt count == 0 ? */
2612 +       tmp &= ~PREEMPT_NEED_RESCHED;
2613 +       if (tmp)
2614 +               return false;
2615 +       if (current_thread_info()->preempt_lazy_count)
2616 +               return false;
2617 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2618 +#else
2619         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2620 +#endif
2621  }
2622  
2623  #ifdef CONFIG_PREEMPT
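
The preempt.h changes above are the core of the lazy preemption model that HAVE_PREEMPT_LAZY advertises: a hard TIF_NEED_RESCHED (roughly, a higher-priority task became runnable) still preempts immediately, while TIF_NEED_RESCHED_LAZY (roughly, an ordinary fair-class reschedule) is held off while preempt_lazy_count is non-zero. A simplified C restatement of the decision that the entry_32.S/entry_64.S assembly above mirrors, ignoring the PREEMPT_NEED_RESCHED folding in the per-cpu count:

    #include <linux/preempt.h>
    #include <linux/sched.h>

    /* Sketch only: not the in-tree helper, just the decision logic spelled out. */
    static inline bool demo_should_preempt_now(void)
    {
            if (preempt_count())                            /* inside preempt_disable()    */
                    return false;
            if (test_thread_flag(TIF_NEED_RESCHED))         /* hard request: always honour */
                    return true;
            if (current_thread_info()->preempt_lazy_count)  /* inside a lazy section       */
                    return false;
            return test_thread_flag(TIF_NEED_RESCHED_LAZY); /* lazy request: honour now    */
    }
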
2624 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2625 index 2138c9ae19ee..3f5b4ee2e2c1 100644
2626 --- a/arch/x86/include/asm/signal.h
2627 +++ b/arch/x86/include/asm/signal.h
2628 @@ -23,6 +23,19 @@ typedef struct {
2629         unsigned long sig[_NSIG_WORDS];
2630  } sigset_t;
2631  
2632 +/*
2633 + * Because some traps use the IST stack, we must keep preemption
2634 + * disabled while calling do_trap(), but do_trap() may call
2635 + * force_sig_info() which will grab the signal spin_locks for the
2636 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2637 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2638 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2639 + * trap.
2640 + */
2641 +#if defined(CONFIG_PREEMPT_RT_FULL)
2642 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2643 +#endif
2644 +
2645  #ifndef CONFIG_COMPAT
2646  typedef sigset_t compat_sigset_t;
2647  #endif
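
The comment block above describes the sending side only in prose; the receiving side is the ARCH_RT_DELAYS_SIGNAL_SEND block added to arch/x86/entry/common.c earlier in this patch, which delivers current->forced_info on the way back to user space. Roughly, the deferral on the sending side amounts to the sketch below; this is a simplification, and the real change lives in kernel/signal.c outside this excerpt:

    #include <linux/sched.h>
    #include <linux/signal.h>
    #include <linux/preempt.h>

    /* Sketch only: what force_sig_info() does in atomic context when
     * ARCH_RT_DELAYS_SIGNAL_SEND is defined. */
    static int demo_delay_signal(struct siginfo *info)
    {
            if (in_atomic()) {
                    current->forced_info = *info;                    /* stash it on the task       */
                    set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); /* make the exit path run     */
                    return 0;                                        /* delivered in entry/common.c */
            }
            return 1;       /* not deferred: normal (possibly sleeping) delivery */
    }
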
2648 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2649 index 58505f01962f..02fa39652cd6 100644
2650 --- a/arch/x86/include/asm/stackprotector.h
2651 +++ b/arch/x86/include/asm/stackprotector.h
2652 @@ -59,7 +59,7 @@
2653   */
2654  static __always_inline void boot_init_stack_canary(void)
2655  {
2656 -       u64 canary;
2657 +       u64 uninitialized_var(canary);
2658         u64 tsc;
2659  
2660  #ifdef CONFIG_X86_64
2661 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2662          * of randomness. The TSC only matters for very early init,
2663          * there it already has some randomness on most systems. Later
2664          * on during the bootup the random pool has true entropy too.
2665 +        *
2666 +        * For preempt-rt we need to weaken the randomness a bit, as
2667 +        * we can't call into the random generator from atomic context
2668 +        * due to locking constraints. We just leave canary
2669 +        * uninitialized and use the TSC based randomness on top of it.
2670          */
2671 +#ifndef CONFIG_PREEMPT_RT_FULL
2672         get_random_bytes(&canary, sizeof(canary));
2673 +#endif
2674         tsc = rdtsc();
2675         canary += tsc + (tsc << 32UL);
2676  
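
A side effect of compiling out get_random_bytes() shows up in the declaration hunk above: with the call gone, gcc would warn that canary may be used uninitialized in the "canary += tsc" line, so the declaration is wrapped in uninitialized_var(). A short note, assuming the macro's definition in linux/compiler-gcc.h at this kernel vintage:

    /* uninitialized_var(x) expands to "x = x", which silences the
     * may-be-used-uninitialized warning while keeping the intentional
     * "start from whatever happens to be on the stack" behaviour on RT. */
    u64 uninitialized_var(canary);          /* roughly: u64 canary = canary */
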
2677 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2678 index c7b551028740..ddb63bd90e3c 100644
2679 --- a/arch/x86/include/asm/thread_info.h
2680 +++ b/arch/x86/include/asm/thread_info.h
2681 @@ -58,6 +58,8 @@ struct thread_info {
2682         __u32                   status;         /* thread synchronous flags */
2683         __u32                   cpu;            /* current CPU */
2684         mm_segment_t            addr_limit;
2685 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2686 +                                                         <0 => BUG */
2687         unsigned int            sig_on_uaccess_error:1;
2688         unsigned int            uaccess_err:1;  /* uaccess failed */
2689  };
2690 @@ -95,6 +97,7 @@ struct thread_info {
2691  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2692  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2693  #define TIF_SECCOMP            8       /* secure computing */
2694 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2695  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2696  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2697  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2698 @@ -119,6 +122,7 @@ struct thread_info {
2699  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2700  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2701  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2702 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2703  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2704  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2705  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2706 @@ -152,6 +156,8 @@ struct thread_info {
2707  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2708  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2709  
2710 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2711 +
2712  #define STACK_WARN             (THREAD_SIZE/8)
2713  
2714  /*
2715 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2716 index fc808b83fccb..ebb40118abf5 100644
2717 --- a/arch/x86/include/asm/uv/uv_bau.h
2718 +++ b/arch/x86/include/asm/uv/uv_bau.h
2719 @@ -615,9 +615,9 @@ struct bau_control {
2720         cycles_t                send_message;
2721         cycles_t                period_end;
2722         cycles_t                period_time;
2723 -       spinlock_t              uvhub_lock;
2724 -       spinlock_t              queue_lock;
2725 -       spinlock_t              disable_lock;
2726 +       raw_spinlock_t          uvhub_lock;
2727 +       raw_spinlock_t          queue_lock;
2728 +       raw_spinlock_t          disable_lock;
2729         /* tunables */
2730         int                     max_concurr;
2731         int                     max_concurr_const;
2732 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2733   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2734   * on equal.
2735   */
2736 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2737 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2738  {
2739 -       spin_lock(lock);
2740 +       raw_spin_lock(lock);
2741         if (atomic_read(v) >= u) {
2742 -               spin_unlock(lock);
2743 +               raw_spin_unlock(lock);
2744                 return 0;
2745         }
2746         atomic_inc(v);
2747 -       spin_unlock(lock);
2748 +       raw_spin_unlock(lock);
2749         return 1;
2750  }
2751  
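The lock conversions in this file follow the usual RT rule: spinlock_t becomes a sleeping lock under PREEMPT_RT_FULL, so anything taken from hard-IRQ or otherwise non-sleeping context (the BAU flush path here) must be a raw_spinlock_t. The pattern in generic form, with illustrative names:

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(example_lock);       /* illustrative, not from the patch */
        static int example_count;

        /* Safe from contexts that must not sleep, even on RT, because a
         * raw_spinlock_t never turns into a sleeping rtmutex. */
        static void example_bump(void)
        {
                raw_spin_lock(&example_lock);
                example_count++;                        /* keep the critical section short */
                raw_spin_unlock(&example_lock);
        }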
2752 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
2753 index ea7074784cc4..01ec643ce66e 100644
2754 --- a/arch/x86/include/asm/uv/uv_hub.h
2755 +++ b/arch/x86/include/asm/uv/uv_hub.h
2756 @@ -492,7 +492,7 @@ struct uv_blade_info {
2757         unsigned short  nr_online_cpus;
2758         unsigned short  pnode;
2759         short           memory_nid;
2760 -       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
2761 +       raw_spinlock_t  nmi_lock;       /* obsolete, see uv_hub_nmi */
2762         unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
2763  };
2764  extern struct uv_blade_info *uv_blade_info;
2765 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2766 index fdb0fbfb1197..678c711e2a16 100644
2767 --- a/arch/x86/kernel/apic/io_apic.c
2768 +++ b/arch/x86/kernel/apic/io_apic.c
2769 @@ -1711,7 +1711,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2770  static inline bool ioapic_irqd_mask(struct irq_data *data)
2771  {
2772         /* If we are moving the irq we need to mask it */
2773 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2774 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2775 +                    !irqd_irq_inprogress(data))) {
2776                 mask_ioapic_irq(data);
2777                 return true;
2778         }
2779 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
2780 index 4a139465f1d4..ad2afff02b36 100644
2781 --- a/arch/x86/kernel/apic/x2apic_uv_x.c
2782 +++ b/arch/x86/kernel/apic/x2apic_uv_x.c
2783 @@ -947,7 +947,7 @@ void __init uv_system_init(void)
2784                         uv_blade_info[blade].pnode = pnode;
2785                         uv_blade_info[blade].nr_possible_cpus = 0;
2786                         uv_blade_info[blade].nr_online_cpus = 0;
2787 -                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
2788 +                       raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
2789                         min_pnode = min(pnode, min_pnode);
2790                         max_pnode = max(pnode, max_pnode);
2791                         blade++;
2792 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2793 index 439df975bc7a..b7954ddd6a0a 100644
2794 --- a/arch/x86/kernel/asm-offsets.c
2795 +++ b/arch/x86/kernel/asm-offsets.c
2796 @@ -32,6 +32,7 @@ void common(void) {
2797         OFFSET(TI_flags, thread_info, flags);
2798         OFFSET(TI_status, thread_info, status);
2799         OFFSET(TI_addr_limit, thread_info, addr_limit);
2800 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2801  
2802         BLANK();
2803         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2804 @@ -89,4 +90,5 @@ void common(void) {
2805  
2806         BLANK();
2807         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2808 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2809  }
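TIF_NEED_RESCHED_LAZY, preempt_lazy_count and the two asm-offsets entries above exist so that both the assembly and the C exit paths can treat a "lazy" reschedule request together with an ordinary one. On the C side the combined mask is meant to be tested as sketched below; the surrounding function is an assumption modelled on the exit-to-user work loop, not copied from this patch:

        /* Sketch: both flavours of NEED_RESCHED funnel into one schedule() call. */
        static void example_handle_resched(u32 cached_flags)
        {
                if (cached_flags & _TIF_NEED_RESCHED_MASK)      /* regular or lazy */
                        schedule();
        }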
2810 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2811 index 7e8a736d09db..430a4ec07811 100644
2812 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2813 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2814 @@ -41,6 +41,8 @@
2815  #include <linux/debugfs.h>
2816  #include <linux/irq_work.h>
2817  #include <linux/export.h>
2818 +#include <linux/jiffies.h>
2819 +#include <linux/swork.h>
2820  
2821  #include <asm/processor.h>
2822  #include <asm/traps.h>
2823 @@ -1236,7 +1238,7 @@ void mce_log_therm_throt_event(__u64 status)
2824  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2825  
2826  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2827 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2828 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2829  
2830  static unsigned long mce_adjust_timer_default(unsigned long interval)
2831  {
2832 @@ -1245,32 +1247,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2833  
2834  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2835  
2836 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2837 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2838  {
2839 -       unsigned long when = jiffies + interval;
2840 -       unsigned long flags;
2841 -
2842 -       local_irq_save(flags);
2843 -
2844 -       if (timer_pending(t)) {
2845 -               if (time_before(when, t->expires))
2846 -                       mod_timer_pinned(t, when);
2847 -       } else {
2848 -               t->expires = round_jiffies(when);
2849 -               add_timer_on(t, smp_processor_id());
2850 -       }
2851 -
2852 -       local_irq_restore(flags);
2853 +       if (!interval)
2854 +               return HRTIMER_NORESTART;
2855 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2856 +       return HRTIMER_RESTART;
2857  }
2858  
2859 -static void mce_timer_fn(unsigned long data)
2860 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2861  {
2862 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2863 -       int cpu = smp_processor_id();
2864         unsigned long iv;
2865  
2866 -       WARN_ON(cpu != data);
2867 -
2868         iv = __this_cpu_read(mce_next_interval);
2869  
2870         if (mce_available(this_cpu_ptr(&cpu_info))) {
2871 @@ -1293,7 +1281,7 @@ static void mce_timer_fn(unsigned long data)
2872  
2873  done:
2874         __this_cpu_write(mce_next_interval, iv);
2875 -       __restart_timer(t, iv);
2876 +       return __restart_timer(timer, iv);
2877  }
2878  
2879  /*
2880 @@ -1301,7 +1289,7 @@ done:
2881   */
2882  void mce_timer_kick(unsigned long interval)
2883  {
2884 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2885 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2886         unsigned long iv = __this_cpu_read(mce_next_interval);
2887  
2888         __restart_timer(t, interval);
2889 @@ -1316,7 +1304,7 @@ static void mce_timer_delete_all(void)
2890         int cpu;
2891  
2892         for_each_online_cpu(cpu)
2893 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2894 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2895  }
2896  
2897  static void mce_do_trigger(struct work_struct *work)
2898 @@ -1326,6 +1314,56 @@ static void mce_do_trigger(struct work_struct *work)
2899  
2900  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2901  
2902 +static void __mce_notify_work(struct swork_event *event)
2903 +{
2904 +       /* Not more than two messages every minute */
2905 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2906 +
2907 +       /* wake processes polling /dev/mcelog */
2908 +       wake_up_interruptible(&mce_chrdev_wait);
2909 +
2910 +       /*
2911 +        * There is no risk of missing notifications because
2912 +        * work_pending is always cleared before the function is
2913 +        * executed.
2914 +        */
2915 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2916 +               schedule_work(&mce_trigger_work);
2917 +
2918 +       if (__ratelimit(&ratelimit))
2919 +               pr_info(HW_ERR "Machine check events logged\n");
2920 +}
2921 +
2922 +#ifdef CONFIG_PREEMPT_RT_FULL
2923 +static bool notify_work_ready __read_mostly;
2924 +static struct swork_event notify_work;
2925 +
2926 +static int mce_notify_work_init(void)
2927 +{
2928 +       int err;
2929 +
2930 +       err = swork_get();
2931 +       if (err)
2932 +               return err;
2933 +
2934 +       INIT_SWORK(&notify_work, __mce_notify_work);
2935 +       notify_work_ready = true;
2936 +       return 0;
2937 +}
2938 +
2939 +static void mce_notify_work(void)
2940 +{
2941 +       if (notify_work_ready)
2942 +               swork_queue(&notify_work);
2943 +}
2944 +#else
2945 +static void mce_notify_work(void)
2946 +{
2947 +       __mce_notify_work(NULL);
2948 +}
2949 +static inline int mce_notify_work_init(void) { return 0; }
2950 +#endif
2951 +
2952  /*
2953   * Notify the user(s) about new machine check events.
2954   * Can be called from interrupt context, but not from machine check/NMI
2955 @@ -1333,19 +1371,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2956   */
2957  int mce_notify_irq(void)
2958  {
2959 -       /* Not more than two messages every minute */
2960 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2961 -
2962         if (test_and_clear_bit(0, &mce_need_notify)) {
2963 -               /* wake processes polling /dev/mcelog */
2964 -               wake_up_interruptible(&mce_chrdev_wait);
2965 -
2966 -               if (mce_helper[0])
2967 -                       schedule_work(&mce_trigger_work);
2968 -
2969 -               if (__ratelimit(&ratelimit))
2970 -                       pr_info(HW_ERR "Machine check events logged\n");
2971 -
2972 +               mce_notify_work();
2973                 return 1;
2974         }
2975         return 0;
2976 @@ -1639,7 +1666,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2977         }
2978  }
2979  
2980 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2981 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2982  {
2983         unsigned long iv = check_interval * HZ;
2984  
2985 @@ -1648,16 +1675,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2986  
2987         per_cpu(mce_next_interval, cpu) = iv;
2988  
2989 -       t->expires = round_jiffies(jiffies + iv);
2990 -       add_timer_on(t, cpu);
2991 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2992 +                       0, HRTIMER_MODE_REL_PINNED);
2993  }
2994  
2995  static void __mcheck_cpu_init_timer(void)
2996  {
2997 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2998 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2999         unsigned int cpu = smp_processor_id();
3000  
3001 -       setup_timer(t, mce_timer_fn, cpu);
3002 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3003 +       t->function = mce_timer_fn;
3004         mce_start_timer(cpu, t);
3005  }
3006  
3007 @@ -2376,6 +2404,8 @@ static void mce_disable_cpu(void *h)
3008         if (!mce_available(raw_cpu_ptr(&cpu_info)))
3009                 return;
3010  
3011 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
3012 +
3013         if (!(action & CPU_TASKS_FROZEN))
3014                 cmci_clear();
3015  
3016 @@ -2398,6 +2428,7 @@ static void mce_reenable_cpu(void *h)
3017                 if (b->init)
3018                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
3019         }
3020 +       __mcheck_cpu_init_timer();
3021  }
3022  
3023  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
3024 @@ -2405,7 +2436,6 @@ static int
3025  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3026  {
3027         unsigned int cpu = (unsigned long)hcpu;
3028 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
3029  
3030         switch (action & ~CPU_TASKS_FROZEN) {
3031         case CPU_ONLINE:
3032 @@ -2425,11 +2455,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3033                 break;
3034         case CPU_DOWN_PREPARE:
3035                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
3036 -               del_timer_sync(t);
3037                 break;
3038         case CPU_DOWN_FAILED:
3039                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
3040 -               mce_start_timer(cpu, t);
3041                 break;
3042         }
3043  
3044 @@ -2468,6 +2496,10 @@ static __init int mcheck_init_device(void)
3045                 goto err_out;
3046         }
3047  
3048 +       err = mce_notify_work_init();
3049 +       if (err)
3050 +               goto err_out;
3051 +
3052         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
3053                 err = -ENOMEM;
3054                 goto err_out;
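The MCE poll timer is converted from a timer_list to a self-rearming hrtimer. Independent of the motivation, the resulting pattern -- a callback that pushes its own expiry forward and returns HRTIMER_RESTART -- is a common one; a minimal sketch with illustrative names and a made-up 500 ms period:

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>

        static struct hrtimer example_timer;            /* illustrative, not the mce_timer */

        static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
        {
                /* ... do the periodic check here ... */
                hrtimer_forward_now(t, ms_to_ktime(500));       /* push the expiry forward */
                return HRTIMER_RESTART;                         /* keep firing */
        }

        static void example_timer_start(void)
        {
                hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                example_timer.function = example_timer_fn;
                hrtimer_start(&example_timer, ms_to_ktime(500), HRTIMER_MODE_REL);
        }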
3055 diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3056 index ed446bdcbf31..d2ac364e2118 100644
3057 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3058 +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3059 @@ -117,7 +117,7 @@ static struct perf_pmu_events_attr event_attr_##v = {                       \
3060  };
3061  
3062  struct rapl_pmu {
3063 -       spinlock_t       lock;
3064 +       raw_spinlock_t   lock;
3065         int              n_active; /* number of active events */
3066         struct list_head active_list;
3067         struct pmu       *pmu; /* pointer to rapl_pmu_class */
3068 @@ -220,13 +220,13 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
3069         if (!pmu->n_active)
3070                 return HRTIMER_NORESTART;
3071  
3072 -       spin_lock_irqsave(&pmu->lock, flags);
3073 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3074  
3075         list_for_each_entry(event, &pmu->active_list, active_entry) {
3076                 rapl_event_update(event);
3077         }
3078  
3079 -       spin_unlock_irqrestore(&pmu->lock, flags);
3080 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3081  
3082         hrtimer_forward_now(hrtimer, pmu->timer_interval);
3083  
3084 @@ -263,9 +263,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
3085         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
3086         unsigned long flags;
3087  
3088 -       spin_lock_irqsave(&pmu->lock, flags);
3089 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3090         __rapl_pmu_event_start(pmu, event);
3091 -       spin_unlock_irqrestore(&pmu->lock, flags);
3092 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3093  }
3094  
3095  static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3096 @@ -274,7 +274,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3097         struct hw_perf_event *hwc = &event->hw;
3098         unsigned long flags;
3099  
3100 -       spin_lock_irqsave(&pmu->lock, flags);
3101 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3102  
3103         /* mark event as deactivated and stopped */
3104         if (!(hwc->state & PERF_HES_STOPPED)) {
3105 @@ -299,7 +299,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3106                 hwc->state |= PERF_HES_UPTODATE;
3107         }
3108  
3109 -       spin_unlock_irqrestore(&pmu->lock, flags);
3110 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3111  }
3112  
3113  static int rapl_pmu_event_add(struct perf_event *event, int mode)
3114 @@ -308,14 +308,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode)
3115         struct hw_perf_event *hwc = &event->hw;
3116         unsigned long flags;
3117  
3118 -       spin_lock_irqsave(&pmu->lock, flags);
3119 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3120  
3121         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
3122  
3123         if (mode & PERF_EF_START)
3124                 __rapl_pmu_event_start(pmu, event);
3125  
3126 -       spin_unlock_irqrestore(&pmu->lock, flags);
3127 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3128  
3129         return 0;
3130  }
3131 @@ -603,7 +603,7 @@ static int rapl_cpu_prepare(int cpu)
3132         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
3133         if (!pmu)
3134                 return -1;
3135 -       spin_lock_init(&pmu->lock);
3136 +       raw_spin_lock_init(&pmu->lock);
3137  
3138         INIT_LIST_HEAD(&pmu->active_list);
3139  
3140 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
3141 index 464ffd69b92e..00db1aad1548 100644
3142 --- a/arch/x86/kernel/dumpstack_32.c
3143 +++ b/arch/x86/kernel/dumpstack_32.c
3144 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3145                 unsigned long *stack, unsigned long bp,
3146                 const struct stacktrace_ops *ops, void *data)
3147  {
3148 -       const unsigned cpu = get_cpu();
3149 +       const unsigned cpu = get_cpu_light();
3150         int graph = 0;
3151         u32 *prev_esp;
3152  
3153 @@ -86,7 +86,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3154                         break;
3155                 touch_nmi_watchdog();
3156         }
3157 -       put_cpu();
3158 +       put_cpu_light();
3159  }
3160  EXPORT_SYMBOL(dump_trace);
3161  
3162 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
3163 index 5f1c6266eb30..c331e3fef465 100644
3164 --- a/arch/x86/kernel/dumpstack_64.c
3165 +++ b/arch/x86/kernel/dumpstack_64.c
3166 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3167                 unsigned long *stack, unsigned long bp,
3168                 const struct stacktrace_ops *ops, void *data)
3169  {
3170 -       const unsigned cpu = get_cpu();
3171 +       const unsigned cpu = get_cpu_light();
3172         struct thread_info *tinfo;
3173         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
3174         unsigned long dummy;
3175 @@ -241,7 +241,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3176          * This handles the process stack:
3177          */
3178         bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
3179 -       put_cpu();
3180 +       put_cpu_light();
3181  }
3182  EXPORT_SYMBOL(dump_trace);
3183  
3184 @@ -255,7 +255,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3185         int cpu;
3186         int i;
3187  
3188 -       preempt_disable();
3189 +       migrate_disable();
3190         cpu = smp_processor_id();
3191  
3192         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
3193 @@ -291,7 +291,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3194                         pr_cont(" %016lx", *stack++);
3195                 touch_nmi_watchdog();
3196         }
3197 -       preempt_enable();
3198 +       migrate_enable();
3199  
3200         pr_cont("\n");
3201         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
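get_cpu_light()/put_cpu_light() and migrate_disable()/migrate_enable() are helpers introduced elsewhere in this series: they pin the task to its current CPU without disabling preemption, which keeps the potentially long stack walk preemptible on RT. The usage pattern, sketched under that assumption with illustrative names:

        /* Sketch: stay on one CPU for per-CPU data, stay preemptible on RT. */
        static void example_dump_percpu_state(void)
        {
                int cpu = get_cpu_light();      /* blocks migration, not preemption, on RT */

                example_walk_stacks(cpu);       /* hypothetical helper doing the real work */
                put_cpu_light();
        }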
3202 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
3203 index 38da8f29a9c8..ce71f7098f15 100644
3204 --- a/arch/x86/kernel/irq_32.c
3205 +++ b/arch/x86/kernel/irq_32.c
3206 @@ -128,6 +128,7 @@ void irq_ctx_init(int cpu)
3207                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3208  }
3209  
3210 +#ifndef CONFIG_PREEMPT_RT_FULL
3211  void do_softirq_own_stack(void)
3212  {
3213         struct thread_info *curstk;
3214 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
3215  
3216         call_on_stack(__do_softirq, isp);
3217  }
3218 +#endif
3219  
3220  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3221  {
3222 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
3223 index 47190bd399e7..807950860fb7 100644
3224 --- a/arch/x86/kernel/kvm.c
3225 +++ b/arch/x86/kernel/kvm.c
3226 @@ -36,6 +36,7 @@
3227  #include <linux/kprobes.h>
3228  #include <linux/debugfs.h>
3229  #include <linux/nmi.h>
3230 +#include <linux/swait.h>
3231  #include <asm/timer.h>
3232  #include <asm/cpu.h>
3233  #include <asm/traps.h>
3234 @@ -91,14 +92,14 @@ static void kvm_io_delay(void)
3235  
3236  struct kvm_task_sleep_node {
3237         struct hlist_node link;
3238 -       wait_queue_head_t wq;
3239 +       struct swait_queue_head wq;
3240         u32 token;
3241         int cpu;
3242         bool halted;
3243  };
3244  
3245  static struct kvm_task_sleep_head {
3246 -       spinlock_t lock;
3247 +       raw_spinlock_t lock;
3248         struct hlist_head list;
3249  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
3250  
3251 @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
3252         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
3253         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
3254         struct kvm_task_sleep_node n, *e;
3255 -       DEFINE_WAIT(wait);
3256 +       DECLARE_SWAITQUEUE(wait);
3257  
3258         rcu_irq_enter();
3259  
3260 -       spin_lock(&b->lock);
3261 +       raw_spin_lock(&b->lock);
3262         e = _find_apf_task(b, token);
3263         if (e) {
3264                 /* dummy entry exist -> wake up was delivered ahead of PF */
3265                 hlist_del(&e->link);
3266                 kfree(e);
3267 -               spin_unlock(&b->lock);
3268 +               raw_spin_unlock(&b->lock);
3269  
3270                 rcu_irq_exit();
3271                 return;
3272 @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
3273         n.token = token;
3274         n.cpu = smp_processor_id();
3275         n.halted = is_idle_task(current) || preempt_count() > 1;
3276 -       init_waitqueue_head(&n.wq);
3277 +       init_swait_queue_head(&n.wq);
3278         hlist_add_head(&n.link, &b->list);
3279 -       spin_unlock(&b->lock);
3280 +       raw_spin_unlock(&b->lock);
3281  
3282         for (;;) {
3283                 if (!n.halted)
3284 -                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3285 +                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3286                 if (hlist_unhashed(&n.link))
3287                         break;
3288  
3289 @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
3290                 }
3291         }
3292         if (!n.halted)
3293 -               finish_wait(&n.wq, &wait);
3294 +               finish_swait(&n.wq, &wait);
3295  
3296         rcu_irq_exit();
3297         return;
3298 @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
3299         hlist_del_init(&n->link);
3300         if (n->halted)
3301                 smp_send_reschedule(n->cpu);
3302 -       else if (waitqueue_active(&n->wq))
3303 -               wake_up(&n->wq);
3304 +       else if (swait_active(&n->wq))
3305 +               swake_up(&n->wq);
3306  }
3307  
3308  static void apf_task_wake_all(void)
3309 @@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
3310         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
3311                 struct hlist_node *p, *next;
3312                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
3313 -               spin_lock(&b->lock);
3314 +               raw_spin_lock(&b->lock);
3315                 hlist_for_each_safe(p, next, &b->list) {
3316                         struct kvm_task_sleep_node *n =
3317                                 hlist_entry(p, typeof(*n), link);
3318                         if (n->cpu == smp_processor_id())
3319                                 apf_task_wake_one(n);
3320                 }
3321 -               spin_unlock(&b->lock);
3322 +               raw_spin_unlock(&b->lock);
3323         }
3324  }
3325  
3326 @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
3327         }
3328  
3329  again:
3330 -       spin_lock(&b->lock);
3331 +       raw_spin_lock(&b->lock);
3332         n = _find_apf_task(b, token);
3333         if (!n) {
3334                 /*
3335 @@ -225,17 +226,17 @@ again:
3336                          * Allocation failed! Busy wait while other cpu
3337                          * handles async PF.
3338                          */
3339 -                       spin_unlock(&b->lock);
3340 +                       raw_spin_unlock(&b->lock);
3341                         cpu_relax();
3342                         goto again;
3343                 }
3344                 n->token = token;
3345                 n->cpu = smp_processor_id();
3346 -               init_waitqueue_head(&n->wq);
3347 +               init_swait_queue_head(&n->wq);
3348                 hlist_add_head(&n->link, &b->list);
3349         } else
3350                 apf_task_wake_one(n);
3351 -       spin_unlock(&b->lock);
3352 +       raw_spin_unlock(&b->lock);
3353         return;
3354  }
3355  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
3356 @@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
3357         paravirt_ops_setup();
3358         register_reboot_notifier(&kvm_pv_reboot_nb);
3359         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
3360 -               spin_lock_init(&async_pf_sleepers[i].lock);
3361 +               raw_spin_lock_init(&async_pf_sleepers[i].lock);
3362         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
3363                 x86_init.irqs.trap_init = kvm_apf_trap_init;
3364  
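The async page-fault sleepers move from the full waitqueue API to simple wait queues (swait), whose queue lock is a raw spinlock, alongside raw locks for the hash buckets; both can therefore be taken from the contexts KVM uses here even on RT. A minimal sketch of the swait pattern with illustrative names:

        #include <linux/swait.h>

        static DECLARE_SWAIT_QUEUE_HEAD(example_wq);    /* illustrative queue */
        static bool example_done;

        static void example_wait(void)
        {
                swait_event(example_wq, example_done);  /* sleep until the condition holds */
        }

        static void example_complete(void)
        {
                example_done = true;
                swake_up(&example_wq);                  /* wake one waiter */
        }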
3365 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
3366 index 697f90db0e37..424aec4a4c71 100644
3367 --- a/arch/x86/kernel/nmi.c
3368 +++ b/arch/x86/kernel/nmi.c
3369 @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
3370  #endif
3371  
3372         if (panic_on_unrecovered_nmi)
3373 -               panic("NMI: Not continuing");
3374 +               nmi_panic(regs, "NMI: Not continuing");
3375  
3376         pr_emerg("Dazed and confused, but trying to continue\n");
3377  
3378 @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
3379                  reason, smp_processor_id());
3380         show_regs(regs);
3381  
3382 -       if (panic_on_io_nmi)
3383 -               panic("NMI IOCK error: Not continuing");
3384 +       if (panic_on_io_nmi) {
3385 +               nmi_panic(regs, "NMI IOCK error: Not continuing");
3386 +
3387 +               /*
3388 +                * If we end up here, it means we have received an NMI while
3389 +                * processing panic(). Simply return without delaying and
3390 +                * re-enabling NMIs.
3391 +                */
3392 +               return;
3393 +       }
3394  
3395         /* Re-enable the IOCK line, wait for a few seconds */
3396         reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
3397 @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
3398  
3399         pr_emerg("Do you have a strange power saving mode enabled?\n");
3400         if (unknown_nmi_panic || panic_on_unrecovered_nmi)
3401 -               panic("NMI: Not continuing");
3402 +               nmi_panic(regs, "NMI: Not continuing");
3403  
3404         pr_emerg("Dazed and confused, but trying to continue\n");
3405  }
3406 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
3407 index 9f950917528b..4dd4beae917a 100644
3408 --- a/arch/x86/kernel/process_32.c
3409 +++ b/arch/x86/kernel/process_32.c
3410 @@ -35,6 +35,7 @@
3411  #include <linux/uaccess.h>
3412  #include <linux/io.h>
3413  #include <linux/kdebug.h>
3414 +#include <linux/highmem.h>
3415  
3416  #include <asm/pgtable.h>
3417  #include <asm/ldt.h>
3418 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
3419  }
3420  EXPORT_SYMBOL_GPL(start_thread);
3421  
3422 +#ifdef CONFIG_PREEMPT_RT_FULL
3423 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3424 +{
3425 +       int i;
3426 +
3427 +       /*
3428 +        * Clear @prev_p's kmap_atomic mappings
3429 +        */
3430 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3431 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3432 +               pte_t *ptep = kmap_pte - idx;
3433 +
3434 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3435 +       }
3436 +       /*
3437 +        * Restore @next_p's kmap_atomic mappings
3438 +        */
3439 +       for (i = 0; i < next_p->kmap_idx; i++) {
3440 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3441 +
3442 +               if (!pte_none(next_p->kmap_pte[i]))
3443 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3444 +       }
3445 +}
3446 +#else
3447 +static inline void
3448 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3449 +#endif
3450 +
3451  
3452  /*
3453   *     switch_to(x,y) should switch tasks from x to y.
3454 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
3455                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3456                 __switch_to_xtra(prev_p, next_p, tss);
3457  
3458 +       switch_kmaps(prev_p, next_p);
3459 +
3460         /*
3461          * Leave lazy mode, flushing any hypercalls made here.
3462          * This must be done before restoring TLS segments so
3463 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
3464 index f660d63f40fe..8384207adde2 100644
3465 --- a/arch/x86/kernel/reboot.c
3466 +++ b/arch/x86/kernel/reboot.c
3467 @@ -726,6 +726,7 @@ static int crashing_cpu;
3468  static nmi_shootdown_cb shootdown_callback;
3469  
3470  static atomic_t waiting_for_crash_ipi;
3471 +static int crash_ipi_issued;
3472  
3473  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
3474  {
3475 @@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3476  
3477         smp_send_nmi_allbutself();
3478  
3479 +       /* Kick CPUs looping in NMI context. */
3480 +       WRITE_ONCE(crash_ipi_issued, 1);
3481 +
3482         msecs = 1000; /* Wait at most a second for the other cpus to stop */
3483         while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
3484                 mdelay(1);
3485 @@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3486  
3487         /* Leave the nmi callback set */
3488  }
3489 +
3490 +/* Override the weak function in kernel/panic.c */
3491 +void nmi_panic_self_stop(struct pt_regs *regs)
3492 +{
3493 +       while (1) {
3494 +               /*
3495 +                * Wait for the crash dumping IPI to be issued, and then
3496 +                * call its callback directly.
3497 +                */
3498 +               if (READ_ONCE(crash_ipi_issued))
3499 +                       crash_nmi_callback(0, regs); /* Don't return */
3500 +
3501 +               cpu_relax();
3502 +       }
3503 +}
3504 +
3505  #else /* !CONFIG_SMP */
3506  void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3507  {
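crash_ipi_issued lets a CPU that takes an NMI while panic() is already running wait until the shootdown has been requested and then invoke the crash callback itself. Stripped of the crash specifics, the publish/spin pattern looks like this (names are illustrative):

        static int example_flag;                        /* plays the role of crash_ipi_issued */

        static void example_publish(void)
        {
                WRITE_ONCE(example_flag, 1);            /* single, non-torn store */
        }

        static void example_spin_until_published(void)
        {
                while (!READ_ONCE(example_flag))        /* forces a fresh read per loop */
                        cpu_relax();
        }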
3508 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
3509 index 4d30b865be30..20d9e9fb3b74 100644
3510 --- a/arch/x86/kvm/lapic.c
3511 +++ b/arch/x86/kvm/lapic.c
3512 @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
3513  static void apic_timer_expired(struct kvm_lapic *apic)
3514  {
3515         struct kvm_vcpu *vcpu = apic->vcpu;
3516 -       wait_queue_head_t *q = &vcpu->wq;
3517 +       struct swait_queue_head *q = &vcpu->wq;
3518         struct kvm_timer *ktimer = &apic->lapic_timer;
3519  
3520         if (atomic_read(&apic->lapic_timer.pending))
3521 @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
3522         atomic_inc(&apic->lapic_timer.pending);
3523         kvm_set_pending_timer(vcpu);
3524  
3525 -       if (waitqueue_active(q))
3526 -               wake_up_interruptible(q);
3527 +       if (swait_active(q))
3528 +               swake_up(q);
3529  
3530         if (apic_lvtt_tscdeadline(apic))
3531                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
3532 @@ -1801,6 +1801,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
3533         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3534                      HRTIMER_MODE_ABS);
3535         apic->lapic_timer.timer.function = apic_timer_fn;
3536 +       apic->lapic_timer.timer.irqsafe = 1;
3537  
3538         /*
3539          * APIC is created enabled. This will prevent kvm_lapic_set_base from
3540 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3541 index d7cb9577fa31..77c1bdd802df 100644
3542 --- a/arch/x86/kvm/x86.c
3543 +++ b/arch/x86/kvm/x86.c
3544 @@ -5792,6 +5792,13 @@ int kvm_arch_init(void *opaque)
3545                 goto out;
3546         }
3547  
3548 +#ifdef CONFIG_PREEMPT_RT_FULL
3549 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3550 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3551 +               return -EOPNOTSUPP;
3552 +       }
3553 +#endif
3554 +
3555         r = kvm_mmu_module_init();
3556         if (r)
3557                 goto out_free_percpu;
3558 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
3559 index a6d739258137..bd24ba1c4a86 100644
3560 --- a/arch/x86/mm/highmem_32.c
3561 +++ b/arch/x86/mm/highmem_32.c
3562 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
3563   */
3564  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3565  {
3566 +       pte_t pte = mk_pte(page, prot);
3567         unsigned long vaddr;
3568         int idx, type;
3569  
3570 -       preempt_disable();
3571 +       preempt_disable_nort();
3572         pagefault_disable();
3573  
3574         if (!PageHighMem(page))
3575 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3576         idx = type + KM_TYPE_NR*smp_processor_id();
3577         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3578         BUG_ON(!pte_none(*(kmap_pte-idx)));
3579 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3580 +#ifdef CONFIG_PREEMPT_RT_FULL
3581 +       current->kmap_pte[type] = pte;
3582 +#endif
3583 +       set_pte(kmap_pte-idx, pte);
3584         arch_flush_lazy_mmu_mode();
3585  
3586         return (void *)vaddr;
3587 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
3588                  * is a bad idea also, in case the page changes cacheability
3589                  * attributes or becomes a protected page in a hypervisor.
3590                  */
3591 +#ifdef CONFIG_PREEMPT_RT_FULL
3592 +               current->kmap_pte[type] = __pte(0);
3593 +#endif
3594                 kpte_clear_flush(kmap_pte-idx, vaddr);
3595                 kmap_atomic_idx_pop();
3596                 arch_flush_lazy_mmu_mode();
3597 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
3598  #endif
3599  
3600         pagefault_enable();
3601 -       preempt_enable();
3602 +       preempt_enable_nort();
3603  }
3604  EXPORT_SYMBOL(__kunmap_atomic);
3605  
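Because a task may be preempted while it holds a kmap_atomic() mapping on RT, the hunks above shadow each pte in current->kmap_pte[] (a per-task array added elsewhere in this series) so that switch_kmaps() in process_32.c can replay the slots on the next context switch. The bookkeeping step, reduced to a sketch (the array is an assumption here):

        /* Sketch: record the mapping so a later context switch can restore it. */
        static void example_record_kmap(int type, int idx, pte_t pte)
        {
        #ifdef CONFIG_PREEMPT_RT_FULL
                current->kmap_pte[type] = pte;          /* per-task shadow copy */
        #endif
                set_pte(kmap_pte - idx, pte);           /* install the fixmap entry as usual */
        }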
3606 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
3607 index 9c0ff045fdd4..dd25dd1671b6 100644
3608 --- a/arch/x86/mm/iomap_32.c
3609 +++ b/arch/x86/mm/iomap_32.c
3610 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
3611  
3612  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3613  {
3614 +       pte_t pte = pfn_pte(pfn, prot);
3615         unsigned long vaddr;
3616         int idx, type;
3617  
3618 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3619         type = kmap_atomic_idx_push();
3620         idx = type + KM_TYPE_NR * smp_processor_id();
3621         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3622 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3623 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3624 +
3625 +#ifdef CONFIG_PREEMPT_RT_FULL
3626 +       current->kmap_pte[type] = pte;
3627 +#endif
3628 +       set_pte(kmap_pte - idx, pte);
3629         arch_flush_lazy_mmu_mode();
3630  
3631         return (void *)vaddr;
3632 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
3633                  * is a bad idea also, in case the page changes cacheability
3634                  * attributes or becomes a protected page in a hypervisor.
3635                  */
3636 +#ifdef CONFIG_PREEMPT_RT_FULL
3637 +               current->kmap_pte[type] = __pte(0);
3638 +#endif
3639                 kpte_clear_flush(kmap_pte-idx, vaddr);
3640                 kmap_atomic_idx_pop();
3641         }
3642 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3643 index 3b6ec42718e4..7871083de089 100644
3644 --- a/arch/x86/platform/uv/tlb_uv.c
3645 +++ b/arch/x86/platform/uv/tlb_uv.c
3646 @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
3647  
3648                 quiesce_local_uvhub(hmaster);
3649  
3650 -               spin_lock(&hmaster->queue_lock);
3651 +               raw_spin_lock(&hmaster->queue_lock);
3652                 reset_with_ipi(&bau_desc->distribution, bcp);
3653 -               spin_unlock(&hmaster->queue_lock);
3654 +               raw_spin_unlock(&hmaster->queue_lock);
3655  
3656                 end_uvhub_quiesce(hmaster);
3657  
3658 @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
3659  
3660                 quiesce_local_uvhub(hmaster);
3661  
3662 -               spin_lock(&hmaster->queue_lock);
3663 +               raw_spin_lock(&hmaster->queue_lock);
3664                 reset_with_ipi(&bau_desc->distribution, bcp);
3665 -               spin_unlock(&hmaster->queue_lock);
3666 +               raw_spin_unlock(&hmaster->queue_lock);
3667  
3668                 end_uvhub_quiesce(hmaster);
3669  
3670 @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3671         cycles_t tm1;
3672  
3673         hmaster = bcp->uvhub_master;
3674 -       spin_lock(&hmaster->disable_lock);
3675 +       raw_spin_lock(&hmaster->disable_lock);
3676         if (!bcp->baudisabled) {
3677                 stat->s_bau_disabled++;
3678                 tm1 = get_cycles();
3679 @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3680                         }
3681                 }
3682         }
3683 -       spin_unlock(&hmaster->disable_lock);
3684 +       raw_spin_unlock(&hmaster->disable_lock);
3685  }
3686  
3687  static void count_max_concurr(int stat, struct bau_control *bcp,
3688 @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
3689   */
3690  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3691  {
3692 -       spinlock_t *lock = &hmaster->uvhub_lock;
3693 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3694         atomic_t *v;
3695  
3696         v = &hmaster->active_descriptor_count;
3697 @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3698         struct bau_control *hmaster;
3699  
3700         hmaster = bcp->uvhub_master;
3701 -       spin_lock(&hmaster->disable_lock);
3702 +       raw_spin_lock(&hmaster->disable_lock);
3703         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3704                 stat->s_bau_reenabled++;
3705                 for_each_present_cpu(tcpu) {
3706 @@ -980,10 +980,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3707                                 tbcp->period_giveups = 0;
3708                         }
3709                 }
3710 -               spin_unlock(&hmaster->disable_lock);
3711 +               raw_spin_unlock(&hmaster->disable_lock);
3712                 return 0;
3713         }
3714 -       spin_unlock(&hmaster->disable_lock);
3715 +       raw_spin_unlock(&hmaster->disable_lock);
3716         return -1;
3717  }
3718  
3719 @@ -1901,9 +1901,9 @@ static void __init init_per_cpu_tunables(void)
3720                 bcp->cong_reps                  = congested_reps;
3721                 bcp->disabled_period =          sec_2_cycles(disabled_period);
3722                 bcp->giveup_limit =             giveup_limit;
3723 -               spin_lock_init(&bcp->queue_lock);
3724 -               spin_lock_init(&bcp->uvhub_lock);
3725 -               spin_lock_init(&bcp->disable_lock);
3726 +               raw_spin_lock_init(&bcp->queue_lock);
3727 +               raw_spin_lock_init(&bcp->uvhub_lock);
3728 +               raw_spin_lock_init(&bcp->disable_lock);
3729         }
3730  }
3731  
3732 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
3733 index 2b158a9fa1d7..5e0b122620cb 100644
3734 --- a/arch/x86/platform/uv/uv_time.c
3735 +++ b/arch/x86/platform/uv/uv_time.c
3736 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
3737  
3738  /* There is one of these allocated per node */
3739  struct uv_rtc_timer_head {
3740 -       spinlock_t      lock;
3741 +       raw_spinlock_t  lock;
3742         /* next cpu waiting for timer, local node relative: */
3743         int             next_cpu;
3744         /* number of cpus on this node: */
3745 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
3746                                 uv_rtc_deallocate_timers();
3747                                 return -ENOMEM;
3748                         }
3749 -                       spin_lock_init(&head->lock);
3750 +                       raw_spin_lock_init(&head->lock);
3751                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3752                         head->next_cpu = -1;
3753                         blade_info[bid] = head;
3754 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3755         unsigned long flags;
3756         int next_cpu;
3757  
3758 -       spin_lock_irqsave(&head->lock, flags);
3759 +       raw_spin_lock_irqsave(&head->lock, flags);
3760  
3761         next_cpu = head->next_cpu;
3762         *t = expires;
3763 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3764                 if (uv_setup_intr(cpu, expires)) {
3765                         *t = ULLONG_MAX;
3766                         uv_rtc_find_next_timer(head, pnode);
3767 -                       spin_unlock_irqrestore(&head->lock, flags);
3768 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3769                         return -ETIME;
3770                 }
3771         }
3772  
3773 -       spin_unlock_irqrestore(&head->lock, flags);
3774 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3775         return 0;
3776  }
3777  
3778 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3779         unsigned long flags;
3780         int rc = 0;
3781  
3782 -       spin_lock_irqsave(&head->lock, flags);
3783 +       raw_spin_lock_irqsave(&head->lock, flags);
3784  
3785         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3786                 rc = 1;
3787 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3788                         uv_rtc_find_next_timer(head, pnode);
3789         }
3790  
3791 -       spin_unlock_irqrestore(&head->lock, flags);
3792 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3793  
3794         return rc;
3795  }
3796 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
3797  static cycle_t uv_read_rtc(struct clocksource *cs)
3798  {
3799         unsigned long offset;
3800 +       cycle_t cycles;
3801  
3802 +       preempt_disable();
3803         if (uv_get_min_hub_revision_id() == 1)
3804                 offset = 0;
3805         else
3806                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3807  
3808 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3809 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3810 +       preempt_enable();
3811 +
3812 +       return cycles;
3813  }
3814  
3815  /*
3816 diff --git a/block/blk-core.c b/block/blk-core.c
3817 index 4fab5d610805..52d2fe2fec8f 100644
3818 --- a/block/blk-core.c
3819 +++ b/block/blk-core.c
3820 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3821  
3822         INIT_LIST_HEAD(&rq->queuelist);
3823         INIT_LIST_HEAD(&rq->timeout_list);
3824 +#ifdef CONFIG_PREEMPT_RT_FULL
3825 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3826 +#endif
3827         rq->cpu = -1;
3828         rq->q = q;
3829         rq->__sector = (sector_t) -1;
3830 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3831   **/
3832  void blk_start_queue(struct request_queue *q)
3833  {
3834 -       WARN_ON(!irqs_disabled());
3835 +       WARN_ON_NONRT(!irqs_disabled());
3836  
3837         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3838         __blk_run_queue(q);
3839 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
3840                 if (!gfpflags_allow_blocking(gfp))
3841                         return -EBUSY;
3842  
3843 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3844 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3845                                 !atomic_read(&q->mq_freeze_depth) ||
3846                                 blk_queue_dying(q));
3847                 if (blk_queue_dying(q))
3848 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3849         struct request_queue *q =
3850                 container_of(ref, struct request_queue, q_usage_counter);
3851  
3852 -       wake_up_all(&q->mq_freeze_wq);
3853 +       swake_up_all(&q->mq_freeze_wq);
3854  }
3855  
3856  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3857 @@ -741,7 +744,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3858         q->bypass_depth = 1;
3859         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3860  
3861 -       init_waitqueue_head(&q->mq_freeze_wq);
3862 +       init_swait_queue_head(&q->mq_freeze_wq);
3863  
3864         /*
3865          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3866 @@ -3200,7 +3203,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3867                 blk_run_queue_async(q);
3868         else
3869                 __blk_run_queue(q);
3870 -       spin_unlock(q->queue_lock);
3871 +       spin_unlock_irq(q->queue_lock);
3872  }
3873  
3874  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3875 @@ -3248,7 +3251,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3876  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3877  {
3878         struct request_queue *q;
3879 -       unsigned long flags;
3880         struct request *rq;
3881         LIST_HEAD(list);
3882         unsigned int depth;
3883 @@ -3268,11 +3270,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3884         q = NULL;
3885         depth = 0;
3886  
3887 -       /*
3888 -        * Save and disable interrupts here, to avoid doing it for every
3889 -        * queue lock we have to take.
3890 -        */
3891 -       local_irq_save(flags);
3892         while (!list_empty(&list)) {
3893                 rq = list_entry_rq(list.next);
3894                 list_del_init(&rq->queuelist);
3895 @@ -3285,7 +3282,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3896                                 queue_unplugged(q, depth, from_schedule);
3897                         q = rq->q;
3898                         depth = 0;
3899 -                       spin_lock(q->queue_lock);
3900 +                       spin_lock_irq(q->queue_lock);
3901                 }
3902  
3903                 /*
3904 @@ -3312,8 +3309,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3905          */
3906         if (q)
3907                 queue_unplugged(q, depth, from_schedule);
3908 -
3909 -       local_irq_restore(flags);
3910  }
3911  
3912  void blk_finish_plug(struct blk_plug *plug)
3913 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3914 index 381cb50a673c..dc8785233d94 100644
3915 --- a/block/blk-ioc.c
3916 +++ b/block/blk-ioc.c
3917 @@ -7,6 +7,7 @@
3918  #include <linux/bio.h>
3919  #include <linux/blkdev.h>
3920  #include <linux/slab.h>
3921 +#include <linux/delay.h>
3922  
3923  #include "blk.h"
3924  
3925 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3926                         spin_unlock(q->queue_lock);
3927                 } else {
3928                         spin_unlock_irqrestore(&ioc->lock, flags);
3929 -                       cpu_relax();
3930 +                       cpu_chill();
3931                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3932                 }
3933         }
3934 @@ -187,7 +188,7 @@ retry:
3935                         spin_unlock(icq->q->queue_lock);
3936                 } else {
3937                         spin_unlock_irqrestore(&ioc->lock, flags);
3938 -                       cpu_relax();
3939 +                       cpu_chill();
3940                         goto retry;
3941                 }
3942         }
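cpu_chill() comes from elsewhere in this series; on RT it sleeps briefly instead of spinning, so a task stuck in a trylock retry loop yields to the lock holder, which on RT may be a preempted task on the same CPU rather than code running elsewhere. The retry shape used above, sketched with illustrative names:

        #include <linux/delay.h>        /* cpu_chill() is declared here by this series */
        #include <linux/spinlock.h>

        /* Sketch: retry an inner trylock without burning the CPU on RT. */
        static void example_lock_both(spinlock_t *outer, spinlock_t *inner)
        {
        retry:
                spin_lock(outer);
                if (!spin_trylock(inner)) {
                        spin_unlock(outer);
                        cpu_chill();            /* short sleep on RT, busy-wait otherwise */
                        goto retry;
                }
                /* ... both locks held ... */
                spin_unlock(inner);
                spin_unlock(outer);
        }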
3943 diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
3944 index 0736729d6494..3e21e31d0d7e 100644
3945 --- a/block/blk-iopoll.c
3946 +++ b/block/blk-iopoll.c
3947 @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
3948         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
3949         __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3950         local_irq_restore(flags);
3951 +       preempt_check_resched_rt();
3952  }
3953  EXPORT_SYMBOL(blk_iopoll_sched);
3954  
3955 @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h)
3956                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3957  
3958         local_irq_enable();
3959 +       preempt_check_resched_rt();
3960  }
3961  
3962  /**
3963 @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
3964                                  this_cpu_ptr(&blk_cpu_iopoll));
3965                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3966                 local_irq_enable();
3967 +               preempt_check_resched_rt();
3968         }
3969  
3970         return NOTIFY_OK;
3971 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
3972 index bb3ed488f7b5..628c6c13c482 100644
3973 --- a/block/blk-mq-cpu.c
3974 +++ b/block/blk-mq-cpu.c
3975 @@ -16,7 +16,7 @@
3976  #include "blk-mq.h"
3977  
3978  static LIST_HEAD(blk_mq_cpu_notify_list);
3979 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
3980 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
3981  
3982  static int blk_mq_main_cpu_notify(struct notifier_block *self,
3983                                   unsigned long action, void *hcpu)
3984 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
3985         struct blk_mq_cpu_notifier *notify;
3986         int ret = NOTIFY_OK;
3987  
3988 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
3989 +       if (action != CPU_POST_DEAD)
3990 +               return NOTIFY_OK;
3991 +
3992 +       spin_lock(&blk_mq_cpu_notify_lock);
3993  
3994         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
3995                 ret = notify->notify(notify->data, action, cpu);
3996 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
3997                         break;
3998         }
3999  
4000 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4001 +       spin_unlock(&blk_mq_cpu_notify_lock);
4002         return ret;
4003  }
4004  
4005 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4006  {
4007         BUG_ON(!notifier->notify);
4008  
4009 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4010 +       spin_lock(&blk_mq_cpu_notify_lock);
4011         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
4012 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4013 +       spin_unlock(&blk_mq_cpu_notify_lock);
4014  }
4015  
4016  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4017  {
4018 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4019 +       spin_lock(&blk_mq_cpu_notify_lock);
4020         list_del(&notifier->list);
4021 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4022 +       spin_unlock(&blk_mq_cpu_notify_lock);
4023  }
4024  
4025  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
4026 diff --git a/block/blk-mq.c b/block/blk-mq.c
4027 index 839b1e17481b..0e205b886246 100644
4028 --- a/block/blk-mq.c
4029 +++ b/block/blk-mq.c
4030 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
4031  
4032  static void blk_mq_freeze_queue_wait(struct request_queue *q)
4033  {
4034 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4035 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4036  }
4037  
4038  /*
4039 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
4040         WARN_ON_ONCE(freeze_depth < 0);
4041         if (!freeze_depth) {
4042                 percpu_ref_reinit(&q->q_usage_counter);
4043 -               wake_up_all(&q->mq_freeze_wq);
4044 +               swake_up_all(&q->mq_freeze_wq);
4045         }
4046  }
4047  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
4048 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
4049          * dying, we need to ensure that processes currently waiting on
4050          * the queue are notified as well.
4051          */
4052 -       wake_up_all(&q->mq_freeze_wq);
4053 +       swake_up_all(&q->mq_freeze_wq);
4054  }
4055  
4056  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
4057 @@ -196,6 +196,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
4058         rq->resid_len = 0;
4059         rq->sense = NULL;
4060  
4061 +#ifdef CONFIG_PREEMPT_RT_FULL
4062 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
4063 +#endif
4064         INIT_LIST_HEAD(&rq->timeout_list);
4065         rq->timeout = 0;
4066  
4067 @@ -325,6 +328,17 @@ void blk_mq_end_request(struct request *rq, int error)
4068  }
4069  EXPORT_SYMBOL(blk_mq_end_request);
4070  
4071 +#ifdef CONFIG_PREEMPT_RT_FULL
4072 +
4073 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
4074 +{
4075 +       struct request *rq = container_of(work, struct request, work);
4076 +
4077 +       rq->q->softirq_done_fn(rq);
4078 +}
4079 +
4080 +#else
4081 +
4082  static void __blk_mq_complete_request_remote(void *data)
4083  {
4084         struct request *rq = data;
4085 @@ -332,6 +346,8 @@ static void __blk_mq_complete_request_remote(void *data)
4086         rq->q->softirq_done_fn(rq);
4087  }
4088  
4089 +#endif
4090 +
4091  static void blk_mq_ipi_complete_request(struct request *rq)
4092  {
4093         struct blk_mq_ctx *ctx = rq->mq_ctx;
4094 @@ -343,19 +359,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
4095                 return;
4096         }
4097  
4098 -       cpu = get_cpu();
4099 +       cpu = get_cpu_light();
4100         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
4101                 shared = cpus_share_cache(cpu, ctx->cpu);
4102  
4103         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
4104 +#ifdef CONFIG_PREEMPT_RT_FULL
4105 +               schedule_work_on(ctx->cpu, &rq->work);
4106 +#else
4107                 rq->csd.func = __blk_mq_complete_request_remote;
4108                 rq->csd.info = rq;
4109                 rq->csd.flags = 0;
4110                 smp_call_function_single_async(ctx->cpu, &rq->csd);
4111 +#endif
4112         } else {
4113                 rq->q->softirq_done_fn(rq);
4114         }
4115 -       put_cpu();
4116 +       put_cpu_light();
4117  }
4118  
4119  static void __blk_mq_complete_request(struct request *rq)
4120 @@ -864,14 +884,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
4121                 return;
4122  
4123         if (!async) {
4124 -               int cpu = get_cpu();
4125 +               int cpu = get_cpu_light();
4126                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
4127                         __blk_mq_run_hw_queue(hctx);
4128 -                       put_cpu();
4129 +                       put_cpu_light();
4130                         return;
4131                 }
4132  
4133 -               put_cpu();
4134 +               put_cpu_light();
4135         }
4136  
4137         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
4138 @@ -1619,7 +1639,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
4139  {
4140         struct blk_mq_hw_ctx *hctx = data;
4141  
4142 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
4143 +       if (action == CPU_POST_DEAD)
4144                 return blk_mq_hctx_cpu_offline(hctx, cpu);
4145  
4146         /*
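
The blk-mq.c hunks above make three RT adjustments: the queue-freeze wait moves from wait_event()/wake_up_all() to the simple-waitqueue API (swait_event()/swake_up_all()); remote request completion is routed through a per-request work item instead of an IPI when CONFIG_PREEMPT_RT_FULL is set; and get_cpu()/put_cpu() become get_cpu_light()/put_cpu_light() so the section stays preemptible. The hotplug notifier also reacts to CPU_POST_DEAD rather than CPU_DEAD, presumably so the offline handling runs later in the teardown where it may block. A sketch of the deferred-completion pattern, with hypothetical names:

#include <linux/workqueue.h>

struct example_req {
        struct work_struct work;
        void (*done)(struct example_req *rq);
};

static void example_complete_remote_work(struct work_struct *work)
{
        struct example_req *rq = container_of(work, struct example_req, work);

        rq->done(rq);           /* runs in process context on the chosen CPU */
}

static void example_complete_on(int cpu, struct example_req *rq)
{
        /* instead of smp_call_function_single_async(): defer to a work item */
        INIT_WORK(&rq->work, example_complete_remote_work);
        schedule_work_on(cpu, &rq->work);
}
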
4147 diff --git a/block/blk-mq.h b/block/blk-mq.h
4148 index 713820b47b31..3cb6feb4fe23 100644
4149 --- a/block/blk-mq.h
4150 +++ b/block/blk-mq.h
4151 @@ -74,7 +74,10 @@ struct blk_align_bitmap {
4152  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4153                                            unsigned int cpu)
4154  {
4155 -       return per_cpu_ptr(q->queue_ctx, cpu);
4156 +       struct blk_mq_ctx *ctx;
4157 +
4158 +       ctx = per_cpu_ptr(q->queue_ctx, cpu);
4159 +       return ctx;
4160  }
4161  
4162  /*
4163 @@ -85,12 +88,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4164   */
4165  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
4166  {
4167 -       return __blk_mq_get_ctx(q, get_cpu());
4168 +       return __blk_mq_get_ctx(q, get_cpu_light());
4169  }
4170  
4171  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
4172  {
4173 -       put_cpu();
4174 +       put_cpu_light();
4175  }
4176  
4177  struct blk_mq_alloc_data {
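
The blk-mq.h hunks switch the per-CPU context accessors to get_cpu_light()/put_cpu_light(); the temporary variable added in __blk_mq_get_ctx() does not change behaviour. The light helpers are defined elsewhere in this patch; conceptually they pin the task to its current CPU instead of disabling preemption, so sleeping locks remain usable on RT. A rough sketch of the idea only (illustrative names, not the patch's definition):

#ifdef CONFIG_PREEMPT_RT_FULL
# define example_get_cpu()      ({ migrate_disable(); smp_processor_id(); })
# define example_put_cpu()      migrate_enable()
#else
# define example_get_cpu()      get_cpu()
# define example_put_cpu()      put_cpu()
#endif
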
4178 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
4179 index 53b1737e978d..81c3c0a62edf 100644
4180 --- a/block/blk-softirq.c
4181 +++ b/block/blk-softirq.c
4182 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
4183                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4184  
4185         local_irq_restore(flags);
4186 +       preempt_check_resched_rt();
4187  }
4188  
4189  /*
4190 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
4191                                  this_cpu_ptr(&blk_cpu_done));
4192                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4193                 local_irq_enable();
4194 +               preempt_check_resched_rt();
4195         }
4196  
4197         return NOTIFY_OK;
4198 @@ -150,6 +152,7 @@ do_local:
4199                 goto do_local;
4200  
4201         local_irq_restore(flags);
4202 +       preempt_check_resched_rt();
4203  }
4204  
4205  /**
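
blk-softirq.c gains preempt_check_resched_rt() calls right after interrupts are re-enabled: on RT, raising a softirq from an IRQs-off region may wake the softirq thread, and that wakeup cannot preempt until the region ends, so an explicit check follows the restore. The pattern, reduced to its essentials:

static void example_raise_block_softirq(void)
{
        unsigned long flags;

        local_irq_save(flags);
        raise_softirq_irqoff(BLOCK_SOFTIRQ);    /* may wake the softirq thread on RT */
        local_irq_restore(flags);
        preempt_check_resched_rt();             /* helper added by this patch */
}
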
4206 diff --git a/block/bounce.c b/block/bounce.c
4207 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
4208 --- a/block/bounce.c
4209 +++ b/block/bounce.c
4210 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
4211         unsigned long flags;
4212         unsigned char *vto;
4213  
4214 -       local_irq_save(flags);
4215 +       local_irq_save_nort(flags);
4216         vto = kmap_atomic(to->bv_page);
4217         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
4218         kunmap_atomic(vto);
4219 -       local_irq_restore(flags);
4220 +       local_irq_restore_nort(flags);
4221  }
4222  
4223  #else /* CONFIG_HIGHMEM */
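
block/bounce.c (and the libata-sff.c hunks further down) replace local_irq_save()/restore() around kmap_atomic() copies with the _nort variants: identical on a non-RT kernel, but reduced on RT to roughly a plain flags save, since these short copies no longer need hard interrupts disabled there. A hedged sketch of the idiom (the real macros live elsewhere in this patch):

#ifdef CONFIG_PREEMPT_RT_FULL
# define example_irq_save_nort(flags)          local_save_flags(flags)
# define example_irq_restore_nort(flags)       (void)(flags)
#else
# define example_irq_save_nort(flags)          local_irq_save(flags)
# define example_irq_restore_nort(flags)       local_irq_restore(flags)
#endif
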
4224 diff --git a/crypto/algapi.c b/crypto/algapi.c
4225 index 59bf491fe3d8..f98e79c8cd77 100644
4226 --- a/crypto/algapi.c
4227 +++ b/crypto/algapi.c
4228 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
4229  
4230  int crypto_register_notifier(struct notifier_block *nb)
4231  {
4232 -       return blocking_notifier_chain_register(&crypto_chain, nb);
4233 +       return srcu_notifier_chain_register(&crypto_chain, nb);
4234  }
4235  EXPORT_SYMBOL_GPL(crypto_register_notifier);
4236  
4237  int crypto_unregister_notifier(struct notifier_block *nb)
4238  {
4239 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
4240 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
4241  }
4242  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
4243  
4244 diff --git a/crypto/api.c b/crypto/api.c
4245 index bbc147cb5dec..bc1a848f02ec 100644
4246 --- a/crypto/api.c
4247 +++ b/crypto/api.c
4248 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
4249  DECLARE_RWSEM(crypto_alg_sem);
4250  EXPORT_SYMBOL_GPL(crypto_alg_sem);
4251  
4252 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
4253 +SRCU_NOTIFIER_HEAD(crypto_chain);
4254  EXPORT_SYMBOL_GPL(crypto_chain);
4255  
4256  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
4257 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
4258  {
4259         int ok;
4260  
4261 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4262 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4263         if (ok == NOTIFY_DONE) {
4264                 request_module("cryptomgr");
4265 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4266 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4267         }
4268  
4269         return ok;
4270 diff --git a/crypto/internal.h b/crypto/internal.h
4271 index 00e42a3ed814..2e85551e235f 100644
4272 --- a/crypto/internal.h
4273 +++ b/crypto/internal.h
4274 @@ -47,7 +47,7 @@ struct crypto_larval {
4275  
4276  extern struct list_head crypto_alg_list;
4277  extern struct rw_semaphore crypto_alg_sem;
4278 -extern struct blocking_notifier_head crypto_chain;
4279 +extern struct srcu_notifier_head crypto_chain;
4280  
4281  #ifdef CONFIG_PROC_FS
4282  void __init crypto_init_proc(void);
4283 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
4284  
4285  static inline void crypto_notify(unsigned long val, void *v)
4286  {
4287 -       blocking_notifier_call_chain(&crypto_chain, val, v);
4288 +       srcu_notifier_call_chain(&crypto_chain, val, v);
4289  }
4290  
4291  #endif /* _CRYPTO_INTERNAL_H */
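
The crypto hunks convert the algorithm notifier chain from a blocking notifier, which serialises callers with an rw_semaphore, to an SRCU notifier, whose call path runs inside an SRCU read-side section instead of taking a semaphore. Generic usage of that API, for reference only (not taken from this patch):

#include <linux/notifier.h>

static SRCU_NOTIFIER_HEAD(example_chain);

static int example_event(struct notifier_block *nb, unsigned long val, void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block example_nb = {
        .notifier_call  = example_event,
};

static void example_notify(void)
{
        srcu_notifier_chain_register(&example_chain, &example_nb);
        srcu_notifier_call_chain(&example_chain, 0, NULL);
        srcu_notifier_chain_unregister(&example_chain, &example_nb);
}
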
4292 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
4293 index faa97604d878..941497f31cf0 100644
4294 --- a/drivers/acpi/acpica/acglobal.h
4295 +++ b/drivers/acpi/acpica/acglobal.h
4296 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
4297   * interrupt level
4298   */
4299  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
4300 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
4301 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
4302  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
4303  
4304  /* Mutex for _OSI support */
4305 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
4306 index 3cf77afd142c..dc32e72132f1 100644
4307 --- a/drivers/acpi/acpica/hwregs.c
4308 +++ b/drivers/acpi/acpica/hwregs.c
4309 @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
4310                           ACPI_BITMASK_ALL_FIXED_STATUS,
4311                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
4312  
4313 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4314 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4315  
4316         /* Clear the fixed events in PM1 A/B */
4317  
4318         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
4319                                         ACPI_BITMASK_ALL_FIXED_STATUS);
4320  
4321 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4322 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4323  
4324         if (ACPI_FAILURE(status)) {
4325                 goto exit;
4326 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
4327 index 5f97468df8ff..8c017f15da7d 100644
4328 --- a/drivers/acpi/acpica/hwxface.c
4329 +++ b/drivers/acpi/acpica/hwxface.c
4330 @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4331                 return_ACPI_STATUS(AE_BAD_PARAMETER);
4332         }
4333  
4334 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4335 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4336  
4337         /*
4338          * At this point, we know that the parent register is one of the
4339 @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4340  
4341  unlock_and_exit:
4342  
4343 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4344 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4345         return_ACPI_STATUS(status);
4346  }
4347  
4348 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
4349 index ce406e39b669..41a75eb3ae9d 100644
4350 --- a/drivers/acpi/acpica/utmutex.c
4351 +++ b/drivers/acpi/acpica/utmutex.c
4352 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
4353                 return_ACPI_STATUS (status);
4354         }
4355  
4356 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
4357 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
4358         if (ACPI_FAILURE (status)) {
4359                 return_ACPI_STATUS (status);
4360         }
4361 @@ -156,7 +156,7 @@ void acpi_ut_mutex_terminate(void)
4362         /* Delete the spinlocks */
4363  
4364         acpi_os_delete_lock(acpi_gbl_gpe_lock);
4365 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
4366 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
4367         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
4368  
4369         /* Delete the reader/writer lock */
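
The ACPICA hunks promote acpi_gbl_hardware_lock from an ordinary spinlock to a raw spinlock taken with raw_spin_lock_irqsave(). The protected sections are short PM register accesses that can run with interrupts disabled, so on RT they must not acquire a sleeping lock. The generic shape of such a section (illustrative only):

static DEFINE_RAW_SPINLOCK(example_hw_lock);

static void example_hw_access(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&example_hw_lock, flags);
        /* a handful of register reads/writes, nothing that can sleep */
        raw_spin_unlock_irqrestore(&example_hw_lock, flags);
}
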
4370 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
4371 index 7dbba387d12a..65beb7abb4e7 100644
4372 --- a/drivers/ata/libata-sff.c
4373 +++ b/drivers/ata/libata-sff.c
4374 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
4375         unsigned long flags;
4376         unsigned int consumed;
4377  
4378 -       local_irq_save(flags);
4379 +       local_irq_save_nort(flags);
4380         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
4381 -       local_irq_restore(flags);
4382 +       local_irq_restore_nort(flags);
4383  
4384         return consumed;
4385  }
4386 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4387                 unsigned long flags;
4388  
4389                 /* FIXME: use a bounce buffer */
4390 -               local_irq_save(flags);
4391 +               local_irq_save_nort(flags);
4392                 buf = kmap_atomic(page);
4393  
4394                 /* do the actual data transfer */
4395 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4396                                        do_write);
4397  
4398                 kunmap_atomic(buf);
4399 -               local_irq_restore(flags);
4400 +               local_irq_restore_nort(flags);
4401         } else {
4402                 buf = page_address(page);
4403                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
4404 @@ -864,7 +864,7 @@ next_sg:
4405                 unsigned long flags;
4406  
4407                 /* FIXME: use bounce buffer */
4408 -               local_irq_save(flags);
4409 +               local_irq_save_nort(flags);
4410                 buf = kmap_atomic(page);
4411  
4412                 /* do the actual data transfer */
4413 @@ -872,7 +872,7 @@ next_sg:
4414                                                                 count, rw);
4415  
4416                 kunmap_atomic(buf);
4417 -               local_irq_restore(flags);
4418 +               local_irq_restore_nort(flags);
4419         } else {
4420                 buf = page_address(page);
4421                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
4422 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
4423 index 370c2f76016d..65e0b375a291 100644
4424 --- a/drivers/block/zram/zram_drv.c
4425 +++ b/drivers/block/zram/zram_drv.c
4426 @@ -520,6 +520,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
4427                 goto out_error;
4428         }
4429  
4430 +       zram_meta_init_table_locks(meta, disksize);
4431 +
4432         return meta;
4433  
4434  out_error:
4435 @@ -568,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4436         unsigned long handle;
4437         size_t size;
4438  
4439 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4440 +       zram_lock_table(&meta->table[index]);
4441         handle = meta->table[index].handle;
4442         size = zram_get_obj_size(meta, index);
4443  
4444         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
4445 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4446 +               zram_unlock_table(&meta->table[index]);
4447                 clear_page(mem);
4448                 return 0;
4449         }
4450 @@ -584,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4451         else
4452                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
4453         zs_unmap_object(meta->mem_pool, handle);
4454 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4455 +       zram_unlock_table(&meta->table[index]);
4456  
4457         /* Should NEVER happen. Return bio error if it does. */
4458         if (unlikely(ret)) {
4459 @@ -604,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
4460         struct zram_meta *meta = zram->meta;
4461         page = bvec->bv_page;
4462  
4463 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4464 +       zram_lock_table(&meta->table[index]);
4465         if (unlikely(!meta->table[index].handle) ||
4466                         zram_test_flag(meta, index, ZRAM_ZERO)) {
4467 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4468 +               zram_unlock_table(&meta->table[index]);
4469                 handle_zero_page(bvec);
4470                 return 0;
4471         }
4472 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4473 +       zram_unlock_table(&meta->table[index]);
4474  
4475         if (is_partial_io(bvec))
4476                 /* Use  a temporary buffer to decompress the page */
4477 @@ -689,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4478                 if (user_mem)
4479                         kunmap_atomic(user_mem);
4480                 /* Free memory associated with this sector now. */
4481 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4482 +               zram_lock_table(&meta->table[index]);
4483                 zram_free_page(zram, index);
4484                 zram_set_flag(meta, index, ZRAM_ZERO);
4485 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4486 +               zram_unlock_table(&meta->table[index]);
4487  
4488                 atomic64_inc(&zram->stats.zero_pages);
4489                 ret = 0;
4490 @@ -752,12 +754,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4491          * Free memory associated with this sector
4492          * before overwriting unused sectors.
4493          */
4494 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4495 +       zram_lock_table(&meta->table[index]);
4496         zram_free_page(zram, index);
4497  
4498         meta->table[index].handle = handle;
4499         zram_set_obj_size(meta, index, clen);
4500 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4501 +       zram_unlock_table(&meta->table[index]);
4502  
4503         /* Update stats */
4504         atomic64_add(clen, &zram->stats.compr_data_size);
4505 @@ -800,9 +802,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
4506         }
4507  
4508         while (n >= PAGE_SIZE) {
4509 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4510 +               zram_lock_table(&meta->table[index]);
4511                 zram_free_page(zram, index);
4512 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4513 +               zram_unlock_table(&meta->table[index]);
4514                 atomic64_inc(&zram->stats.notify_free);
4515                 index++;
4516                 n -= PAGE_SIZE;
4517 @@ -928,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
4518         zram = bdev->bd_disk->private_data;
4519         meta = zram->meta;
4520  
4521 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4522 +       zram_lock_table(&meta->table[index]);
4523         zram_free_page(zram, index);
4524 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4525 +       zram_unlock_table(&meta->table[index]);
4526         atomic64_inc(&zram->stats.notify_free);
4527  }
4528  
4529 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
4530 index 8e92339686d7..9e3e953d680e 100644
4531 --- a/drivers/block/zram/zram_drv.h
4532 +++ b/drivers/block/zram/zram_drv.h
4533 @@ -72,6 +72,9 @@ enum zram_pageflags {
4534  struct zram_table_entry {
4535         unsigned long handle;
4536         unsigned long value;
4537 +#ifdef CONFIG_PREEMPT_RT_BASE
4538 +       spinlock_t lock;
4539 +#endif
4540  };
4541  
4542  struct zram_stats {
4543 @@ -119,4 +122,42 @@ struct zram {
4544          */
4545         bool claim; /* Protected by bdev->bd_mutex */
4546  };
4547 +
4548 +#ifndef CONFIG_PREEMPT_RT_BASE
4549 +static inline void zram_lock_table(struct zram_table_entry *table)
4550 +{
4551 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
4552 +}
4553 +
4554 +static inline void zram_unlock_table(struct zram_table_entry *table)
4555 +{
4556 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
4557 +}
4558 +
4559 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
4560 +#else /* CONFIG_PREEMPT_RT_BASE */
4561 +static inline void zram_lock_table(struct zram_table_entry *table)
4562 +{
4563 +       spin_lock(&table->lock);
4564 +       __set_bit(ZRAM_ACCESS, &table->value);
4565 +}
4566 +
4567 +static inline void zram_unlock_table(struct zram_table_entry *table)
4568 +{
4569 +       __clear_bit(ZRAM_ACCESS, &table->value);
4570 +       spin_unlock(&table->lock);
4571 +}
4572 +
4573 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
4574 +{
4575 +        size_t num_pages = disksize >> PAGE_SHIFT;
4576 +        size_t index;
4577 +
4578 +        for (index = 0; index < num_pages; index++) {
4579 +               spinlock_t *lock = &meta->table[index].lock;
4580 +               spin_lock_init(lock);
4581 +        }
4582 +}
4583 +#endif /* CONFIG_PREEMPT_RT_BASE */
4584 +
4585  #endif
4586 diff --git a/drivers/char/random.c b/drivers/char/random.c
4587 index 491a4dce13fe..cf69b6b42208 100644
4588 --- a/drivers/char/random.c
4589 +++ b/drivers/char/random.c
4590 @@ -799,8 +799,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4591         } sample;
4592         long delta, delta2, delta3;
4593  
4594 -       preempt_disable();
4595 -
4596         sample.jiffies = jiffies;
4597         sample.cycles = random_get_entropy();
4598         sample.num = num;
4599 @@ -841,7 +839,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4600                  */
4601                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
4602         }
4603 -       preempt_enable();
4604  }
4605  
4606  void add_input_randomness(unsigned int type, unsigned int code,
4607 @@ -894,28 +891,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
4608         return *(ptr + f->reg_idx++);
4609  }
4610  
4611 -void add_interrupt_randomness(int irq, int irq_flags)
4612 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
4613  {
4614         struct entropy_store    *r;
4615         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
4616 -       struct pt_regs          *regs = get_irq_regs();
4617         unsigned long           now = jiffies;
4618         cycles_t                cycles = random_get_entropy();
4619         __u32                   c_high, j_high;
4620 -       __u64                   ip;
4621         unsigned long           seed;
4622         int                     credit = 0;
4623  
4624         if (cycles == 0)
4625 -               cycles = get_reg(fast_pool, regs);
4626 +               cycles = get_reg(fast_pool, NULL);
4627         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
4628         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
4629         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
4630         fast_pool->pool[1] ^= now ^ c_high;
4631 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
4632 +       if (!ip)
4633 +               ip = _RET_IP_;
4634         fast_pool->pool[2] ^= ip;
4635         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
4636 -               get_reg(fast_pool, regs);
4637 +               get_reg(fast_pool, NULL);
4638  
4639         fast_mix(fast_pool);
4640         add_interrupt_bench(cycles);
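
In drivers/char/random.c the explicit preempt_disable()/preempt_enable() pair in add_timer_randomness() is dropped, and add_interrupt_randomness() now receives the interrupted instruction pointer as a parameter instead of reading get_irq_regs() itself, falling back to _RET_IP_ when none was recorded. Presumably this is because on RT this path can run from the threaded interrupt handler, where get_irq_regs() is no longer meaningful. A hypothetical caller-side sketch (not from this patch):

static u64 example_capture_irq_ip(void)
{
        struct pt_regs *regs = get_irq_regs();

        /* capture the IP while the registers are still valid, hand it on later */
        return regs ? instruction_pointer(regs) : _RET_IP_;
}
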
4641 diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c
4642 index abc80949e1dd..4ad3298eb372 100644
4643 --- a/drivers/clk/at91/clk-generated.c
4644 +++ b/drivers/clk/at91/clk-generated.c
4645 @@ -15,8 +15,8 @@
4646  #include <linux/clkdev.h>
4647  #include <linux/clk/at91_pmc.h>
4648  #include <linux/of.h>
4649 -#include <linux/of_address.h>
4650 -#include <linux/io.h>
4651 +#include <linux/mfd/syscon.h>
4652 +#include <linux/regmap.h>
4653  
4654  #include "pmc.h"
4655  
4656 @@ -28,8 +28,9 @@
4657  
4658  struct clk_generated {
4659         struct clk_hw hw;
4660 -       struct at91_pmc *pmc;
4661 +       struct regmap *regmap;
4662         struct clk_range range;
4663 +       spinlock_t *lock;
4664         u32 id;
4665         u32 gckdiv;
4666         u8 parent_id;
4667 @@ -41,49 +42,52 @@ struct clk_generated {
4668  static int clk_generated_enable(struct clk_hw *hw)
4669  {
4670         struct clk_generated *gck = to_clk_generated(hw);
4671 -       struct at91_pmc *pmc = gck->pmc;
4672 -       u32 tmp;
4673 +       unsigned long flags;
4674  
4675         pr_debug("GCLK: %s, gckdiv = %d, parent id = %d\n",
4676                  __func__, gck->gckdiv, gck->parent_id);
4677  
4678 -       pmc_lock(pmc);
4679 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4680 -       tmp = pmc_read(pmc, AT91_PMC_PCR) &
4681 -                       ~(AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK);
4682 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_GCKCSS(gck->parent_id)
4683 -                                        | AT91_PMC_PCR_CMD
4684 -                                        | AT91_PMC_PCR_GCKDIV(gck->gckdiv)
4685 -                                        | AT91_PMC_PCR_GCKEN);
4686 -       pmc_unlock(pmc);
4687 +       spin_lock_irqsave(gck->lock, flags);
4688 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4689 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4690 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4691 +                          AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK |
4692 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4693 +                          AT91_PMC_PCR_GCKCSS(gck->parent_id) |
4694 +                          AT91_PMC_PCR_CMD |
4695 +                          AT91_PMC_PCR_GCKDIV(gck->gckdiv) |
4696 +                          AT91_PMC_PCR_GCKEN);
4697 +       spin_unlock_irqrestore(gck->lock, flags);
4698         return 0;
4699  }
4700  
4701  static void clk_generated_disable(struct clk_hw *hw)
4702  {
4703         struct clk_generated *gck = to_clk_generated(hw);
4704 -       struct at91_pmc *pmc = gck->pmc;
4705 -       u32 tmp;
4706 -
4707 -       pmc_lock(pmc);
4708 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4709 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_GCKEN;
4710 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
4711 -       pmc_unlock(pmc);
4712 +       unsigned long flags;
4713 +
4714 +       spin_lock_irqsave(gck->lock, flags);
4715 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4716 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4717 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4718 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4719 +                          AT91_PMC_PCR_CMD);
4720 +       spin_unlock_irqrestore(gck->lock, flags);
4721  }
4722  
4723  static int clk_generated_is_enabled(struct clk_hw *hw)
4724  {
4725         struct clk_generated *gck = to_clk_generated(hw);
4726 -       struct at91_pmc *pmc = gck->pmc;
4727 -       int ret;
4728 +       unsigned long flags;
4729 +       unsigned int status;
4730  
4731 -       pmc_lock(pmc);
4732 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4733 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_GCKEN);
4734 -       pmc_unlock(pmc);
4735 +       spin_lock_irqsave(gck->lock, flags);
4736 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4737 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4738 +       regmap_read(gck->regmap, AT91_PMC_PCR, &status);
4739 +       spin_unlock_irqrestore(gck->lock, flags);
4740  
4741 -       return ret;
4742 +       return status & AT91_PMC_PCR_GCKEN ? 1 : 0;
4743  }
4744  
4745  static unsigned long
4746 @@ -214,13 +218,14 @@ static const struct clk_ops generated_ops = {
4747   */
4748  static void clk_generated_startup(struct clk_generated *gck)
4749  {
4750 -       struct at91_pmc *pmc = gck->pmc;
4751         u32 tmp;
4752 +       unsigned long flags;
4753  
4754 -       pmc_lock(pmc);
4755 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4756 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
4757 -       pmc_unlock(pmc);
4758 +       spin_lock_irqsave(gck->lock, flags);
4759 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4760 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4761 +       regmap_read(gck->regmap, AT91_PMC_PCR, &tmp);
4762 +       spin_unlock_irqrestore(gck->lock, flags);
4763  
4764         gck->parent_id = (tmp & AT91_PMC_PCR_GCKCSS_MASK)
4765                                         >> AT91_PMC_PCR_GCKCSS_OFFSET;
4766 @@ -229,8 +234,8 @@ static void clk_generated_startup(struct clk_generated *gck)
4767  }
4768  
4769  static struct clk * __init
4770 -at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4771 -                           const char **parent_names, u8 num_parents,
4772 +at91_clk_register_generated(struct regmap *regmap,  spinlock_t *lock, const char
4773 +                           *name, const char **parent_names, u8 num_parents,
4774                             u8 id, const struct clk_range *range)
4775  {
4776         struct clk_generated *gck;
4777 @@ -249,7 +254,8 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4778  
4779         gck->id = id;
4780         gck->hw.init = &init;
4781 -       gck->pmc = pmc;
4782 +       gck->regmap = regmap;
4783 +       gck->lock = lock;
4784         gck->range = *range;
4785  
4786         clk = clk_register(NULL, &gck->hw);
4787 @@ -261,8 +267,7 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4788         return clk;
4789  }
4790  
4791 -void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4792 -                                          struct at91_pmc *pmc)
4793 +void __init of_sama5d2_clk_generated_setup(struct device_node *np)
4794  {
4795         int num;
4796         u32 id;
4797 @@ -272,6 +277,7 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4798         const char *parent_names[GENERATED_SOURCE_MAX];
4799         struct device_node *gcknp;
4800         struct clk_range range = CLK_RANGE(0, 0);
4801 +       struct regmap *regmap;
4802  
4803         num_parents = of_clk_get_parent_count(np);
4804         if (num_parents <= 0 || num_parents > GENERATED_SOURCE_MAX)
4805 @@ -283,6 +289,10 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4806         if (!num || num > PERIPHERAL_MAX)
4807                 return;
4808  
4809 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4810 +       if (IS_ERR(regmap))
4811 +               return;
4812 +
4813         for_each_child_of_node(np, gcknp) {
4814                 if (of_property_read_u32(gcknp, "reg", &id))
4815                         continue;
4816 @@ -296,11 +306,14 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4817                 of_at91_get_clk_range(gcknp, "atmel,clk-output-range",
4818                                       &range);
4819  
4820 -               clk = at91_clk_register_generated(pmc, name, parent_names,
4821 -                                                 num_parents, id, &range);
4822 +               clk = at91_clk_register_generated(regmap, &pmc_pcr_lock, name,
4823 +                                                 parent_names, num_parents,
4824 +                                                 id, &range);
4825                 if (IS_ERR(clk))
4826                         continue;
4827  
4828                 of_clk_add_provider(gcknp, of_clk_src_simple_get, clk);
4829         }
4830  }
4831 +CLK_OF_DECLARE(of_sama5d2_clk_generated_setup, "atmel,sama5d2-clk-generated",
4832 +              of_sama5d2_clk_generated_setup);
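
The at91 clk-generated driver stops using the private at91_pmc accessor (pmc_read()/pmc_write() under pmc_lock) and instead takes a syscon regmap plus an explicit spinlock passed at registration; the open-coded read/mask/write sequences collapse into regmap_update_bits(), and the driver now registers itself with CLK_OF_DECLARE() rather than being wired up from a central PMC setup function. A minimal regmap read-modify-write example using the same register macros (illustrative only, not a complete PCR programming sequence):

static int example_set_gck_div(struct regmap *regmap, unsigned int gckdiv)
{
        return regmap_update_bits(regmap, AT91_PMC_PCR,
                                  AT91_PMC_PCR_GCKDIV_MASK,
                                  AT91_PMC_PCR_GCKDIV(gckdiv));
}
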
4833 diff --git a/drivers/clk/at91/clk-h32mx.c b/drivers/clk/at91/clk-h32mx.c
4834 index a165230e7eda..8e20c8a76db7 100644
4835 --- a/drivers/clk/at91/clk-h32mx.c
4836 +++ b/drivers/clk/at91/clk-h32mx.c
4837 @@ -15,15 +15,9 @@
4838  #include <linux/clk-provider.h>
4839  #include <linux/clkdev.h>
4840  #include <linux/clk/at91_pmc.h>
4841 -#include <linux/delay.h>
4842  #include <linux/of.h>
4843 -#include <linux/of_address.h>
4844 -#include <linux/of_irq.h>
4845 -#include <linux/io.h>
4846 -#include <linux/interrupt.h>
4847 -#include <linux/irq.h>
4848 -#include <linux/sched.h>
4849 -#include <linux/wait.h>
4850 +#include <linux/regmap.h>
4851 +#include <linux/mfd/syscon.h>
4852  
4853  #include "pmc.h"
4854  
4855 @@ -31,7 +25,7 @@
4856  
4857  struct clk_sama5d4_h32mx {
4858         struct clk_hw hw;
4859 -       struct at91_pmc *pmc;
4860 +       struct regmap *regmap;
4861  };
4862  
4863  #define to_clk_sama5d4_h32mx(hw) container_of(hw, struct clk_sama5d4_h32mx, hw)
4864 @@ -40,8 +34,10 @@ static unsigned long clk_sama5d4_h32mx_recalc_rate(struct clk_hw *hw,
4865                                                  unsigned long parent_rate)
4866  {
4867         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4868 +       unsigned int mckr;
4869  
4870 -       if (pmc_read(h32mxclk->pmc, AT91_PMC_MCKR) & AT91_PMC_H32MXDIV)
4871 +       regmap_read(h32mxclk->regmap, AT91_PMC_MCKR, &mckr);
4872 +       if (mckr & AT91_PMC_H32MXDIV)
4873                 return parent_rate / 2;
4874  
4875         if (parent_rate > H32MX_MAX_FREQ)
4876 @@ -70,18 +66,16 @@ static int clk_sama5d4_h32mx_set_rate(struct clk_hw *hw, unsigned long rate,
4877                                     unsigned long parent_rate)
4878  {
4879         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4880 -       struct at91_pmc *pmc = h32mxclk->pmc;
4881 -       u32 tmp;
4882 +       u32 mckr = 0;
4883  
4884         if (parent_rate != rate && (parent_rate / 2) != rate)
4885                 return -EINVAL;
4886  
4887 -       pmc_lock(pmc);
4888 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_H32MXDIV;
4889         if ((parent_rate / 2) == rate)
4890 -               tmp |= AT91_PMC_H32MXDIV;
4891 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
4892 -       pmc_unlock(pmc);
4893 +               mckr = AT91_PMC_H32MXDIV;
4894 +
4895 +       regmap_update_bits(h32mxclk->regmap, AT91_PMC_MCKR,
4896 +                          AT91_PMC_H32MXDIV, mckr);
4897  
4898         return 0;
4899  }
4900 @@ -92,14 +86,18 @@ static const struct clk_ops h32mx_ops = {
4901         .set_rate = clk_sama5d4_h32mx_set_rate,
4902  };
4903  
4904 -void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4905 -                                    struct at91_pmc *pmc)
4906 +static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np)
4907  {
4908         struct clk_sama5d4_h32mx *h32mxclk;
4909         struct clk_init_data init;
4910         const char *parent_name;
4911 +       struct regmap *regmap;
4912         struct clk *clk;
4913  
4914 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4915 +       if (IS_ERR(regmap))
4916 +               return;
4917 +
4918         h32mxclk = kzalloc(sizeof(*h32mxclk), GFP_KERNEL);
4919         if (!h32mxclk)
4920                 return;
4921 @@ -113,7 +111,7 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4922         init.flags = CLK_SET_RATE_GATE;
4923  
4924         h32mxclk->hw.init = &init;
4925 -       h32mxclk->pmc = pmc;
4926 +       h32mxclk->regmap = regmap;
4927  
4928         clk = clk_register(NULL, &h32mxclk->hw);
4929         if (IS_ERR(clk)) {
4930 @@ -123,3 +121,5 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4931  
4932         of_clk_add_provider(np, of_clk_src_simple_get, clk);
4933  }
4934 +CLK_OF_DECLARE(of_sama5d4_clk_h32mx_setup, "atmel,sama5d4-clk-h32mx",
4935 +              of_sama5d4_clk_h32mx_setup);
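
clk-h32mx.c gets the same regmap/syscon conversion, and the clk-main.c hunks that follow additionally drop the PMC status interrupt plus waitqueue handshake: instead of request_irq() and wait_event() on AT91_PMC_SR bits, the prepare paths simply poll the status register with cpu_relax() until the oscillator is ready. The polling loop, reduced to a sketch:

static void example_wait_pmc_status(struct regmap *regmap, unsigned int mask)
{
        unsigned int status;

        do {
                cpu_relax();
                regmap_read(regmap, AT91_PMC_SR, &status);
        } while (!(status & mask));
}
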
4936 diff --git a/drivers/clk/at91/clk-main.c b/drivers/clk/at91/clk-main.c
4937 index fd7247deabdc..4bfc94d6c26e 100644
4938 --- a/drivers/clk/at91/clk-main.c
4939 +++ b/drivers/clk/at91/clk-main.c
4940 @@ -13,13 +13,8 @@
4941  #include <linux/clk/at91_pmc.h>
4942  #include <linux/delay.h>
4943  #include <linux/of.h>
4944 -#include <linux/of_address.h>
4945 -#include <linux/of_irq.h>
4946 -#include <linux/io.h>
4947 -#include <linux/interrupt.h>
4948 -#include <linux/irq.h>
4949 -#include <linux/sched.h>
4950 -#include <linux/wait.h>
4951 +#include <linux/mfd/syscon.h>
4952 +#include <linux/regmap.h>
4953  
4954  #include "pmc.h"
4955  
4956 @@ -34,18 +29,14 @@
4957  
4958  struct clk_main_osc {
4959         struct clk_hw hw;
4960 -       struct at91_pmc *pmc;
4961 -       unsigned int irq;
4962 -       wait_queue_head_t wait;
4963 +       struct regmap *regmap;
4964  };
4965  
4966  #define to_clk_main_osc(hw) container_of(hw, struct clk_main_osc, hw)
4967  
4968  struct clk_main_rc_osc {
4969         struct clk_hw hw;
4970 -       struct at91_pmc *pmc;
4971 -       unsigned int irq;
4972 -       wait_queue_head_t wait;
4973 +       struct regmap *regmap;
4974         unsigned long frequency;
4975         unsigned long accuracy;
4976  };
4977 @@ -54,51 +45,47 @@ struct clk_main_rc_osc {
4978  
4979  struct clk_rm9200_main {
4980         struct clk_hw hw;
4981 -       struct at91_pmc *pmc;
4982 +       struct regmap *regmap;
4983  };
4984  
4985  #define to_clk_rm9200_main(hw) container_of(hw, struct clk_rm9200_main, hw)
4986  
4987  struct clk_sam9x5_main {
4988         struct clk_hw hw;
4989 -       struct at91_pmc *pmc;
4990 -       unsigned int irq;
4991 -       wait_queue_head_t wait;
4992 +       struct regmap *regmap;
4993         u8 parent;
4994  };
4995  
4996  #define to_clk_sam9x5_main(hw) container_of(hw, struct clk_sam9x5_main, hw)
4997  
4998 -static irqreturn_t clk_main_osc_irq_handler(int irq, void *dev_id)
4999 +static inline bool clk_main_osc_ready(struct regmap *regmap)
5000  {
5001 -       struct clk_main_osc *osc = dev_id;
5002 +       unsigned int status;
5003  
5004 -       wake_up(&osc->wait);
5005 -       disable_irq_nosync(osc->irq);
5006 +       regmap_read(regmap, AT91_PMC_SR, &status);
5007  
5008 -       return IRQ_HANDLED;
5009 +       return status & AT91_PMC_MOSCS;
5010  }
5011  
5012  static int clk_main_osc_prepare(struct clk_hw *hw)
5013  {
5014         struct clk_main_osc *osc = to_clk_main_osc(hw);
5015 -       struct at91_pmc *pmc = osc->pmc;
5016 +       struct regmap *regmap = osc->regmap;
5017         u32 tmp;
5018  
5019 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5020 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5021 +       tmp &= ~MOR_KEY_MASK;
5022 +
5023         if (tmp & AT91_PMC_OSCBYPASS)
5024                 return 0;
5025  
5026         if (!(tmp & AT91_PMC_MOSCEN)) {
5027                 tmp |= AT91_PMC_MOSCEN | AT91_PMC_KEY;
5028 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5029 +               regmap_write(regmap, AT91_CKGR_MOR, tmp);
5030         }
5031  
5032 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS)) {
5033 -               enable_irq(osc->irq);
5034 -               wait_event(osc->wait,
5035 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS);
5036 -       }
5037 +       while (!clk_main_osc_ready(regmap))
5038 +               cpu_relax();
5039  
5040         return 0;
5041  }
5042 @@ -106,9 +93,10 @@ static int clk_main_osc_prepare(struct clk_hw *hw)
5043  static void clk_main_osc_unprepare(struct clk_hw *hw)
5044  {
5045         struct clk_main_osc *osc = to_clk_main_osc(hw);
5046 -       struct at91_pmc *pmc = osc->pmc;
5047 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5048 +       struct regmap *regmap = osc->regmap;
5049 +       u32 tmp;
5050  
5051 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5052         if (tmp & AT91_PMC_OSCBYPASS)
5053                 return;
5054  
5055 @@ -116,20 +104,22 @@ static void clk_main_osc_unprepare(struct clk_hw *hw)
5056                 return;
5057  
5058         tmp &= ~(AT91_PMC_KEY | AT91_PMC_MOSCEN);
5059 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5060 +       regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5061  }
5062  
5063  static int clk_main_osc_is_prepared(struct clk_hw *hw)
5064  {
5065         struct clk_main_osc *osc = to_clk_main_osc(hw);
5066 -       struct at91_pmc *pmc = osc->pmc;
5067 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5068 +       struct regmap *regmap = osc->regmap;
5069 +       u32 tmp, status;
5070  
5071 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5072         if (tmp & AT91_PMC_OSCBYPASS)
5073                 return 1;
5074  
5075 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS) &&
5076 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN));
5077 +       regmap_read(regmap, AT91_PMC_SR, &status);
5078 +
5079 +       return (status & AT91_PMC_MOSCS) && (tmp & AT91_PMC_MOSCEN);
5080  }
5081  
5082  static const struct clk_ops main_osc_ops = {
5083 @@ -139,18 +129,16 @@ static const struct clk_ops main_osc_ops = {
5084  };
5085  
5086  static struct clk * __init
5087 -at91_clk_register_main_osc(struct at91_pmc *pmc,
5088 -                          unsigned int irq,
5089 +at91_clk_register_main_osc(struct regmap *regmap,
5090                            const char *name,
5091                            const char *parent_name,
5092                            bool bypass)
5093  {
5094 -       int ret;
5095         struct clk_main_osc *osc;
5096         struct clk *clk = NULL;
5097         struct clk_init_data init;
5098  
5099 -       if (!pmc || !irq || !name || !parent_name)
5100 +       if (!name || !parent_name)
5101                 return ERR_PTR(-EINVAL);
5102  
5103         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5104 @@ -164,85 +152,70 @@ at91_clk_register_main_osc(struct at91_pmc *pmc,
5105         init.flags = CLK_IGNORE_UNUSED;
5106  
5107         osc->hw.init = &init;
5108 -       osc->pmc = pmc;
5109 -       osc->irq = irq;
5110 -
5111 -       init_waitqueue_head(&osc->wait);
5112 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5113 -       ret = request_irq(osc->irq, clk_main_osc_irq_handler,
5114 -                         IRQF_TRIGGER_HIGH, name, osc);
5115 -       if (ret) {
5116 -               kfree(osc);
5117 -               return ERR_PTR(ret);
5118 -       }
5119 +       osc->regmap = regmap;
5120  
5121         if (bypass)
5122 -               pmc_write(pmc, AT91_CKGR_MOR,
5123 -                         (pmc_read(pmc, AT91_CKGR_MOR) &
5124 -                          ~(MOR_KEY_MASK | AT91_PMC_MOSCEN)) |
5125 -                         AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5126 +               regmap_update_bits(regmap,
5127 +                                  AT91_CKGR_MOR, MOR_KEY_MASK |
5128 +                                  AT91_PMC_MOSCEN,
5129 +                                  AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5130  
5131         clk = clk_register(NULL, &osc->hw);
5132 -       if (IS_ERR(clk)) {
5133 -               free_irq(irq, osc);
5134 +       if (IS_ERR(clk))
5135                 kfree(osc);
5136 -       }
5137  
5138         return clk;
5139  }
5140  
5141 -void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np,
5142 -                                            struct at91_pmc *pmc)
5143 +static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np)
5144  {
5145         struct clk *clk;
5146 -       unsigned int irq;
5147         const char *name = np->name;
5148         const char *parent_name;
5149 +       struct regmap *regmap;
5150         bool bypass;
5151  
5152         of_property_read_string(np, "clock-output-names", &name);
5153         bypass = of_property_read_bool(np, "atmel,osc-bypass");
5154         parent_name = of_clk_get_parent_name(np, 0);
5155  
5156 -       irq = irq_of_parse_and_map(np, 0);
5157 -       if (!irq)
5158 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5159 +       if (IS_ERR(regmap))
5160                 return;
5161  
5162 -       clk = at91_clk_register_main_osc(pmc, irq, name, parent_name, bypass);
5163 +       clk = at91_clk_register_main_osc(regmap, name, parent_name, bypass);
5164         if (IS_ERR(clk))
5165                 return;
5166  
5167         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5168  }
5169 +CLK_OF_DECLARE(at91rm9200_clk_main_osc, "atmel,at91rm9200-clk-main-osc",
5170 +              of_at91rm9200_clk_main_osc_setup);
5171  
5172 -static irqreturn_t clk_main_rc_osc_irq_handler(int irq, void *dev_id)
5173 +static bool clk_main_rc_osc_ready(struct regmap *regmap)
5174  {
5175 -       struct clk_main_rc_osc *osc = dev_id;
5176 +       unsigned int status;
5177  
5178 -       wake_up(&osc->wait);
5179 -       disable_irq_nosync(osc->irq);
5180 +       regmap_read(regmap, AT91_PMC_SR, &status);
5181  
5182 -       return IRQ_HANDLED;
5183 +       return status & AT91_PMC_MOSCRCS;
5184  }
5185  
5186  static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5187  {
5188         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5189 -       struct at91_pmc *pmc = osc->pmc;
5190 -       u32 tmp;
5191 +       struct regmap *regmap = osc->regmap;
5192 +       unsigned int mor;
5193  
5194 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5195 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5196  
5197 -       if (!(tmp & AT91_PMC_MOSCRCEN)) {
5198 -               tmp |= AT91_PMC_MOSCRCEN | AT91_PMC_KEY;
5199 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5200 -       }
5201 +       if (!(mor & AT91_PMC_MOSCRCEN))
5202 +               regmap_update_bits(regmap, AT91_CKGR_MOR,
5203 +                                  MOR_KEY_MASK | AT91_PMC_MOSCRCEN,
5204 +                                  AT91_PMC_MOSCRCEN | AT91_PMC_KEY);
5205  
5206 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS)) {
5207 -               enable_irq(osc->irq);
5208 -               wait_event(osc->wait,
5209 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS);
5210 -       }
5211 +       while (!clk_main_rc_osc_ready(regmap))
5212 +               cpu_relax();
5213  
5214         return 0;
5215  }
5216 @@ -250,23 +223,28 @@ static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5217  static void clk_main_rc_osc_unprepare(struct clk_hw *hw)
5218  {
5219         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5220 -       struct at91_pmc *pmc = osc->pmc;
5221 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5222 +       struct regmap *regmap = osc->regmap;
5223 +       unsigned int mor;
5224 +
5225 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5226  
5227 -       if (!(tmp & AT91_PMC_MOSCRCEN))
5228 +       if (!(mor & AT91_PMC_MOSCRCEN))
5229                 return;
5230  
5231 -       tmp &= ~(MOR_KEY_MASK | AT91_PMC_MOSCRCEN);
5232 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5233 +       regmap_update_bits(regmap, AT91_CKGR_MOR,
5234 +                          MOR_KEY_MASK | AT91_PMC_MOSCRCEN, AT91_PMC_KEY);
5235  }
5236  
5237  static int clk_main_rc_osc_is_prepared(struct clk_hw *hw)
5238  {
5239         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5240 -       struct at91_pmc *pmc = osc->pmc;
5241 +       struct regmap *regmap = osc->regmap;
5242 +       unsigned int mor, status;
5243  
5244 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS) &&
5245 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCRCEN));
5246 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5247 +       regmap_read(regmap, AT91_PMC_SR, &status);
5248 +
5249 +       return (mor & AT91_PMC_MOSCRCEN) && (status & AT91_PMC_MOSCRCS);
5250  }
5251  
5252  static unsigned long clk_main_rc_osc_recalc_rate(struct clk_hw *hw,
5253 @@ -294,17 +272,15 @@ static const struct clk_ops main_rc_osc_ops = {
5254  };
5255  
5256  static struct clk * __init
5257 -at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5258 -                             unsigned int irq,
5259 +at91_clk_register_main_rc_osc(struct regmap *regmap,
5260                               const char *name,
5261                               u32 frequency, u32 accuracy)
5262  {
5263 -       int ret;
5264         struct clk_main_rc_osc *osc;
5265         struct clk *clk = NULL;
5266         struct clk_init_data init;
5267  
5268 -       if (!pmc || !irq || !name || !frequency)
5269 +       if (!name || !frequency)
5270                 return ERR_PTR(-EINVAL);
5271  
5272         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5273 @@ -318,63 +294,53 @@ at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5274         init.flags = CLK_IS_ROOT | CLK_IGNORE_UNUSED;
5275  
5276         osc->hw.init = &init;
5277 -       osc->pmc = pmc;
5278 -       osc->irq = irq;
5279 +       osc->regmap = regmap;
5280         osc->frequency = frequency;
5281         osc->accuracy = accuracy;
5282  
5283 -       init_waitqueue_head(&osc->wait);
5284 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5285 -       ret = request_irq(osc->irq, clk_main_rc_osc_irq_handler,
5286 -                         IRQF_TRIGGER_HIGH, name, osc);
5287 -       if (ret)
5288 -               return ERR_PTR(ret);
5289 -
5290         clk = clk_register(NULL, &osc->hw);
5291 -       if (IS_ERR(clk)) {
5292 -               free_irq(irq, osc);
5293 +       if (IS_ERR(clk))
5294                 kfree(osc);
5295 -       }
5296  
5297         return clk;
5298  }
5299  
5300 -void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
5301 -                                               struct at91_pmc *pmc)
5302 +static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np)
5303  {
5304         struct clk *clk;
5305 -       unsigned int irq;
5306         u32 frequency = 0;
5307         u32 accuracy = 0;
5308         const char *name = np->name;
5309 +       struct regmap *regmap;
5310  
5311         of_property_read_string(np, "clock-output-names", &name);
5312         of_property_read_u32(np, "clock-frequency", &frequency);
5313         of_property_read_u32(np, "clock-accuracy", &accuracy);
5314  
5315 -       irq = irq_of_parse_and_map(np, 0);
5316 -       if (!irq)
5317 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5318 +       if (IS_ERR(regmap))
5319                 return;
5320  
5321 -       clk = at91_clk_register_main_rc_osc(pmc, irq, name, frequency,
5322 -                                           accuracy);
5323 +       clk = at91_clk_register_main_rc_osc(regmap, name, frequency, accuracy);
5324         if (IS_ERR(clk))
5325                 return;
5326  
5327         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5328  }
5329 +CLK_OF_DECLARE(at91sam9x5_clk_main_rc_osc, "atmel,at91sam9x5-clk-main-rc-osc",
5330 +              of_at91sam9x5_clk_main_rc_osc_setup);
5331  
5332  
5333 -static int clk_main_probe_frequency(struct at91_pmc *pmc)
5334 +static int clk_main_probe_frequency(struct regmap *regmap)
5335  {
5336         unsigned long prep_time, timeout;
5337 -       u32 tmp;
5338 +       unsigned int mcfr;
5339  
5340         timeout = jiffies + usecs_to_jiffies(MAINFRDY_TIMEOUT);
5341         do {
5342                 prep_time = jiffies;
5343 -               tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5344 -               if (tmp & AT91_PMC_MAINRDY)
5345 +               regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5346 +               if (mcfr & AT91_PMC_MAINRDY)
5347                         return 0;
5348                 usleep_range(MAINF_LOOP_MIN_WAIT, MAINF_LOOP_MAX_WAIT);
5349         } while (time_before(prep_time, timeout));
5350 @@ -382,34 +348,37 @@ static int clk_main_probe_frequency(struct at91_pmc *pmc)
5351         return -ETIMEDOUT;
5352  }
5353  
5354 -static unsigned long clk_main_recalc_rate(struct at91_pmc *pmc,
5355 +static unsigned long clk_main_recalc_rate(struct regmap *regmap,
5356                                           unsigned long parent_rate)
5357  {
5358 -       u32 tmp;
5359 +       unsigned int mcfr;
5360  
5361         if (parent_rate)
5362                 return parent_rate;
5363  
5364         pr_warn("Main crystal frequency not set, using approximate value\n");
5365 -       tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5366 -       if (!(tmp & AT91_PMC_MAINRDY))
5367 +       regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5368 +       if (!(mcfr & AT91_PMC_MAINRDY))
5369                 return 0;
5370  
5371 -       return ((tmp & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5372 +       return ((mcfr & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5373  }
5374  
5375  static int clk_rm9200_main_prepare(struct clk_hw *hw)
5376  {
5377         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5378  
5379 -       return clk_main_probe_frequency(clkmain->pmc);
5380 +       return clk_main_probe_frequency(clkmain->regmap);
5381  }
5382  
5383  static int clk_rm9200_main_is_prepared(struct clk_hw *hw)
5384  {
5385         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5386 +       unsigned int status;
5387 +
5388 +       regmap_read(clkmain->regmap, AT91_CKGR_MCFR, &status);
5389  
5390 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MCFR) & AT91_PMC_MAINRDY);
5391 +       return status & AT91_PMC_MAINRDY ? 1 : 0;
5392  }
5393  
5394  static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5395 @@ -417,7 +386,7 @@ static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5396  {
5397         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5398  
5399 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5400 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5401  }
5402  
5403  static const struct clk_ops rm9200_main_ops = {
5404 @@ -427,7 +396,7 @@ static const struct clk_ops rm9200_main_ops = {
5405  };
5406  
5407  static struct clk * __init
5408 -at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5409 +at91_clk_register_rm9200_main(struct regmap *regmap,
5410                               const char *name,
5411                               const char *parent_name)
5412  {
5413 @@ -435,7 +404,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5414         struct clk *clk = NULL;
5415         struct clk_init_data init;
5416  
5417 -       if (!pmc || !name)
5418 +       if (!name)
5419                 return ERR_PTR(-EINVAL);
5420  
5421         if (!parent_name)
5422 @@ -452,7 +421,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5423         init.flags = 0;
5424  
5425         clkmain->hw.init = &init;
5426 -       clkmain->pmc = pmc;
5427 +       clkmain->regmap = regmap;
5428  
5429         clk = clk_register(NULL, &clkmain->hw);
5430         if (IS_ERR(clk))
5431 @@ -461,52 +430,54 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5432         return clk;
5433  }
5434  
5435 -void __init of_at91rm9200_clk_main_setup(struct device_node *np,
5436 -                                        struct at91_pmc *pmc)
5437 +static void __init of_at91rm9200_clk_main_setup(struct device_node *np)
5438  {
5439         struct clk *clk;
5440         const char *parent_name;
5441         const char *name = np->name;
5442 +       struct regmap *regmap;
5443  
5444         parent_name = of_clk_get_parent_name(np, 0);
5445         of_property_read_string(np, "clock-output-names", &name);
5446  
5447 -       clk = at91_clk_register_rm9200_main(pmc, name, parent_name);
5448 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5449 +       if (IS_ERR(regmap))
5450 +               return;
5451 +
5452 +       clk = at91_clk_register_rm9200_main(regmap, name, parent_name);
5453         if (IS_ERR(clk))
5454                 return;
5455  
5456         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5457  }
5458 +CLK_OF_DECLARE(at91rm9200_clk_main, "atmel,at91rm9200-clk-main",
5459 +              of_at91rm9200_clk_main_setup);
5460  
5461 -static irqreturn_t clk_sam9x5_main_irq_handler(int irq, void *dev_id)
5462 +static inline bool clk_sam9x5_main_ready(struct regmap *regmap)
5463  {
5464 -       struct clk_sam9x5_main *clkmain = dev_id;
5465 +       unsigned int status;
5466  
5467 -       wake_up(&clkmain->wait);
5468 -       disable_irq_nosync(clkmain->irq);
5469 +       regmap_read(regmap, AT91_PMC_SR, &status);
5470  
5471 -       return IRQ_HANDLED;
5472 +       return status & AT91_PMC_MOSCSELS ? 1 : 0;
5473  }
5474  
5475  static int clk_sam9x5_main_prepare(struct clk_hw *hw)
5476  {
5477         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5478 -       struct at91_pmc *pmc = clkmain->pmc;
5479 +       struct regmap *regmap = clkmain->regmap;
5480  
5481 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5482 -               enable_irq(clkmain->irq);
5483 -               wait_event(clkmain->wait,
5484 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5485 -       }
5486 +       while (!clk_sam9x5_main_ready(regmap))
5487 +               cpu_relax();
5488  
5489 -       return clk_main_probe_frequency(pmc);
5490 +       return clk_main_probe_frequency(regmap);
5491  }
5492  
5493  static int clk_sam9x5_main_is_prepared(struct clk_hw *hw)
5494  {
5495         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5496  
5497 -       return !!(pmc_read(clkmain->pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5498 +       return clk_sam9x5_main_ready(clkmain->regmap);
5499  }
5500  
5501  static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5502 @@ -514,30 +485,28 @@ static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5503  {
5504         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5505  
5506 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5507 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5508  }
5509  
5510  static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5511  {
5512         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5513 -       struct at91_pmc *pmc = clkmain->pmc;
5514 -       u32 tmp;
5515 +       struct regmap *regmap = clkmain->regmap;
5516 +       unsigned int tmp;
5517  
5518         if (index > 1)
5519                 return -EINVAL;
5520  
5521 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5522 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5523 +       tmp &= ~MOR_KEY_MASK;
5524  
5525         if (index && !(tmp & AT91_PMC_MOSCSEL))
5526 -               pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5527 +               regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5528         else if (!index && (tmp & AT91_PMC_MOSCSEL))
5529 -               pmc_write(pmc, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5530 +               regmap_write(regmap, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5531  
5532 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5533 -               enable_irq(clkmain->irq);
5534 -               wait_event(clkmain->wait,
5535 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5536 -       }
5537 +       while (!clk_sam9x5_main_ready(regmap))
5538 +               cpu_relax();
5539  
5540         return 0;
5541  }
5542 @@ -545,8 +514,11 @@ static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5543  static u8 clk_sam9x5_main_get_parent(struct clk_hw *hw)
5544  {
5545         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5546 +       unsigned int status;
5547 +
5548 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5549  
5550 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN);
5551 +       return status & AT91_PMC_MOSCEN ? 1 : 0;
5552  }
5553  
5554  static const struct clk_ops sam9x5_main_ops = {
5555 @@ -558,18 +530,17 @@ static const struct clk_ops sam9x5_main_ops = {
5556  };
5557  
5558  static struct clk * __init
5559 -at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5560 -                             unsigned int irq,
5561 +at91_clk_register_sam9x5_main(struct regmap *regmap,
5562                               const char *name,
5563                               const char **parent_names,
5564                               int num_parents)
5565  {
5566 -       int ret;
5567         struct clk_sam9x5_main *clkmain;
5568         struct clk *clk = NULL;
5569         struct clk_init_data init;
5570 +       unsigned int status;
5571  
5572 -       if (!pmc || !irq || !name)
5573 +       if (!name)
5574                 return ERR_PTR(-EINVAL);
5575  
5576         if (!parent_names || !num_parents)
5577 @@ -586,51 +557,42 @@ at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5578         init.flags = CLK_SET_PARENT_GATE;
5579  
5580         clkmain->hw.init = &init;
5581 -       clkmain->pmc = pmc;
5582 -       clkmain->irq = irq;
5583 -       clkmain->parent = !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) &
5584 -                            AT91_PMC_MOSCEN);
5585 -       init_waitqueue_head(&clkmain->wait);
5586 -       irq_set_status_flags(clkmain->irq, IRQ_NOAUTOEN);
5587 -       ret = request_irq(clkmain->irq, clk_sam9x5_main_irq_handler,
5588 -                         IRQF_TRIGGER_HIGH, name, clkmain);
5589 -       if (ret)
5590 -               return ERR_PTR(ret);
5591 +       clkmain->regmap = regmap;
5592 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5593 +       clkmain->parent = status & AT91_PMC_MOSCEN ? 1 : 0;
5594  
5595         clk = clk_register(NULL, &clkmain->hw);
5596 -       if (IS_ERR(clk)) {
5597 -               free_irq(clkmain->irq, clkmain);
5598 +       if (IS_ERR(clk))
5599                 kfree(clkmain);
5600 -       }
5601  
5602         return clk;
5603  }
5604  
5605 -void __init of_at91sam9x5_clk_main_setup(struct device_node *np,
5606 -                                        struct at91_pmc *pmc)
5607 +static void __init of_at91sam9x5_clk_main_setup(struct device_node *np)
5608  {
5609         struct clk *clk;
5610         const char *parent_names[2];
5611         int num_parents;
5612 -       unsigned int irq;
5613         const char *name = np->name;
5614 +       struct regmap *regmap;
5615  
5616         num_parents = of_clk_get_parent_count(np);
5617         if (num_parents <= 0 || num_parents > 2)
5618                 return;
5619  
5620         of_clk_parent_fill(np, parent_names, num_parents);
5621 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5622 +       if (IS_ERR(regmap))
5623 +               return;
5624  
5625         of_property_read_string(np, "clock-output-names", &name);
5626  
5627 -       irq = irq_of_parse_and_map(np, 0);
5628 -       if (!irq)
5629 -               return;
5630 -
5631 -       clk = at91_clk_register_sam9x5_main(pmc, irq, name, parent_names,
5632 +       clk = at91_clk_register_sam9x5_main(regmap, name, parent_names,
5633                                             num_parents);
5634         if (IS_ERR(clk))
5635                 return;
5636  
5637         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5638  }
5639 +CLK_OF_DECLARE(at91sam9x5_clk_main, "atmel,at91sam9x5-clk-main",
5640 +              of_at91sam9x5_clk_main_setup);
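
Illustrative sketch, not part of the diff: the clk-main hunks above drop the request_irq()/wait_event() machinery and instead poll AT91_PMC_SR through the parent syscon regmap until AT91_PMC_MOSCSELS is set. A minimal standalone form of that polling idiom, using only the register and bit names visible in the hunks; the helper names main_osc_ready/main_osc_wait_ready are made up for illustration:

#include <linux/regmap.h>
#include <linux/clk/at91_pmc.h>

/* Read the PMC status register and test the main oscillator selection bit. */
static bool main_osc_ready(struct regmap *regmap)
{
        unsigned int status;

        regmap_read(regmap, AT91_PMC_SR, &status);

        return status & AT91_PMC_MOSCSELS;
}

/* Busy-wait, as clk_sam9x5_main_prepare() now does, instead of sleeping on an IRQ. */
static void main_osc_wait_ready(struct regmap *regmap)
{
        while (!main_osc_ready(regmap))
                cpu_relax();    /* cpu_relax() is provided via the usual arch headers */
}

The same shape recurs in the clk-master and clk-pll hunks below (clk_master_ready(), clk_pll_ready()).
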
5641 diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c
5642 index 620ea323356b..7d4a1864ea7c 100644
5643 --- a/drivers/clk/at91/clk-master.c
5644 +++ b/drivers/clk/at91/clk-master.c
5645 @@ -12,13 +12,8 @@
5646  #include <linux/clkdev.h>
5647  #include <linux/clk/at91_pmc.h>
5648  #include <linux/of.h>
5649 -#include <linux/of_address.h>
5650 -#include <linux/of_irq.h>
5651 -#include <linux/io.h>
5652 -#include <linux/wait.h>
5653 -#include <linux/sched.h>
5654 -#include <linux/interrupt.h>
5655 -#include <linux/irq.h>
5656 +#include <linux/mfd/syscon.h>
5657 +#include <linux/regmap.h>
5658  
5659  #include "pmc.h"
5660  
5661 @@ -44,32 +39,26 @@ struct clk_master_layout {
5662  
5663  struct clk_master {
5664         struct clk_hw hw;
5665 -       struct at91_pmc *pmc;
5666 -       unsigned int irq;
5667 -       wait_queue_head_t wait;
5668 +       struct regmap *regmap;
5669         const struct clk_master_layout *layout;
5670         const struct clk_master_characteristics *characteristics;
5671  };
5672  
5673 -static irqreturn_t clk_master_irq_handler(int irq, void *dev_id)
5674 +static inline bool clk_master_ready(struct regmap *regmap)
5675  {
5676 -       struct clk_master *master = (struct clk_master *)dev_id;
5677 +       unsigned int status;
5678  
5679 -       wake_up(&master->wait);
5680 -       disable_irq_nosync(master->irq);
5681 +       regmap_read(regmap, AT91_PMC_SR, &status);
5682  
5683 -       return IRQ_HANDLED;
5684 +       return status & AT91_PMC_MCKRDY ? 1 : 0;
5685  }
5686 +
5687  static int clk_master_prepare(struct clk_hw *hw)
5688  {
5689         struct clk_master *master = to_clk_master(hw);
5690 -       struct at91_pmc *pmc = master->pmc;
5691  
5692 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY)) {
5693 -               enable_irq(master->irq);
5694 -               wait_event(master->wait,
5695 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5696 -       }
5697 +       while (!clk_master_ready(master->regmap))
5698 +               cpu_relax();
5699  
5700         return 0;
5701  }
5702 @@ -78,7 +67,7 @@ static int clk_master_is_prepared(struct clk_hw *hw)
5703  {
5704         struct clk_master *master = to_clk_master(hw);
5705  
5706 -       return !!(pmc_read(master->pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5707 +       return clk_master_ready(master->regmap);
5708  }
5709  
5710  static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5711 @@ -88,18 +77,16 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5712         u8 div;
5713         unsigned long rate = parent_rate;
5714         struct clk_master *master = to_clk_master(hw);
5715 -       struct at91_pmc *pmc = master->pmc;
5716         const struct clk_master_layout *layout = master->layout;
5717         const struct clk_master_characteristics *characteristics =
5718                                                 master->characteristics;
5719 -       u32 tmp;
5720 +       unsigned int mckr;
5721  
5722 -       pmc_lock(pmc);
5723 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & layout->mask;
5724 -       pmc_unlock(pmc);
5725 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5726 +       mckr &= layout->mask;
5727  
5728 -       pres = (tmp >> layout->pres_shift) & MASTER_PRES_MASK;
5729 -       div = (tmp >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5730 +       pres = (mckr >> layout->pres_shift) & MASTER_PRES_MASK;
5731 +       div = (mckr >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5732  
5733         if (characteristics->have_div3_pres && pres == MASTER_PRES_MAX)
5734                 rate /= 3;
5735 @@ -119,9 +106,11 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5736  static u8 clk_master_get_parent(struct clk_hw *hw)
5737  {
5738         struct clk_master *master = to_clk_master(hw);
5739 -       struct at91_pmc *pmc = master->pmc;
5740 +       unsigned int mckr;
5741  
5742 -       return pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_CSS;
5743 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5744 +
5745 +       return mckr & AT91_PMC_CSS;
5746  }
5747  
5748  static const struct clk_ops master_ops = {
5749 @@ -132,18 +121,17 @@ static const struct clk_ops master_ops = {
5750  };
5751  
5752  static struct clk * __init
5753 -at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5754 +at91_clk_register_master(struct regmap *regmap,
5755                 const char *name, int num_parents,
5756                 const char **parent_names,
5757                 const struct clk_master_layout *layout,
5758                 const struct clk_master_characteristics *characteristics)
5759  {
5760 -       int ret;
5761         struct clk_master *master;
5762         struct clk *clk = NULL;
5763         struct clk_init_data init;
5764  
5765 -       if (!pmc || !irq || !name || !num_parents || !parent_names)
5766 +       if (!name || !num_parents || !parent_names)
5767                 return ERR_PTR(-EINVAL);
5768  
5769         master = kzalloc(sizeof(*master), GFP_KERNEL);
5770 @@ -159,20 +147,10 @@ at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5771         master->hw.init = &init;
5772         master->layout = layout;
5773         master->characteristics = characteristics;
5774 -       master->pmc = pmc;
5775 -       master->irq = irq;
5776 -       init_waitqueue_head(&master->wait);
5777 -       irq_set_status_flags(master->irq, IRQ_NOAUTOEN);
5778 -       ret = request_irq(master->irq, clk_master_irq_handler,
5779 -                         IRQF_TRIGGER_HIGH, "clk-master", master);
5780 -       if (ret) {
5781 -               kfree(master);
5782 -               return ERR_PTR(ret);
5783 -       }
5784 +       master->regmap = regmap;
5785  
5786         clk = clk_register(NULL, &master->hw);
5787         if (IS_ERR(clk)) {
5788 -               free_irq(master->irq, master);
5789                 kfree(master);
5790         }
5791  
5792 @@ -217,15 +195,15 @@ out_free_characteristics:
5793  }
5794  
5795  static void __init
5796 -of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5797 +of_at91_clk_master_setup(struct device_node *np,
5798                          const struct clk_master_layout *layout)
5799  {
5800         struct clk *clk;
5801         int num_parents;
5802 -       unsigned int irq;
5803         const char *parent_names[MASTER_SOURCE_MAX];
5804         const char *name = np->name;
5805         struct clk_master_characteristics *characteristics;
5806 +       struct regmap *regmap;
5807  
5808         num_parents = of_clk_get_parent_count(np);
5809         if (num_parents <= 0 || num_parents > MASTER_SOURCE_MAX)
5810 @@ -239,11 +217,11 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5811         if (!characteristics)
5812                 return;
5813  
5814 -       irq = irq_of_parse_and_map(np, 0);
5815 -       if (!irq)
5816 -               goto out_free_characteristics;
5817 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5818 +       if (IS_ERR(regmap))
5819 +               return;
5820  
5821 -       clk = at91_clk_register_master(pmc, irq, name, num_parents,
5822 +       clk = at91_clk_register_master(regmap, name, num_parents,
5823                                        parent_names, layout,
5824                                        characteristics);
5825         if (IS_ERR(clk))
5826 @@ -256,14 +234,16 @@ out_free_characteristics:
5827         kfree(characteristics);
5828  }
5829  
5830 -void __init of_at91rm9200_clk_master_setup(struct device_node *np,
5831 -                                          struct at91_pmc *pmc)
5832 +static void __init of_at91rm9200_clk_master_setup(struct device_node *np)
5833  {
5834 -       of_at91_clk_master_setup(np, pmc, &at91rm9200_master_layout);
5835 +       of_at91_clk_master_setup(np, &at91rm9200_master_layout);
5836  }
5837 +CLK_OF_DECLARE(at91rm9200_clk_master, "atmel,at91rm9200-clk-master",
5838 +              of_at91rm9200_clk_master_setup);
5839  
5840 -void __init of_at91sam9x5_clk_master_setup(struct device_node *np,
5841 -                                          struct at91_pmc *pmc)
5842 +static void __init of_at91sam9x5_clk_master_setup(struct device_node *np)
5843  {
5844 -       of_at91_clk_master_setup(np, pmc, &at91sam9x5_master_layout);
5845 +       of_at91_clk_master_setup(np, &at91sam9x5_master_layout);
5846  }
5847 +CLK_OF_DECLARE(at91sam9x5_clk_master, "atmel,at91sam9x5-clk-master",
5848 +              of_at91sam9x5_clk_master_setup);
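
A second recurring shape, sketched here for reference (again not part of the diff): every of_*_setup() helper below stops taking a struct at91_pmc pointer, looks up the PMC regmap from its parent syscon node, and registers itself with CLK_OF_DECLARE(), which is also why the helpers can become static. Condensed skeleton with placeholder names (foo_clk_register and "vendor,foo-clk" are illustrative, not symbols from the patch):

#include <linux/init.h>
#include <linux/err.h>
#include <linux/of.h>
#include <linux/clk-provider.h>
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>

/* Hypothetical registration helper standing in for at91_clk_register_*(); definition omitted. */
static struct clk *foo_clk_register(struct regmap *regmap, const char *name);

static void __init of_foo_clk_setup(struct device_node *np)
{
        struct regmap *regmap;
        struct clk *clk;

        /* The clock node is a child of the PMC syscon node in the device tree. */
        regmap = syscon_node_to_regmap(of_get_parent(np));
        if (IS_ERR(regmap))
                return;

        clk = foo_clk_register(regmap, np->name);
        if (IS_ERR(clk))
                return;

        of_clk_add_provider(np, of_clk_src_simple_get, clk);
}
CLK_OF_DECLARE(foo_clk, "vendor,foo-clk", of_foo_clk_setup);
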
5849 diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c
5850 index 58f3b568e9cb..d69cd2a121b1 100644
5851 --- a/drivers/clk/at91/clk-peripheral.c
5852 +++ b/drivers/clk/at91/clk-peripheral.c
5853 @@ -12,11 +12,13 @@
5854  #include <linux/clkdev.h>
5855  #include <linux/clk/at91_pmc.h>
5856  #include <linux/of.h>
5857 -#include <linux/of_address.h>
5858 -#include <linux/io.h>
5859 +#include <linux/mfd/syscon.h>
5860 +#include <linux/regmap.h>
5861  
5862  #include "pmc.h"
5863  
5864 +DEFINE_SPINLOCK(pmc_pcr_lock);
5865 +
5866  #define PERIPHERAL_MAX         64
5867  
5868  #define PERIPHERAL_AT91RM9200  0
5869 @@ -33,7 +35,7 @@
5870  
5871  struct clk_peripheral {
5872         struct clk_hw hw;
5873 -       struct at91_pmc *pmc;
5874 +       struct regmap *regmap;
5875         u32 id;
5876  };
5877  
5878 @@ -41,8 +43,9 @@ struct clk_peripheral {
5879  
5880  struct clk_sam9x5_peripheral {
5881         struct clk_hw hw;
5882 -       struct at91_pmc *pmc;
5883 +       struct regmap *regmap;
5884         struct clk_range range;
5885 +       spinlock_t *lock;
5886         u32 id;
5887         u32 div;
5888         bool auto_div;
5889 @@ -54,7 +57,6 @@ struct clk_sam9x5_peripheral {
5890  static int clk_peripheral_enable(struct clk_hw *hw)
5891  {
5892         struct clk_peripheral *periph = to_clk_peripheral(hw);
5893 -       struct at91_pmc *pmc = periph->pmc;
5894         int offset = AT91_PMC_PCER;
5895         u32 id = periph->id;
5896  
5897 @@ -62,14 +64,14 @@ static int clk_peripheral_enable(struct clk_hw *hw)
5898                 return 0;
5899         if (id > PERIPHERAL_ID_MAX)
5900                 offset = AT91_PMC_PCER1;
5901 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5902 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5903 +
5904         return 0;
5905  }
5906  
5907  static void clk_peripheral_disable(struct clk_hw *hw)
5908  {
5909         struct clk_peripheral *periph = to_clk_peripheral(hw);
5910 -       struct at91_pmc *pmc = periph->pmc;
5911         int offset = AT91_PMC_PCDR;
5912         u32 id = periph->id;
5913  
5914 @@ -77,21 +79,23 @@ static void clk_peripheral_disable(struct clk_hw *hw)
5915                 return;
5916         if (id > PERIPHERAL_ID_MAX)
5917                 offset = AT91_PMC_PCDR1;
5918 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5919 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5920  }
5921  
5922  static int clk_peripheral_is_enabled(struct clk_hw *hw)
5923  {
5924         struct clk_peripheral *periph = to_clk_peripheral(hw);
5925 -       struct at91_pmc *pmc = periph->pmc;
5926         int offset = AT91_PMC_PCSR;
5927 +       unsigned int status;
5928         u32 id = periph->id;
5929  
5930         if (id < PERIPHERAL_ID_MIN)
5931                 return 1;
5932         if (id > PERIPHERAL_ID_MAX)
5933                 offset = AT91_PMC_PCSR1;
5934 -       return !!(pmc_read(pmc, offset) & PERIPHERAL_MASK(id));
5935 +       regmap_read(periph->regmap, offset, &status);
5936 +
5937 +       return status & PERIPHERAL_MASK(id) ? 1 : 0;
5938  }
5939  
5940  static const struct clk_ops peripheral_ops = {
5941 @@ -101,14 +105,14 @@ static const struct clk_ops peripheral_ops = {
5942  };
5943  
5944  static struct clk * __init
5945 -at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5946 +at91_clk_register_peripheral(struct regmap *regmap, const char *name,
5947                              const char *parent_name, u32 id)
5948  {
5949         struct clk_peripheral *periph;
5950         struct clk *clk = NULL;
5951         struct clk_init_data init;
5952  
5953 -       if (!pmc || !name || !parent_name || id > PERIPHERAL_ID_MAX)
5954 +       if (!name || !parent_name || id > PERIPHERAL_ID_MAX)
5955                 return ERR_PTR(-EINVAL);
5956  
5957         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
5958 @@ -123,7 +127,7 @@ at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5959  
5960         periph->id = id;
5961         periph->hw.init = &init;
5962 -       periph->pmc = pmc;
5963 +       periph->regmap = regmap;
5964  
5965         clk = clk_register(NULL, &periph->hw);
5966         if (IS_ERR(clk))
5967 @@ -160,53 +164,58 @@ static void clk_sam9x5_peripheral_autodiv(struct clk_sam9x5_peripheral *periph)
5968  static int clk_sam9x5_peripheral_enable(struct clk_hw *hw)
5969  {
5970         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
5971 -       struct at91_pmc *pmc = periph->pmc;
5972 -       u32 tmp;
5973 +       unsigned long flags;
5974  
5975         if (periph->id < PERIPHERAL_ID_MIN)
5976                 return 0;
5977  
5978 -       pmc_lock(pmc);
5979 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
5980 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_DIV_MASK;
5981 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_DIV(periph->div)
5982 -                                        | AT91_PMC_PCR_CMD
5983 -                                        | AT91_PMC_PCR_EN);
5984 -       pmc_unlock(pmc);
5985 +       spin_lock_irqsave(periph->lock, flags);
5986 +       regmap_write(periph->regmap, AT91_PMC_PCR,
5987 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
5988 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
5989 +                          AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
5990 +                          AT91_PMC_PCR_EN,
5991 +                          AT91_PMC_PCR_DIV(periph->div) |
5992 +                          AT91_PMC_PCR_CMD |
5993 +                          AT91_PMC_PCR_EN);
5994 +       spin_unlock_irqrestore(periph->lock, flags);
5995 +
5996         return 0;
5997  }
5998  
5999  static void clk_sam9x5_peripheral_disable(struct clk_hw *hw)
6000  {
6001         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6002 -       struct at91_pmc *pmc = periph->pmc;
6003 -       u32 tmp;
6004 +       unsigned long flags;
6005  
6006         if (periph->id < PERIPHERAL_ID_MIN)
6007                 return;
6008  
6009 -       pmc_lock(pmc);
6010 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6011 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_EN;
6012 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
6013 -       pmc_unlock(pmc);
6014 +       spin_lock_irqsave(periph->lock, flags);
6015 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6016 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6017 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6018 +                          AT91_PMC_PCR_EN | AT91_PMC_PCR_CMD,
6019 +                          AT91_PMC_PCR_CMD);
6020 +       spin_unlock_irqrestore(periph->lock, flags);
6021  }
6022  
6023  static int clk_sam9x5_peripheral_is_enabled(struct clk_hw *hw)
6024  {
6025         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6026 -       struct at91_pmc *pmc = periph->pmc;
6027 -       int ret;
6028 +       unsigned long flags;
6029 +       unsigned int status;
6030  
6031         if (periph->id < PERIPHERAL_ID_MIN)
6032                 return 1;
6033  
6034 -       pmc_lock(pmc);
6035 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6036 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_EN);
6037 -       pmc_unlock(pmc);
6038 +       spin_lock_irqsave(periph->lock, flags);
6039 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6040 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6041 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6042 +       spin_unlock_irqrestore(periph->lock, flags);
6043  
6044 -       return ret;
6045 +       return status & AT91_PMC_PCR_EN ? 1 : 0;
6046  }
6047  
6048  static unsigned long
6049 @@ -214,19 +223,20 @@ clk_sam9x5_peripheral_recalc_rate(struct clk_hw *hw,
6050                                   unsigned long parent_rate)
6051  {
6052         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6053 -       struct at91_pmc *pmc = periph->pmc;
6054 -       u32 tmp;
6055 +       unsigned long flags;
6056 +       unsigned int status;
6057  
6058         if (periph->id < PERIPHERAL_ID_MIN)
6059                 return parent_rate;
6060  
6061 -       pmc_lock(pmc);
6062 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6063 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
6064 -       pmc_unlock(pmc);
6065 +       spin_lock_irqsave(periph->lock, flags);
6066 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6067 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6068 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6069 +       spin_unlock_irqrestore(periph->lock, flags);
6070  
6071 -       if (tmp & AT91_PMC_PCR_EN) {
6072 -               periph->div = PERIPHERAL_RSHIFT(tmp);
6073 +       if (status & AT91_PMC_PCR_EN) {
6074 +               periph->div = PERIPHERAL_RSHIFT(status);
6075                 periph->auto_div = false;
6076         } else {
6077                 clk_sam9x5_peripheral_autodiv(periph);
6078 @@ -318,15 +328,15 @@ static const struct clk_ops sam9x5_peripheral_ops = {
6079  };
6080  
6081  static struct clk * __init
6082 -at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6083 -                                   const char *parent_name, u32 id,
6084 -                                   const struct clk_range *range)
6085 +at91_clk_register_sam9x5_peripheral(struct regmap *regmap, spinlock_t *lock,
6086 +                                   const char *name, const char *parent_name,
6087 +                                   u32 id, const struct clk_range *range)
6088  {
6089         struct clk_sam9x5_peripheral *periph;
6090         struct clk *clk = NULL;
6091         struct clk_init_data init;
6092  
6093 -       if (!pmc || !name || !parent_name)
6094 +       if (!name || !parent_name)
6095                 return ERR_PTR(-EINVAL);
6096  
6097         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6098 @@ -342,7 +352,8 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6099         periph->id = id;
6100         periph->hw.init = &init;
6101         periph->div = 0;
6102 -       periph->pmc = pmc;
6103 +       periph->regmap = regmap;
6104 +       periph->lock = lock;
6105         periph->auto_div = true;
6106         periph->range = *range;
6107  
6108 @@ -356,7 +367,7 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6109  }
6110  
6111  static void __init
6112 -of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6113 +of_at91_clk_periph_setup(struct device_node *np, u8 type)
6114  {
6115         int num;
6116         u32 id;
6117 @@ -364,6 +375,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6118         const char *parent_name;
6119         const char *name;
6120         struct device_node *periphclknp;
6121 +       struct regmap *regmap;
6122  
6123         parent_name = of_clk_get_parent_name(np, 0);
6124         if (!parent_name)
6125 @@ -373,6 +385,10 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6126         if (!num || num > PERIPHERAL_MAX)
6127                 return;
6128  
6129 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6130 +       if (IS_ERR(regmap))
6131 +               return;
6132 +
6133         for_each_child_of_node(np, periphclknp) {
6134                 if (of_property_read_u32(periphclknp, "reg", &id))
6135                         continue;
6136 @@ -384,7 +400,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6137                         name = periphclknp->name;
6138  
6139                 if (type == PERIPHERAL_AT91RM9200) {
6140 -                       clk = at91_clk_register_peripheral(pmc, name,
6141 +                       clk = at91_clk_register_peripheral(regmap, name,
6142                                                            parent_name, id);
6143                 } else {
6144                         struct clk_range range = CLK_RANGE(0, 0);
6145 @@ -393,7 +409,9 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6146                                               "atmel,clk-output-range",
6147                                               &range);
6148  
6149 -                       clk = at91_clk_register_sam9x5_peripheral(pmc, name,
6150 +                       clk = at91_clk_register_sam9x5_peripheral(regmap,
6151 +                                                                 &pmc_pcr_lock,
6152 +                                                                 name,
6153                                                                   parent_name,
6154                                                                   id, &range);
6155                 }
6156 @@ -405,14 +423,16 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6157         }
6158  }
6159  
6160 -void __init of_at91rm9200_clk_periph_setup(struct device_node *np,
6161 -                                          struct at91_pmc *pmc)
6162 +static void __init of_at91rm9200_clk_periph_setup(struct device_node *np)
6163  {
6164 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91RM9200);
6165 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91RM9200);
6166  }
6167 +CLK_OF_DECLARE(at91rm9200_clk_periph, "atmel,at91rm9200-clk-peripheral",
6168 +              of_at91rm9200_clk_periph_setup);
6169  
6170 -void __init of_at91sam9x5_clk_periph_setup(struct device_node *np,
6171 -                                          struct at91_pmc *pmc)
6172 +static void __init of_at91sam9x5_clk_periph_setup(struct device_node *np)
6173  {
6174 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91SAM9X5);
6175 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91SAM9X5);
6176  }
6177 +CLK_OF_DECLARE(at91sam9x5_clk_periph, "atmel,at91sam9x5-clk-peripheral",
6178 +              of_at91sam9x5_clk_periph_setup);
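
For orientation only (not part of the diff): the peripheral-clock hunks above replace the pmc_lock()-protected read/modify/write of AT91_PMC_PCR with regmap accesses guarded by the newly shared pmc_pcr_lock spinlock; the peripheral ID is written first to select the per-peripheral register, then regmap_update_bits() flips the DIV/CMD/EN bits. A rough standalone rendering of the enable path, reusing only macros that appear in the hunks above (periph_clk_enable itself is a made-up name):

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>
#include <linux/spinlock.h>

/* Defined (and now shared) by clk-peripheral.c in this patch. */
extern spinlock_t pmc_pcr_lock;

static void periph_clk_enable(struct regmap *regmap, u32 id, u32 div)
{
        unsigned long flags;

        spin_lock_irqsave(&pmc_pcr_lock, flags);
        /* Select the peripheral whose PCR view should be touched... */
        regmap_write(regmap, AT91_PMC_PCR, id & AT91_PMC_PCR_PID_MASK);
        /* ...then program the divider, issue the write command and enable the clock. */
        regmap_update_bits(regmap, AT91_PMC_PCR,
                           AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD | AT91_PMC_PCR_EN,
                           AT91_PMC_PCR_DIV(div) | AT91_PMC_PCR_CMD | AT91_PMC_PCR_EN);
        spin_unlock_irqrestore(&pmc_pcr_lock, flags);
}
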
6179 diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c
6180 index 18b60f4895a6..fb2e0b56d4b7 100644
6181 --- a/drivers/clk/at91/clk-pll.c
6182 +++ b/drivers/clk/at91/clk-pll.c
6183 @@ -12,14 +12,8 @@
6184  #include <linux/clkdev.h>
6185  #include <linux/clk/at91_pmc.h>
6186  #include <linux/of.h>
6187 -#include <linux/of_address.h>
6188 -#include <linux/of_irq.h>
6189 -#include <linux/io.h>
6190 -#include <linux/kernel.h>
6191 -#include <linux/wait.h>
6192 -#include <linux/sched.h>
6193 -#include <linux/interrupt.h>
6194 -#include <linux/irq.h>
6195 +#include <linux/mfd/syscon.h>
6196 +#include <linux/regmap.h>
6197  
6198  #include "pmc.h"
6199  
6200 @@ -58,9 +52,7 @@ struct clk_pll_layout {
6201  
6202  struct clk_pll {
6203         struct clk_hw hw;
6204 -       struct at91_pmc *pmc;
6205 -       unsigned int irq;
6206 -       wait_queue_head_t wait;
6207 +       struct regmap *regmap;
6208         u8 id;
6209         u8 div;
6210         u8 range;
6211 @@ -69,20 +61,19 @@ struct clk_pll {
6212         const struct clk_pll_characteristics *characteristics;
6213  };
6214  
6215 -static irqreturn_t clk_pll_irq_handler(int irq, void *dev_id)
6216 +static inline bool clk_pll_ready(struct regmap *regmap, int id)
6217  {
6218 -       struct clk_pll *pll = (struct clk_pll *)dev_id;
6219 +       unsigned int status;
6220  
6221 -       wake_up(&pll->wait);
6222 -       disable_irq_nosync(pll->irq);
6223 +       regmap_read(regmap, AT91_PMC_SR, &status);
6224  
6225 -       return IRQ_HANDLED;
6226 +       return status & PLL_STATUS_MASK(id) ? 1 : 0;
6227  }
6228  
6229  static int clk_pll_prepare(struct clk_hw *hw)
6230  {
6231         struct clk_pll *pll = to_clk_pll(hw);
6232 -       struct at91_pmc *pmc = pll->pmc;
6233 +       struct regmap *regmap = pll->regmap;
6234         const struct clk_pll_layout *layout = pll->layout;
6235         const struct clk_pll_characteristics *characteristics =
6236                                                         pll->characteristics;
6237 @@ -90,39 +81,34 @@ static int clk_pll_prepare(struct clk_hw *hw)
6238         u32 mask = PLL_STATUS_MASK(id);
6239         int offset = PLL_REG(id);
6240         u8 out = 0;
6241 -       u32 pllr, icpr;
6242 +       unsigned int pllr;
6243 +       unsigned int status;
6244         u8 div;
6245         u16 mul;
6246  
6247 -       pllr = pmc_read(pmc, offset);
6248 +       regmap_read(regmap, offset, &pllr);
6249         div = PLL_DIV(pllr);
6250         mul = PLL_MUL(pllr, layout);
6251  
6252 -       if ((pmc_read(pmc, AT91_PMC_SR) & mask) &&
6253 +       regmap_read(regmap, AT91_PMC_SR, &status);
6254 +       if ((status & mask) &&
6255             (div == pll->div && mul == pll->mul))
6256                 return 0;
6257  
6258         if (characteristics->out)
6259                 out = characteristics->out[pll->range];
6260 -       if (characteristics->icpll) {
6261 -               icpr = pmc_read(pmc, AT91_PMC_PLLICPR) & ~PLL_ICPR_MASK(id);
6262 -               icpr |= (characteristics->icpll[pll->range] <<
6263 -                       PLL_ICPR_SHIFT(id));
6264 -               pmc_write(pmc, AT91_PMC_PLLICPR, icpr);
6265 -       }
6266  
6267 -       pllr &= ~layout->pllr_mask;
6268 -       pllr |= layout->pllr_mask &
6269 -              (pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6270 -               (out << PLL_OUT_SHIFT) |
6271 -               ((pll->mul & layout->mul_mask) << layout->mul_shift));
6272 -       pmc_write(pmc, offset, pllr);
6273 -
6274 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
6275 -               enable_irq(pll->irq);
6276 -               wait_event(pll->wait,
6277 -                          pmc_read(pmc, AT91_PMC_SR) & mask);
6278 -       }
6279 +       if (characteristics->icpll)
6280 +               regmap_update_bits(regmap, AT91_PMC_PLLICPR, PLL_ICPR_MASK(id),
6281 +                       characteristics->icpll[pll->range] << PLL_ICPR_SHIFT(id));
6282 +
6283 +       regmap_update_bits(regmap, offset, layout->pllr_mask,
6284 +                       pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6285 +                       (out << PLL_OUT_SHIFT) |
6286 +                       ((pll->mul & layout->mul_mask) << layout->mul_shift));
6287 +
6288 +       while (!clk_pll_ready(regmap, pll->id))
6289 +               cpu_relax();
6290  
6291         return 0;
6292  }
6293 @@ -130,32 +116,35 @@ static int clk_pll_prepare(struct clk_hw *hw)
6294  static int clk_pll_is_prepared(struct clk_hw *hw)
6295  {
6296         struct clk_pll *pll = to_clk_pll(hw);
6297 -       struct at91_pmc *pmc = pll->pmc;
6298  
6299 -       return !!(pmc_read(pmc, AT91_PMC_SR) &
6300 -                 PLL_STATUS_MASK(pll->id));
6301 +       return clk_pll_ready(pll->regmap, pll->id);
6302  }
6303  
6304  static void clk_pll_unprepare(struct clk_hw *hw)
6305  {
6306         struct clk_pll *pll = to_clk_pll(hw);
6307 -       struct at91_pmc *pmc = pll->pmc;
6308 -       const struct clk_pll_layout *layout = pll->layout;
6309 -       int offset = PLL_REG(pll->id);
6310 -       u32 tmp = pmc_read(pmc, offset) & ~(layout->pllr_mask);
6311 +       unsigned int mask = pll->layout->pllr_mask;
6312  
6313 -       pmc_write(pmc, offset, tmp);
6314 +       regmap_update_bits(pll->regmap, PLL_REG(pll->id), mask, ~mask);
6315  }
6316  
6317  static unsigned long clk_pll_recalc_rate(struct clk_hw *hw,
6318                                          unsigned long parent_rate)
6319  {
6320         struct clk_pll *pll = to_clk_pll(hw);
6321 +       unsigned int pllr;
6322 +       u16 mul;
6323 +       u8 div;
6324  
6325 -       if (!pll->div || !pll->mul)
6326 +       regmap_read(pll->regmap, PLL_REG(pll->id), &pllr);
6327 +
6328 +       div = PLL_DIV(pllr);
6329 +       mul = PLL_MUL(pllr, pll->layout);
6330 +
6331 +       if (!div || !mul)
6332                 return 0;
6333  
6334 -       return (parent_rate / pll->div) * (pll->mul + 1);
6335 +       return (parent_rate / div) * (mul + 1);
6336  }
6337  
6338  static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
6339 @@ -308,7 +297,7 @@ static const struct clk_ops pll_ops = {
6340  };
6341  
6342  static struct clk * __init
6343 -at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6344 +at91_clk_register_pll(struct regmap *regmap, const char *name,
6345                       const char *parent_name, u8 id,
6346                       const struct clk_pll_layout *layout,
6347                       const struct clk_pll_characteristics *characteristics)
6348 @@ -316,9 +305,8 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6349         struct clk_pll *pll;
6350         struct clk *clk = NULL;
6351         struct clk_init_data init;
6352 -       int ret;
6353         int offset = PLL_REG(id);
6354 -       u32 tmp;
6355 +       unsigned int pllr;
6356  
6357         if (id > PLL_MAX_ID)
6358                 return ERR_PTR(-EINVAL);
6359 @@ -337,23 +325,13 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6360         pll->hw.init = &init;
6361         pll->layout = layout;
6362         pll->characteristics = characteristics;
6363 -       pll->pmc = pmc;
6364 -       pll->irq = irq;
6365 -       tmp = pmc_read(pmc, offset) & layout->pllr_mask;
6366 -       pll->div = PLL_DIV(tmp);
6367 -       pll->mul = PLL_MUL(tmp, layout);
6368 -       init_waitqueue_head(&pll->wait);
6369 -       irq_set_status_flags(pll->irq, IRQ_NOAUTOEN);
6370 -       ret = request_irq(pll->irq, clk_pll_irq_handler, IRQF_TRIGGER_HIGH,
6371 -                         id ? "clk-pllb" : "clk-plla", pll);
6372 -       if (ret) {
6373 -               kfree(pll);
6374 -               return ERR_PTR(ret);
6375 -       }
6376 +       pll->regmap = regmap;
6377 +       regmap_read(regmap, offset, &pllr);
6378 +       pll->div = PLL_DIV(pllr);
6379 +       pll->mul = PLL_MUL(pllr, layout);
6380  
6381         clk = clk_register(NULL, &pll->hw);
6382         if (IS_ERR(clk)) {
6383 -               free_irq(pll->irq, pll);
6384                 kfree(pll);
6385         }
6386  
6387 @@ -483,12 +461,12 @@ out_free_characteristics:
6388  }
6389  
6390  static void __init
6391 -of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6392 +of_at91_clk_pll_setup(struct device_node *np,
6393                       const struct clk_pll_layout *layout)
6394  {
6395         u32 id;
6396 -       unsigned int irq;
6397         struct clk *clk;
6398 +       struct regmap *regmap;
6399         const char *parent_name;
6400         const char *name = np->name;
6401         struct clk_pll_characteristics *characteristics;
6402 @@ -500,15 +478,15 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6403  
6404         of_property_read_string(np, "clock-output-names", &name);
6405  
6406 -       characteristics = of_at91_clk_pll_get_characteristics(np);
6407 -       if (!characteristics)
6408 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6409 +       if (IS_ERR(regmap))
6410                 return;
6411  
6412 -       irq = irq_of_parse_and_map(np, 0);
6413 -       if (!irq)
6414 +       characteristics = of_at91_clk_pll_get_characteristics(np);
6415 +       if (!characteristics)
6416                 return;
6417  
6418 -       clk = at91_clk_register_pll(pmc, irq, name, parent_name, id, layout,
6419 +       clk = at91_clk_register_pll(regmap, name, parent_name, id, layout,
6420                                     characteristics);
6421         if (IS_ERR(clk))
6422                 goto out_free_characteristics;
6423 @@ -520,26 +498,30 @@ out_free_characteristics:
6424         kfree(characteristics);
6425  }
6426  
6427 -void __init of_at91rm9200_clk_pll_setup(struct device_node *np,
6428 -                                              struct at91_pmc *pmc)
6429 +static void __init of_at91rm9200_clk_pll_setup(struct device_node *np)
6430  {
6431 -       of_at91_clk_pll_setup(np, pmc, &at91rm9200_pll_layout);
6432 +       of_at91_clk_pll_setup(np, &at91rm9200_pll_layout);
6433  }
6434 +CLK_OF_DECLARE(at91rm9200_clk_pll, "atmel,at91rm9200-clk-pll",
6435 +              of_at91rm9200_clk_pll_setup);
6436  
6437 -void __init of_at91sam9g45_clk_pll_setup(struct device_node *np,
6438 -                                               struct at91_pmc *pmc)
6439 +static void __init of_at91sam9g45_clk_pll_setup(struct device_node *np)
6440  {
6441 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g45_pll_layout);
6442 +       of_at91_clk_pll_setup(np, &at91sam9g45_pll_layout);
6443  }
6444 +CLK_OF_DECLARE(at91sam9g45_clk_pll, "atmel,at91sam9g45-clk-pll",
6445 +              of_at91sam9g45_clk_pll_setup);
6446  
6447 -void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np,
6448 -                                                struct at91_pmc *pmc)
6449 +static void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np)
6450  {
6451 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g20_pllb_layout);
6452 +       of_at91_clk_pll_setup(np, &at91sam9g20_pllb_layout);
6453  }
6454 +CLK_OF_DECLARE(at91sam9g20_clk_pllb, "atmel,at91sam9g20-clk-pllb",
6455 +              of_at91sam9g20_clk_pllb_setup);
6456  
6457 -void __init of_sama5d3_clk_pll_setup(struct device_node *np,
6458 -                                           struct at91_pmc *pmc)
6459 +static void __init of_sama5d3_clk_pll_setup(struct device_node *np)
6460  {
6461 -       of_at91_clk_pll_setup(np, pmc, &sama5d3_pll_layout);
6462 +       of_at91_clk_pll_setup(np, &sama5d3_pll_layout);
6463  }
6464 +CLK_OF_DECLARE(sama5d3_clk_pll, "atmel,sama5d3-clk-pll",
6465 +              of_sama5d3_clk_pll_setup);
6466 diff --git a/drivers/clk/at91/clk-plldiv.c b/drivers/clk/at91/clk-plldiv.c
6467 index ea226562bb40..2bed26481027 100644
6468 --- a/drivers/clk/at91/clk-plldiv.c
6469 +++ b/drivers/clk/at91/clk-plldiv.c
6470 @@ -12,8 +12,8 @@
6471  #include <linux/clkdev.h>
6472  #include <linux/clk/at91_pmc.h>
6473  #include <linux/of.h>
6474 -#include <linux/of_address.h>
6475 -#include <linux/io.h>
6476 +#include <linux/mfd/syscon.h>
6477 +#include <linux/regmap.h>
6478  
6479  #include "pmc.h"
6480  
6481 @@ -21,16 +21,18 @@
6482  
6483  struct clk_plldiv {
6484         struct clk_hw hw;
6485 -       struct at91_pmc *pmc;
6486 +       struct regmap *regmap;
6487  };
6488  
6489  static unsigned long clk_plldiv_recalc_rate(struct clk_hw *hw,
6490                                             unsigned long parent_rate)
6491  {
6492         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6493 -       struct at91_pmc *pmc = plldiv->pmc;
6494 +       unsigned int mckr;
6495  
6496 -       if (pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_PLLADIV2)
6497 +       regmap_read(plldiv->regmap, AT91_PMC_MCKR, &mckr);
6498 +
6499 +       if (mckr & AT91_PMC_PLLADIV2)
6500                 return parent_rate / 2;
6501  
6502         return parent_rate;
6503 @@ -57,18 +59,12 @@ static int clk_plldiv_set_rate(struct clk_hw *hw, unsigned long rate,
6504                                unsigned long parent_rate)
6505  {
6506         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6507 -       struct at91_pmc *pmc = plldiv->pmc;
6508 -       u32 tmp;
6509  
6510 -       if (parent_rate != rate && (parent_rate / 2) != rate)
6511 +       if ((parent_rate != rate) && (parent_rate / 2 != rate))
6512                 return -EINVAL;
6513  
6514 -       pmc_lock(pmc);
6515 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_PLLADIV2;
6516 -       if ((parent_rate / 2) == rate)
6517 -               tmp |= AT91_PMC_PLLADIV2;
6518 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
6519 -       pmc_unlock(pmc);
6520 +       regmap_update_bits(plldiv->regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
6521 +                          parent_rate != rate ? AT91_PMC_PLLADIV2 : 0);
6522  
6523         return 0;
6524  }
6525 @@ -80,7 +76,7 @@ static const struct clk_ops plldiv_ops = {
6526  };
6527  
6528  static struct clk * __init
6529 -at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6530 +at91_clk_register_plldiv(struct regmap *regmap, const char *name,
6531                          const char *parent_name)
6532  {
6533         struct clk_plldiv *plldiv;
6534 @@ -98,7 +94,7 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6535         init.flags = CLK_SET_RATE_GATE;
6536  
6537         plldiv->hw.init = &init;
6538 -       plldiv->pmc = pmc;
6539 +       plldiv->regmap = regmap;
6540  
6541         clk = clk_register(NULL, &plldiv->hw);
6542  
6543 @@ -109,27 +105,27 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6544  }
6545  
6546  static void __init
6547 -of_at91_clk_plldiv_setup(struct device_node *np, struct at91_pmc *pmc)
6548 +of_at91sam9x5_clk_plldiv_setup(struct device_node *np)
6549  {
6550         struct clk *clk;
6551         const char *parent_name;
6552         const char *name = np->name;
6553 +       struct regmap *regmap;
6554  
6555         parent_name = of_clk_get_parent_name(np, 0);
6556  
6557         of_property_read_string(np, "clock-output-names", &name);
6558  
6559 -       clk = at91_clk_register_plldiv(pmc, name, parent_name);
6560 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6561 +       if (IS_ERR(regmap))
6562 +               return;
6563  
6564 +       clk = at91_clk_register_plldiv(regmap, name, parent_name);
6565         if (IS_ERR(clk))
6566                 return;
6567  
6568         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6569         return;
6570  }
6571 -
6572 -void __init of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
6573 -                                          struct at91_pmc *pmc)
6574 -{
6575 -       of_at91_clk_plldiv_setup(np, pmc);
6576 -}
6577 +CLK_OF_DECLARE(at91sam9x5_clk_plldiv, "atmel,at91sam9x5-clk-plldiv",
6578 +              of_at91sam9x5_clk_plldiv_setup);
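
Side note, not part of the diff: clk-plldiv condenses the whole conversion pattern into a single call, the old pmc_lock()/pmc_read()/pmc_write()/pmc_unlock() sequence around AT91_PMC_MCKR becomes one regmap_update_bits(), with serialization left to the regmap's internal locking. Sketched as a free-standing helper (plladiv2_set is an illustrative name, not from the patch):

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

/* Set or clear PLLADIV2 in MCKR; regmap_update_bits() performs the locked read-modify-write. */
static int plladiv2_set(struct regmap *regmap, bool div2)
{
        return regmap_update_bits(regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
                                  div2 ? AT91_PMC_PLLADIV2 : 0);
}
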
6579 diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
6580 index 14b270b85fec..bc0be629671b 100644
6581 --- a/drivers/clk/at91/clk-programmable.c
6582 +++ b/drivers/clk/at91/clk-programmable.c
6583 @@ -12,10 +12,8 @@
6584  #include <linux/clkdev.h>
6585  #include <linux/clk/at91_pmc.h>
6586  #include <linux/of.h>
6587 -#include <linux/of_address.h>
6588 -#include <linux/io.h>
6589 -#include <linux/wait.h>
6590 -#include <linux/sched.h>
6591 +#include <linux/mfd/syscon.h>
6592 +#include <linux/regmap.h>
6593  
6594  #include "pmc.h"
6595  
6596 @@ -24,6 +22,7 @@
6597  
6598  #define PROG_STATUS_MASK(id)   (1 << ((id) + 8))
6599  #define PROG_PRES_MASK         0x7
6600 +#define PROG_PRES(layout, pckr)        ((pckr >> layout->pres_shift) & PROG_PRES_MASK)
6601  #define PROG_MAX_RM9200_CSS    3
6602  
6603  struct clk_programmable_layout {
6604 @@ -34,7 +33,7 @@ struct clk_programmable_layout {
6605  
6606  struct clk_programmable {
6607         struct clk_hw hw;
6608 -       struct at91_pmc *pmc;
6609 +       struct regmap *regmap;
6610         u8 id;
6611         const struct clk_programmable_layout *layout;
6612  };
6613 @@ -44,14 +43,12 @@ struct clk_programmable {
6614  static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
6615                                                   unsigned long parent_rate)
6616  {
6617 -       u32 pres;
6618         struct clk_programmable *prog = to_clk_programmable(hw);
6619 -       struct at91_pmc *pmc = prog->pmc;
6620 -       const struct clk_programmable_layout *layout = prog->layout;
6621 +       unsigned int pckr;
6622 +
6623 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6624  
6625 -       pres = (pmc_read(pmc, AT91_PMC_PCKR(prog->id)) >> layout->pres_shift) &
6626 -              PROG_PRES_MASK;
6627 -       return parent_rate >> pres;
6628 +       return parent_rate >> PROG_PRES(prog->layout, pckr);
6629  }
6630  
6631  static int clk_programmable_determine_rate(struct clk_hw *hw,
6632 @@ -101,36 +98,36 @@ static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
6633  {
6634         struct clk_programmable *prog = to_clk_programmable(hw);
6635         const struct clk_programmable_layout *layout = prog->layout;
6636 -       struct at91_pmc *pmc = prog->pmc;
6637 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) & ~layout->css_mask;
6638 +       unsigned int mask = layout->css_mask;
6639 +       unsigned int pckr = 0;
6640  
6641         if (layout->have_slck_mck)
6642 -               tmp &= AT91_PMC_CSSMCK_MCK;
6643 +               mask |= AT91_PMC_CSSMCK_MCK;
6644  
6645         if (index > layout->css_mask) {
6646 -               if (index > PROG_MAX_RM9200_CSS && layout->have_slck_mck) {
6647 -                       tmp |= AT91_PMC_CSSMCK_MCK;
6648 -                       return 0;
6649 -               } else {
6650 +               if (index > PROG_MAX_RM9200_CSS && !layout->have_slck_mck)
6651                         return -EINVAL;
6652 -               }
6653 +
6654 +               pckr |= AT91_PMC_CSSMCK_MCK;
6655         }
6656  
6657 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id), tmp | index);
6658 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id), mask, pckr);
6659 +
6660         return 0;
6661  }
6662  
6663  static u8 clk_programmable_get_parent(struct clk_hw *hw)
6664  {
6665 -       u32 tmp;
6666 -       u8 ret;
6667         struct clk_programmable *prog = to_clk_programmable(hw);
6668 -       struct at91_pmc *pmc = prog->pmc;
6669         const struct clk_programmable_layout *layout = prog->layout;
6670 +       unsigned int pckr;
6671 +       u8 ret;
6672 +
6673 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6674 +
6675 +       ret = pckr & layout->css_mask;
6676  
6677 -       tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id));
6678 -       ret = tmp & layout->css_mask;
6679 -       if (layout->have_slck_mck && (tmp & AT91_PMC_CSSMCK_MCK) && !ret)
6680 +       if (layout->have_slck_mck && (pckr & AT91_PMC_CSSMCK_MCK) && !ret)
6681                 ret = PROG_MAX_RM9200_CSS + 1;
6682  
6683         return ret;
6684 @@ -140,26 +137,27 @@ static int clk_programmable_set_rate(struct clk_hw *hw, unsigned long rate,
6685                                      unsigned long parent_rate)
6686  {
6687         struct clk_programmable *prog = to_clk_programmable(hw);
6688 -       struct at91_pmc *pmc = prog->pmc;
6689         const struct clk_programmable_layout *layout = prog->layout;
6690         unsigned long div = parent_rate / rate;
6691 +       unsigned int pckr;
6692         int shift = 0;
6693 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) &
6694 -                 ~(PROG_PRES_MASK << layout->pres_shift);
6695 +
6696 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6697  
6698         if (!div)
6699                 return -EINVAL;
6700  
6701         shift = fls(div) - 1;
6702  
6703 -       if (div != (1<<shift))
6704 +       if (div != (1 << shift))
6705                 return -EINVAL;
6706  
6707         if (shift >= PROG_PRES_MASK)
6708                 return -EINVAL;
6709  
6710 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id),
6711 -                 tmp | (shift << layout->pres_shift));
6712 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id),
6713 +                          PROG_PRES_MASK << layout->pres_shift,
6714 +                          shift << layout->pres_shift);
6715  
6716         return 0;
6717  }
6718 @@ -173,7 +171,7 @@ static const struct clk_ops programmable_ops = {
6719  };
6720  
6721  static struct clk * __init
6722 -at91_clk_register_programmable(struct at91_pmc *pmc,
6723 +at91_clk_register_programmable(struct regmap *regmap,
6724                                const char *name, const char **parent_names,
6725                                u8 num_parents, u8 id,
6726                                const struct clk_programmable_layout *layout)
6727 @@ -198,7 +196,7 @@ at91_clk_register_programmable(struct at91_pmc *pmc,
6728         prog->id = id;
6729         prog->layout = layout;
6730         prog->hw.init = &init;
6731 -       prog->pmc = pmc;
6732 +       prog->regmap = regmap;
6733  
6734         clk = clk_register(NULL, &prog->hw);
6735         if (IS_ERR(clk))
6736 @@ -226,7 +224,7 @@ static const struct clk_programmable_layout at91sam9x5_programmable_layout = {
6737  };
6738  
6739  static void __init
6740 -of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6741 +of_at91_clk_prog_setup(struct device_node *np,
6742                        const struct clk_programmable_layout *layout)
6743  {
6744         int num;
6745 @@ -236,6 +234,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6746         const char *parent_names[PROG_SOURCE_MAX];
6747         const char *name;
6748         struct device_node *progclknp;
6749 +       struct regmap *regmap;
6750  
6751         num_parents = of_clk_get_parent_count(np);
6752         if (num_parents <= 0 || num_parents > PROG_SOURCE_MAX)
6753 @@ -247,6 +246,10 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6754         if (!num || num > (PROG_ID_MAX + 1))
6755                 return;
6756  
6757 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6758 +       if (IS_ERR(regmap))
6759 +               return;
6760 +
6761         for_each_child_of_node(np, progclknp) {
6762                 if (of_property_read_u32(progclknp, "reg", &id))
6763                         continue;
6764 @@ -254,7 +257,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6765                 if (of_property_read_string(np, "clock-output-names", &name))
6766                         name = progclknp->name;
6767  
6768 -               clk = at91_clk_register_programmable(pmc, name,
6769 +               clk = at91_clk_register_programmable(regmap, name,
6770                                                      parent_names, num_parents,
6771                                                      id, layout);
6772                 if (IS_ERR(clk))
6773 @@ -265,20 +268,23 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6774  }
6775  
6776  
6777 -void __init of_at91rm9200_clk_prog_setup(struct device_node *np,
6778 -                                        struct at91_pmc *pmc)
6779 +static void __init of_at91rm9200_clk_prog_setup(struct device_node *np)
6780  {
6781 -       of_at91_clk_prog_setup(np, pmc, &at91rm9200_programmable_layout);
6782 +       of_at91_clk_prog_setup(np, &at91rm9200_programmable_layout);
6783  }
6784 +CLK_OF_DECLARE(at91rm9200_clk_prog, "atmel,at91rm9200-clk-programmable",
6785 +              of_at91rm9200_clk_prog_setup);
6786  
6787 -void __init of_at91sam9g45_clk_prog_setup(struct device_node *np,
6788 -                                         struct at91_pmc *pmc)
6789 +static void __init of_at91sam9g45_clk_prog_setup(struct device_node *np)
6790  {
6791 -       of_at91_clk_prog_setup(np, pmc, &at91sam9g45_programmable_layout);
6792 +       of_at91_clk_prog_setup(np, &at91sam9g45_programmable_layout);
6793  }
6794 +CLK_OF_DECLARE(at91sam9g45_clk_prog, "atmel,at91sam9g45-clk-programmable",
6795 +              of_at91sam9g45_clk_prog_setup);
6796  
6797 -void __init of_at91sam9x5_clk_prog_setup(struct device_node *np,
6798 -                                        struct at91_pmc *pmc)
6799 +static void __init of_at91sam9x5_clk_prog_setup(struct device_node *np)
6800  {
6801 -       of_at91_clk_prog_setup(np, pmc, &at91sam9x5_programmable_layout);
6802 +       of_at91_clk_prog_setup(np, &at91sam9x5_programmable_layout);
6803  }
6804 +CLK_OF_DECLARE(at91sam9x5_clk_prog, "atmel,at91sam9x5-clk-programmable",
6805 +              of_at91sam9x5_clk_prog_setup);
6806 diff --git a/drivers/clk/at91/clk-slow.c b/drivers/clk/at91/clk-slow.c
6807 index d0d5076a9b94..221c09684ba3 100644
6808 --- a/drivers/clk/at91/clk-slow.c
6809 +++ b/drivers/clk/at91/clk-slow.c
6810 @@ -13,17 +13,11 @@
6811  #include <linux/clk.h>
6812  #include <linux/clk-provider.h>
6813  #include <linux/clkdev.h>
6814 -#include <linux/slab.h>
6815  #include <linux/clk/at91_pmc.h>
6816  #include <linux/delay.h>
6817  #include <linux/of.h>
6818 -#include <linux/of_address.h>
6819 -#include <linux/of_irq.h>
6820 -#include <linux/io.h>
6821 -#include <linux/interrupt.h>
6822 -#include <linux/irq.h>
6823 -#include <linux/sched.h>
6824 -#include <linux/wait.h>
6825 +#include <linux/mfd/syscon.h>
6826 +#include <linux/regmap.h>
6827  
6828  #include "pmc.h"
6829  #include "sckc.h"
6830 @@ -59,7 +53,7 @@ struct clk_slow_rc_osc {
6831  
6832  struct clk_sam9260_slow {
6833         struct clk_hw hw;
6834 -       struct at91_pmc *pmc;
6835 +       struct regmap *regmap;
6836  };
6837  
6838  #define to_clk_sam9260_slow(hw) container_of(hw, struct clk_sam9260_slow, hw)
6839 @@ -393,8 +387,11 @@ void __init of_at91sam9x5_clk_slow_setup(struct device_node *np,
6840  static u8 clk_sam9260_slow_get_parent(struct clk_hw *hw)
6841  {
6842         struct clk_sam9260_slow *slowck = to_clk_sam9260_slow(hw);
6843 +       unsigned int status;
6844  
6845 -       return !!(pmc_read(slowck->pmc, AT91_PMC_SR) & AT91_PMC_OSCSEL);
6846 +       regmap_read(slowck->regmap, AT91_PMC_SR, &status);
6847 +
6848 +       return status & AT91_PMC_OSCSEL ? 1 : 0;
6849  }
6850  
6851  static const struct clk_ops sam9260_slow_ops = {
6852 @@ -402,7 +399,7 @@ static const struct clk_ops sam9260_slow_ops = {
6853  };
6854  
6855  static struct clk * __init
6856 -at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6857 +at91_clk_register_sam9260_slow(struct regmap *regmap,
6858                                const char *name,
6859                                const char **parent_names,
6860                                int num_parents)
6861 @@ -411,7 +408,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6862         struct clk *clk = NULL;
6863         struct clk_init_data init;
6864  
6865 -       if (!pmc || !name)
6866 +       if (!name)
6867                 return ERR_PTR(-EINVAL);
6868  
6869         if (!parent_names || !num_parents)
6870 @@ -428,7 +425,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6871         init.flags = 0;
6872  
6873         slowck->hw.init = &init;
6874 -       slowck->pmc = pmc;
6875 +       slowck->regmap = regmap;
6876  
6877         clk = clk_register(NULL, &slowck->hw);
6878         if (IS_ERR(clk))
6879 @@ -439,29 +436,34 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6880         return clk;
6881  }
6882  
6883 -void __init of_at91sam9260_clk_slow_setup(struct device_node *np,
6884 -                                         struct at91_pmc *pmc)
6885 +static void __init of_at91sam9260_clk_slow_setup(struct device_node *np)
6886  {
6887         struct clk *clk;
6888         const char *parent_names[2];
6889         int num_parents;
6890         const char *name = np->name;
6891 +       struct regmap *regmap;
6892  
6893         num_parents = of_clk_get_parent_count(np);
6894         if (num_parents != 2)
6895                 return;
6896  
6897         of_clk_parent_fill(np, parent_names, num_parents);
6898 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6899 +       if (IS_ERR(regmap))
6900 +               return;
6901  
6902         of_property_read_string(np, "clock-output-names", &name);
6903  
6904 -       clk = at91_clk_register_sam9260_slow(pmc, name, parent_names,
6905 +       clk = at91_clk_register_sam9260_slow(regmap, name, parent_names,
6906                                              num_parents);
6907         if (IS_ERR(clk))
6908                 return;
6909  
6910         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6911  }
6912 +CLK_OF_DECLARE(at91sam9260_clk_slow, "atmel,at91sam9260-clk-slow",
6913 +              of_at91sam9260_clk_slow_setup);
6914  
6915  /*
6916   * FIXME: All slow clk users are not properly claiming it (get + prepare +
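The hunks above set the pattern repeated in every file below: the driver stops carrying a struct at91_pmc pointer, looks up the PMC's regmap through its parent syscon node, and registers itself with CLK_OF_DECLARE() instead of being dispatched from pmc.c. A minimal sketch of that shape, using a made-up "foo" clock and compatible string purely for illustration:

        #include <linux/clk-provider.h>
        #include <linux/err.h>
        #include <linux/init.h>
        #include <linux/mfd/syscon.h>
        #include <linux/of.h>
        #include <linux/regmap.h>

        static void __init of_foo_clk_setup(struct device_node *np)
        {
                struct regmap *regmap;

                /* The PMC node is the parent and is exposed as a syscon,
                 * so its regmap replaces the at91_pmc pointer that used
                 * to be handed down from pmc.c. */
                regmap = syscon_node_to_regmap(of_get_parent(np));
                if (IS_ERR(regmap))
                        return;

                /* ...read properties, register the clk against the regmap,
                 * then of_clk_add_provider(), as in the drivers above... */
        }
        CLK_OF_DECLARE(foo_clk, "vendor,foo-clk", of_foo_clk_setup);
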
6917 diff --git a/drivers/clk/at91/clk-smd.c b/drivers/clk/at91/clk-smd.c
6918 index a7f8501cfa05..e6948a52005a 100644
6919 --- a/drivers/clk/at91/clk-smd.c
6920 +++ b/drivers/clk/at91/clk-smd.c
6921 @@ -12,8 +12,8 @@
6922  #include <linux/clkdev.h>
6923  #include <linux/clk/at91_pmc.h>
6924  #include <linux/of.h>
6925 -#include <linux/of_address.h>
6926 -#include <linux/io.h>
6927 +#include <linux/mfd/syscon.h>
6928 +#include <linux/regmap.h>
6929  
6930  #include "pmc.h"
6931  
6932 @@ -24,7 +24,7 @@
6933  
6934  struct at91sam9x5_clk_smd {
6935         struct clk_hw hw;
6936 -       struct at91_pmc *pmc;
6937 +       struct regmap *regmap;
6938  };
6939  
6940  #define to_at91sam9x5_clk_smd(hw) \
6941 @@ -33,13 +33,13 @@ struct at91sam9x5_clk_smd {
6942  static unsigned long at91sam9x5_clk_smd_recalc_rate(struct clk_hw *hw,
6943                                                     unsigned long parent_rate)
6944  {
6945 -       u32 tmp;
6946 -       u8 smddiv;
6947         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6948 -       struct at91_pmc *pmc = smd->pmc;
6949 +       unsigned int smdr;
6950 +       u8 smddiv;
6951 +
6952 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
6953 +       smddiv = (smdr & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6954  
6955 -       tmp = pmc_read(pmc, AT91_PMC_SMD);
6956 -       smddiv = (tmp & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6957         return parent_rate / (smddiv + 1);
6958  }
6959  
6960 @@ -67,40 +67,38 @@ static long at91sam9x5_clk_smd_round_rate(struct clk_hw *hw, unsigned long rate,
6961  
6962  static int at91sam9x5_clk_smd_set_parent(struct clk_hw *hw, u8 index)
6963  {
6964 -       u32 tmp;
6965         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6966 -       struct at91_pmc *pmc = smd->pmc;
6967  
6968         if (index > 1)
6969                 return -EINVAL;
6970 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMDS;
6971 -       if (index)
6972 -               tmp |= AT91_PMC_SMDS;
6973 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
6974 +
6975 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMDS,
6976 +                          index ? AT91_PMC_SMDS : 0);
6977 +
6978         return 0;
6979  }
6980  
6981  static u8 at91sam9x5_clk_smd_get_parent(struct clk_hw *hw)
6982  {
6983         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6984 -       struct at91_pmc *pmc = smd->pmc;
6985 +       unsigned int smdr;
6986  
6987 -       return pmc_read(pmc, AT91_PMC_SMD) & AT91_PMC_SMDS;
6988 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
6989 +
6990 +       return smdr & AT91_PMC_SMDS;
6991  }
6992  
6993  static int at91sam9x5_clk_smd_set_rate(struct clk_hw *hw, unsigned long rate,
6994                                        unsigned long parent_rate)
6995  {
6996 -       u32 tmp;
6997         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6998 -       struct at91_pmc *pmc = smd->pmc;
6999         unsigned long div = parent_rate / rate;
7000  
7001         if (parent_rate % rate || div < 1 || div > (SMD_MAX_DIV + 1))
7002                 return -EINVAL;
7003 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMD_DIV;
7004 -       tmp |= (div - 1) << SMD_DIV_SHIFT;
7005 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7006 +
7007 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
7008 +                          (div - 1) << SMD_DIV_SHIFT);
7009  
7010         return 0;
7011  }
7012 @@ -114,7 +112,7 @@ static const struct clk_ops at91sam9x5_smd_ops = {
7013  };
7014  
7015  static struct clk * __init
7016 -at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7017 +at91sam9x5_clk_register_smd(struct regmap *regmap, const char *name,
7018                             const char **parent_names, u8 num_parents)
7019  {
7020         struct at91sam9x5_clk_smd *smd;
7021 @@ -132,7 +130,7 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7022         init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE;
7023  
7024         smd->hw.init = &init;
7025 -       smd->pmc = pmc;
7026 +       smd->regmap = regmap;
7027  
7028         clk = clk_register(NULL, &smd->hw);
7029         if (IS_ERR(clk))
7030 @@ -141,13 +139,13 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7031         return clk;
7032  }
7033  
7034 -void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7035 -                                       struct at91_pmc *pmc)
7036 +static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np)
7037  {
7038         struct clk *clk;
7039         int num_parents;
7040         const char *parent_names[SMD_SOURCE_MAX];
7041         const char *name = np->name;
7042 +       struct regmap *regmap;
7043  
7044         num_parents = of_clk_get_parent_count(np);
7045         if (num_parents <= 0 || num_parents > SMD_SOURCE_MAX)
7046 @@ -157,10 +155,16 @@ void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7047  
7048         of_property_read_string(np, "clock-output-names", &name);
7049  
7050 -       clk = at91sam9x5_clk_register_smd(pmc, name, parent_names,
7051 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7052 +       if (IS_ERR(regmap))
7053 +               return;
7054 +
7055 +       clk = at91sam9x5_clk_register_smd(regmap, name, parent_names,
7056                                           num_parents);
7057         if (IS_ERR(clk))
7058                 return;
7059  
7060         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7061  }
7062 +CLK_OF_DECLARE(at91sam9x5_clk_smd, "atmel,at91sam9x5-clk-smd",
7063 +              of_at91sam9x5_clk_smd_setup);
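Note how each open-coded read/modify/write pair on the PMC collapses into a single regmap_update_bits() call, which applies the mask and value under the regmap's internal locking, while plain reads move from a return value to an out-parameter. A rough equivalence, with REG, MASK and VAL standing in for whichever register and field a driver touches:

        /* before: explicit read-modify-write on the ioremapped PMC */
        u32 tmp = pmc_read(pmc, REG) & ~MASK;
        tmp |= VAL;
        pmc_write(pmc, REG, tmp);

        /* after: one call, serialized by the regmap's own lock */
        regmap_update_bits(regmap, REG, MASK, VAL);

        /* reads: regmap_read() fills an unsigned int and returns an error code */
        unsigned int v;
        regmap_read(regmap, REG, &v);
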
7064 diff --git a/drivers/clk/at91/clk-system.c b/drivers/clk/at91/clk-system.c
7065 index 3f5314344286..8f35d8172909 100644
7066 --- a/drivers/clk/at91/clk-system.c
7067 +++ b/drivers/clk/at91/clk-system.c
7068 @@ -12,13 +12,8 @@
7069  #include <linux/clkdev.h>
7070  #include <linux/clk/at91_pmc.h>
7071  #include <linux/of.h>
7072 -#include <linux/of_address.h>
7073 -#include <linux/io.h>
7074 -#include <linux/irq.h>
7075 -#include <linux/of_irq.h>
7076 -#include <linux/interrupt.h>
7077 -#include <linux/wait.h>
7078 -#include <linux/sched.h>
7079 +#include <linux/mfd/syscon.h>
7080 +#include <linux/regmap.h>
7081  
7082  #include "pmc.h"
7083  
7084 @@ -29,9 +24,7 @@
7085  #define to_clk_system(hw) container_of(hw, struct clk_system, hw)
7086  struct clk_system {
7087         struct clk_hw hw;
7088 -       struct at91_pmc *pmc;
7089 -       unsigned int irq;
7090 -       wait_queue_head_t wait;
7091 +       struct regmap *regmap;
7092         u8 id;
7093  };
7094  
7095 @@ -39,58 +32,54 @@ static inline int is_pck(int id)
7096  {
7097         return (id >= 8) && (id <= 15);
7098  }
7099 -static irqreturn_t clk_system_irq_handler(int irq, void *dev_id)
7100 +
7101 +static inline bool clk_system_ready(struct regmap *regmap, int id)
7102  {
7103 -       struct clk_system *sys = (struct clk_system *)dev_id;
7104 +       unsigned int status;
7105  
7106 -       wake_up(&sys->wait);
7107 -       disable_irq_nosync(sys->irq);
7108 +       regmap_read(regmap, AT91_PMC_SR, &status);
7109  
7110 -       return IRQ_HANDLED;
7111 +       return status & (1 << id) ? 1 : 0;
7112  }
7113  
7114  static int clk_system_prepare(struct clk_hw *hw)
7115  {
7116         struct clk_system *sys = to_clk_system(hw);
7117 -       struct at91_pmc *pmc = sys->pmc;
7118 -       u32 mask = 1 << sys->id;
7119  
7120 -       pmc_write(pmc, AT91_PMC_SCER, mask);
7121 +       regmap_write(sys->regmap, AT91_PMC_SCER, 1 << sys->id);
7122  
7123         if (!is_pck(sys->id))
7124                 return 0;
7125  
7126 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
7127 -               if (sys->irq) {
7128 -                       enable_irq(sys->irq);
7129 -                       wait_event(sys->wait,
7130 -                                  pmc_read(pmc, AT91_PMC_SR) & mask);
7131 -               } else
7132 -                       cpu_relax();
7133 -       }
7134 +       while (!clk_system_ready(sys->regmap, sys->id))
7135 +               cpu_relax();
7136 +
7137         return 0;
7138  }
7139  
7140  static void clk_system_unprepare(struct clk_hw *hw)
7141  {
7142         struct clk_system *sys = to_clk_system(hw);
7143 -       struct at91_pmc *pmc = sys->pmc;
7144  
7145 -       pmc_write(pmc, AT91_PMC_SCDR, 1 << sys->id);
7146 +       regmap_write(sys->regmap, AT91_PMC_SCDR, 1 << sys->id);
7147  }
7148  
7149  static int clk_system_is_prepared(struct clk_hw *hw)
7150  {
7151         struct clk_system *sys = to_clk_system(hw);
7152 -       struct at91_pmc *pmc = sys->pmc;
7153 +       unsigned int status;
7154 +
7155 +       regmap_read(sys->regmap, AT91_PMC_SCSR, &status);
7156  
7157 -       if (!(pmc_read(pmc, AT91_PMC_SCSR) & (1 << sys->id)))
7158 +       if (!(status & (1 << sys->id)))
7159                 return 0;
7160  
7161         if (!is_pck(sys->id))
7162                 return 1;
7163  
7164 -       return !!(pmc_read(pmc, AT91_PMC_SR) & (1 << sys->id));
7165 +       regmap_read(sys->regmap, AT91_PMC_SR, &status);
7166 +
7167 +       return status & (1 << sys->id) ? 1 : 0;
7168  }
7169  
7170  static const struct clk_ops system_ops = {
7171 @@ -100,13 +89,12 @@ static const struct clk_ops system_ops = {
7172  };
7173  
7174  static struct clk * __init
7175 -at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7176 -                        const char *parent_name, u8 id, int irq)
7177 +at91_clk_register_system(struct regmap *regmap, const char *name,
7178 +                        const char *parent_name, u8 id)
7179  {
7180         struct clk_system *sys;
7181         struct clk *clk = NULL;
7182         struct clk_init_data init;
7183 -       int ret;
7184  
7185         if (!parent_name || id > SYSTEM_MAX_ID)
7186                 return ERR_PTR(-EINVAL);
7187 @@ -123,44 +111,33 @@ at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7188  
7189         sys->id = id;
7190         sys->hw.init = &init;
7191 -       sys->pmc = pmc;
7192 -       sys->irq = irq;
7193 -       if (irq) {
7194 -               init_waitqueue_head(&sys->wait);
7195 -               irq_set_status_flags(sys->irq, IRQ_NOAUTOEN);
7196 -               ret = request_irq(sys->irq, clk_system_irq_handler,
7197 -                               IRQF_TRIGGER_HIGH, name, sys);
7198 -               if (ret) {
7199 -                       kfree(sys);
7200 -                       return ERR_PTR(ret);
7201 -               }
7202 -       }
7203 +       sys->regmap = regmap;
7204  
7205         clk = clk_register(NULL, &sys->hw);
7206 -       if (IS_ERR(clk)) {
7207 -               if (irq)
7208 -                       free_irq(sys->irq, sys);
7209 +       if (IS_ERR(clk))
7210                 kfree(sys);
7211 -       }
7212  
7213         return clk;
7214  }
7215  
7216 -static void __init
7217 -of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7218 +static void __init of_at91rm9200_clk_sys_setup(struct device_node *np)
7219  {
7220         int num;
7221 -       int irq = 0;
7222         u32 id;
7223         struct clk *clk;
7224         const char *name;
7225         struct device_node *sysclknp;
7226         const char *parent_name;
7227 +       struct regmap *regmap;
7228  
7229         num = of_get_child_count(np);
7230         if (num > (SYSTEM_MAX_ID + 1))
7231                 return;
7232  
7233 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7234 +       if (IS_ERR(regmap))
7235 +               return;
7236 +
7237         for_each_child_of_node(np, sysclknp) {
7238                 if (of_property_read_u32(sysclknp, "reg", &id))
7239                         continue;
7240 @@ -168,21 +145,14 @@ of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7241                 if (of_property_read_string(np, "clock-output-names", &name))
7242                         name = sysclknp->name;
7243  
7244 -               if (is_pck(id))
7245 -                       irq = irq_of_parse_and_map(sysclknp, 0);
7246 -
7247                 parent_name = of_clk_get_parent_name(sysclknp, 0);
7248  
7249 -               clk = at91_clk_register_system(pmc, name, parent_name, id, irq);
7250 +               clk = at91_clk_register_system(regmap, name, parent_name, id);
7251                 if (IS_ERR(clk))
7252                         continue;
7253  
7254                 of_clk_add_provider(sysclknp, of_clk_src_simple_get, clk);
7255         }
7256  }
7257 -
7258 -void __init of_at91rm9200_clk_sys_setup(struct device_node *np,
7259 -                                       struct at91_pmc *pmc)
7260 -{
7261 -       of_at91_clk_sys_setup(np, pmc);
7262 -}
7263 +CLK_OF_DECLARE(at91rm9200_clk_sys, "atmel,at91rm9200-clk-system",
7264 +              of_at91rm9200_clk_sys_setup);
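With the PMC interrupt machinery removed, clk_system_prepare() above no longer sleeps on a wait queue for the PCK-ready bit; it polls AT91_PMC_SR with cpu_relax() until the bit is set (the same idiom reappears for the UTMI PLL lock in clk-utmi.c below). A sketch of that helper shape, using the real AT91_PMC_SR/AT91_PMC_LOCKU constants but a generic mask parameter:

        #include <linux/clk/at91_pmc.h>
        #include <linux/regmap.h>

        static bool clk_ready(struct regmap *regmap, unsigned int mask)
        {
                unsigned int status = 0;

                /* read errors are ignored, as in the drivers above */
                regmap_read(regmap, AT91_PMC_SR, &status);

                return status & mask;
        }

        /* caller side, e.g. waiting for the UTMI PLL lock:
         * a busy-wait, on the assumption the hardware asserts the bit quickly */
        while (!clk_ready(regmap, AT91_PMC_LOCKU))
                cpu_relax();
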
7265 diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
7266 index 8ab8502778a2..650ca45892c0 100644
7267 --- a/drivers/clk/at91/clk-usb.c
7268 +++ b/drivers/clk/at91/clk-usb.c
7269 @@ -12,8 +12,8 @@
7270  #include <linux/clkdev.h>
7271  #include <linux/clk/at91_pmc.h>
7272  #include <linux/of.h>
7273 -#include <linux/of_address.h>
7274 -#include <linux/io.h>
7275 +#include <linux/mfd/syscon.h>
7276 +#include <linux/regmap.h>
7277  
7278  #include "pmc.h"
7279  
7280 @@ -27,7 +27,7 @@
7281  
7282  struct at91sam9x5_clk_usb {
7283         struct clk_hw hw;
7284 -       struct at91_pmc *pmc;
7285 +       struct regmap *regmap;
7286  };
7287  
7288  #define to_at91sam9x5_clk_usb(hw) \
7289 @@ -35,7 +35,7 @@ struct at91sam9x5_clk_usb {
7290  
7291  struct at91rm9200_clk_usb {
7292         struct clk_hw hw;
7293 -       struct at91_pmc *pmc;
7294 +       struct regmap *regmap;
7295         u32 divisors[4];
7296  };
7297  
7298 @@ -45,13 +45,12 @@ struct at91rm9200_clk_usb {
7299  static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
7300                                                     unsigned long parent_rate)
7301  {
7302 -       u32 tmp;
7303 -       u8 usbdiv;
7304         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7305 -       struct at91_pmc *pmc = usb->pmc;
7306 +       unsigned int usbr;
7307 +       u8 usbdiv;
7308  
7309 -       tmp = pmc_read(pmc, AT91_PMC_USB);
7310 -       usbdiv = (tmp & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7311 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7312 +       usbdiv = (usbr & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7313  
7314         return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
7315  }
7316 @@ -109,33 +108,31 @@ static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
7317  
7318  static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
7319  {
7320 -       u32 tmp;
7321         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7322 -       struct at91_pmc *pmc = usb->pmc;
7323  
7324         if (index > 1)
7325                 return -EINVAL;
7326 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS;
7327 -       if (index)
7328 -               tmp |= AT91_PMC_USBS;
7329 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7330 +
7331 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7332 +                          index ? AT91_PMC_USBS : 0);
7333 +
7334         return 0;
7335  }
7336  
7337  static u8 at91sam9x5_clk_usb_get_parent(struct clk_hw *hw)
7338  {
7339         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7340 -       struct at91_pmc *pmc = usb->pmc;
7341 +       unsigned int usbr;
7342  
7343 -       return pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS;
7344 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7345 +
7346 +       return usbr & AT91_PMC_USBS;
7347  }
7348  
7349  static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7350                                        unsigned long parent_rate)
7351  {
7352 -       u32 tmp;
7353         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7354 -       struct at91_pmc *pmc = usb->pmc;
7355         unsigned long div;
7356  
7357         if (!rate)
7358 @@ -145,9 +142,8 @@ static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7359         if (div > SAM9X5_USB_MAX_DIV + 1 || !div)
7360                 return -EINVAL;
7361  
7362 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_OHCIUSBDIV;
7363 -       tmp |= (div - 1) << SAM9X5_USB_DIV_SHIFT;
7364 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7365 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_OHCIUSBDIV,
7366 +                          (div - 1) << SAM9X5_USB_DIV_SHIFT);
7367  
7368         return 0;
7369  }
7370 @@ -163,28 +159,28 @@ static const struct clk_ops at91sam9x5_usb_ops = {
7371  static int at91sam9n12_clk_usb_enable(struct clk_hw *hw)
7372  {
7373         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7374 -       struct at91_pmc *pmc = usb->pmc;
7375  
7376 -       pmc_write(pmc, AT91_PMC_USB,
7377 -                 pmc_read(pmc, AT91_PMC_USB) | AT91_PMC_USBS);
7378 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7379 +                          AT91_PMC_USBS);
7380 +
7381         return 0;
7382  }
7383  
7384  static void at91sam9n12_clk_usb_disable(struct clk_hw *hw)
7385  {
7386         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7387 -       struct at91_pmc *pmc = usb->pmc;
7388  
7389 -       pmc_write(pmc, AT91_PMC_USB,
7390 -                 pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS);
7391 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
7392  }
7393  
7394  static int at91sam9n12_clk_usb_is_enabled(struct clk_hw *hw)
7395  {
7396         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7397 -       struct at91_pmc *pmc = usb->pmc;
7398 +       unsigned int usbr;
7399  
7400 -       return !!(pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS);
7401 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7402 +
7403 +       return usbr & AT91_PMC_USBS;
7404  }
7405  
7406  static const struct clk_ops at91sam9n12_usb_ops = {
7407 @@ -197,7 +193,7 @@ static const struct clk_ops at91sam9n12_usb_ops = {
7408  };
7409  
7410  static struct clk * __init
7411 -at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7412 +at91sam9x5_clk_register_usb(struct regmap *regmap, const char *name,
7413                             const char **parent_names, u8 num_parents)
7414  {
7415         struct at91sam9x5_clk_usb *usb;
7416 @@ -216,7 +212,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7417                      CLK_SET_RATE_PARENT;
7418  
7419         usb->hw.init = &init;
7420 -       usb->pmc = pmc;
7421 +       usb->regmap = regmap;
7422  
7423         clk = clk_register(NULL, &usb->hw);
7424         if (IS_ERR(clk))
7425 @@ -226,7 +222,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7426  }
7427  
7428  static struct clk * __init
7429 -at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7430 +at91sam9n12_clk_register_usb(struct regmap *regmap, const char *name,
7431                              const char *parent_name)
7432  {
7433         struct at91sam9x5_clk_usb *usb;
7434 @@ -244,7 +240,7 @@ at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7435         init.flags = CLK_SET_RATE_GATE | CLK_SET_RATE_PARENT;
7436  
7437         usb->hw.init = &init;
7438 -       usb->pmc = pmc;
7439 +       usb->regmap = regmap;
7440  
7441         clk = clk_register(NULL, &usb->hw);
7442         if (IS_ERR(clk))
7443 @@ -257,12 +253,12 @@ static unsigned long at91rm9200_clk_usb_recalc_rate(struct clk_hw *hw,
7444                                                     unsigned long parent_rate)
7445  {
7446         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7447 -       struct at91_pmc *pmc = usb->pmc;
7448 -       u32 tmp;
7449 +       unsigned int pllbr;
7450         u8 usbdiv;
7451  
7452 -       tmp = pmc_read(pmc, AT91_CKGR_PLLBR);
7453 -       usbdiv = (tmp & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7454 +       regmap_read(usb->regmap, AT91_CKGR_PLLBR, &pllbr);
7455 +
7456 +       usbdiv = (pllbr & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7457         if (usb->divisors[usbdiv])
7458                 return parent_rate / usb->divisors[usbdiv];
7459  
7460 @@ -310,10 +306,8 @@ static long at91rm9200_clk_usb_round_rate(struct clk_hw *hw, unsigned long rate,
7461  static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7462                                        unsigned long parent_rate)
7463  {
7464 -       u32 tmp;
7465         int i;
7466         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7467 -       struct at91_pmc *pmc = usb->pmc;
7468         unsigned long div;
7469  
7470         if (!rate)
7471 @@ -323,10 +317,10 @@ static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7472  
7473         for (i = 0; i < RM9200_USB_DIV_TAB_SIZE; i++) {
7474                 if (usb->divisors[i] == div) {
7475 -                       tmp = pmc_read(pmc, AT91_CKGR_PLLBR) &
7476 -                             ~AT91_PMC_USBDIV;
7477 -                       tmp |= i << RM9200_USB_DIV_SHIFT;
7478 -                       pmc_write(pmc, AT91_CKGR_PLLBR, tmp);
7479 +                       regmap_update_bits(usb->regmap, AT91_CKGR_PLLBR,
7480 +                                          AT91_PMC_USBDIV,
7481 +                                          i << RM9200_USB_DIV_SHIFT);
7482 +
7483                         return 0;
7484                 }
7485         }
7486 @@ -341,7 +335,7 @@ static const struct clk_ops at91rm9200_usb_ops = {
7487  };
7488  
7489  static struct clk * __init
7490 -at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7491 +at91rm9200_clk_register_usb(struct regmap *regmap, const char *name,
7492                             const char *parent_name, const u32 *divisors)
7493  {
7494         struct at91rm9200_clk_usb *usb;
7495 @@ -359,7 +353,7 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7496         init.flags = CLK_SET_RATE_PARENT;
7497  
7498         usb->hw.init = &init;
7499 -       usb->pmc = pmc;
7500 +       usb->regmap = regmap;
7501         memcpy(usb->divisors, divisors, sizeof(usb->divisors));
7502  
7503         clk = clk_register(NULL, &usb->hw);
7504 @@ -369,13 +363,13 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7505         return clk;
7506  }
7507  
7508 -void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7509 -                                       struct at91_pmc *pmc)
7510 +static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np)
7511  {
7512         struct clk *clk;
7513         int num_parents;
7514         const char *parent_names[USB_SOURCE_MAX];
7515         const char *name = np->name;
7516 +       struct regmap *regmap;
7517  
7518         num_parents = of_clk_get_parent_count(np);
7519         if (num_parents <= 0 || num_parents > USB_SOURCE_MAX)
7520 @@ -385,19 +379,26 @@ void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7521  
7522         of_property_read_string(np, "clock-output-names", &name);
7523  
7524 -       clk = at91sam9x5_clk_register_usb(pmc, name, parent_names, num_parents);
7525 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7526 +       if (IS_ERR(regmap))
7527 +               return;
7528 +
7529 +       clk = at91sam9x5_clk_register_usb(regmap, name, parent_names,
7530 +                                         num_parents);
7531         if (IS_ERR(clk))
7532                 return;
7533  
7534         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7535  }
7536 +CLK_OF_DECLARE(at91sam9x5_clk_usb, "atmel,at91sam9x5-clk-usb",
7537 +              of_at91sam9x5_clk_usb_setup);
7538  
7539 -void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7540 -                                        struct at91_pmc *pmc)
7541 +static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np)
7542  {
7543         struct clk *clk;
7544         const char *parent_name;
7545         const char *name = np->name;
7546 +       struct regmap *regmap;
7547  
7548         parent_name = of_clk_get_parent_name(np, 0);
7549         if (!parent_name)
7550 @@ -405,20 +406,26 @@ void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7551  
7552         of_property_read_string(np, "clock-output-names", &name);
7553  
7554 -       clk = at91sam9n12_clk_register_usb(pmc, name, parent_name);
7555 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7556 +       if (IS_ERR(regmap))
7557 +               return;
7558 +
7559 +       clk = at91sam9n12_clk_register_usb(regmap, name, parent_name);
7560         if (IS_ERR(clk))
7561                 return;
7562  
7563         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7564  }
7565 +CLK_OF_DECLARE(at91sam9n12_clk_usb, "atmel,at91sam9n12-clk-usb",
7566 +              of_at91sam9n12_clk_usb_setup);
7567  
7568 -void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7569 -                                       struct at91_pmc *pmc)
7570 +static void __init of_at91rm9200_clk_usb_setup(struct device_node *np)
7571  {
7572         struct clk *clk;
7573         const char *parent_name;
7574         const char *name = np->name;
7575         u32 divisors[4] = {0, 0, 0, 0};
7576 +       struct regmap *regmap;
7577  
7578         parent_name = of_clk_get_parent_name(np, 0);
7579         if (!parent_name)
7580 @@ -430,9 +437,15 @@ void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7581  
7582         of_property_read_string(np, "clock-output-names", &name);
7583  
7584 -       clk = at91rm9200_clk_register_usb(pmc, name, parent_name, divisors);
7585 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7586 +       if (IS_ERR(regmap))
7587 +               return;
7588 +
7589 +       clk = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors);
7590         if (IS_ERR(clk))
7591                 return;
7592  
7593         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7594  }
7595 +CLK_OF_DECLARE(at91rm9200_clk_usb, "atmel,at91rm9200-clk-usb",
7596 +              of_at91rm9200_clk_usb_setup);
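The enable/disable pair above also shows the regmap idiom for setting and clearing individual bits: pass the bit as both mask and value to set it, and the bit as mask with a value of 0 to clear it (clk-utmi.c below uses the same form with several bits OR'ed together). With the constants from <linux/clk/at91_pmc.h>:

        /* set AT91_PMC_USBS in the USB clock register */
        regmap_update_bits(regmap, AT91_PMC_USB, AT91_PMC_USBS, AT91_PMC_USBS);

        /* clear AT91_PMC_USBS */
        regmap_update_bits(regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
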
7597 diff --git a/drivers/clk/at91/clk-utmi.c b/drivers/clk/at91/clk-utmi.c
7598 index ca561e90a60f..61fcf399e58c 100644
7599 --- a/drivers/clk/at91/clk-utmi.c
7600 +++ b/drivers/clk/at91/clk-utmi.c
7601 @@ -11,14 +11,9 @@
7602  #include <linux/clk-provider.h>
7603  #include <linux/clkdev.h>
7604  #include <linux/clk/at91_pmc.h>
7605 -#include <linux/interrupt.h>
7606 -#include <linux/irq.h>
7607  #include <linux/of.h>
7608 -#include <linux/of_address.h>
7609 -#include <linux/of_irq.h>
7610 -#include <linux/io.h>
7611 -#include <linux/sched.h>
7612 -#include <linux/wait.h>
7613 +#include <linux/mfd/syscon.h>
7614 +#include <linux/regmap.h>
7615  
7616  #include "pmc.h"
7617  
7618 @@ -26,37 +21,30 @@
7619  
7620  struct clk_utmi {
7621         struct clk_hw hw;
7622 -       struct at91_pmc *pmc;
7623 -       unsigned int irq;
7624 -       wait_queue_head_t wait;
7625 +       struct regmap *regmap;
7626  };
7627  
7628  #define to_clk_utmi(hw) container_of(hw, struct clk_utmi, hw)
7629  
7630 -static irqreturn_t clk_utmi_irq_handler(int irq, void *dev_id)
7631 +static inline bool clk_utmi_ready(struct regmap *regmap)
7632  {
7633 -       struct clk_utmi *utmi = (struct clk_utmi *)dev_id;
7634 +       unsigned int status;
7635  
7636 -       wake_up(&utmi->wait);
7637 -       disable_irq_nosync(utmi->irq);
7638 +       regmap_read(regmap, AT91_PMC_SR, &status);
7639  
7640 -       return IRQ_HANDLED;
7641 +       return status & AT91_PMC_LOCKU;
7642  }
7643  
7644  static int clk_utmi_prepare(struct clk_hw *hw)
7645  {
7646         struct clk_utmi *utmi = to_clk_utmi(hw);
7647 -       struct at91_pmc *pmc = utmi->pmc;
7648 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) | AT91_PMC_UPLLEN |
7649 -                 AT91_PMC_UPLLCOUNT | AT91_PMC_BIASEN;
7650 +       unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
7651 +                           AT91_PMC_BIASEN;
7652  
7653 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7654 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, uckr, uckr);
7655  
7656 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU)) {
7657 -               enable_irq(utmi->irq);
7658 -               wait_event(utmi->wait,
7659 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7660 -       }
7661 +       while (!clk_utmi_ready(utmi->regmap))
7662 +               cpu_relax();
7663  
7664         return 0;
7665  }
7666 @@ -64,18 +52,15 @@ static int clk_utmi_prepare(struct clk_hw *hw)
7667  static int clk_utmi_is_prepared(struct clk_hw *hw)
7668  {
7669         struct clk_utmi *utmi = to_clk_utmi(hw);
7670 -       struct at91_pmc *pmc = utmi->pmc;
7671  
7672 -       return !!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7673 +       return clk_utmi_ready(utmi->regmap);
7674  }
7675  
7676  static void clk_utmi_unprepare(struct clk_hw *hw)
7677  {
7678         struct clk_utmi *utmi = to_clk_utmi(hw);
7679 -       struct at91_pmc *pmc = utmi->pmc;
7680 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) & ~AT91_PMC_UPLLEN;
7681  
7682 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7683 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
7684  }
7685  
7686  static unsigned long clk_utmi_recalc_rate(struct clk_hw *hw,
7687 @@ -93,10 +78,9 @@ static const struct clk_ops utmi_ops = {
7688  };
7689  
7690  static struct clk * __init
7691 -at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7692 +at91_clk_register_utmi(struct regmap *regmap,
7693                        const char *name, const char *parent_name)
7694  {
7695 -       int ret;
7696         struct clk_utmi *utmi;
7697         struct clk *clk = NULL;
7698         struct clk_init_data init;
7699 @@ -112,52 +96,36 @@ at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7700         init.flags = CLK_SET_RATE_GATE;
7701  
7702         utmi->hw.init = &init;
7703 -       utmi->pmc = pmc;
7704 -       utmi->irq = irq;
7705 -       init_waitqueue_head(&utmi->wait);
7706 -       irq_set_status_flags(utmi->irq, IRQ_NOAUTOEN);
7707 -       ret = request_irq(utmi->irq, clk_utmi_irq_handler,
7708 -                         IRQF_TRIGGER_HIGH, "clk-utmi", utmi);
7709 -       if (ret) {
7710 -               kfree(utmi);
7711 -               return ERR_PTR(ret);
7712 -       }
7713 +       utmi->regmap = regmap;
7714  
7715         clk = clk_register(NULL, &utmi->hw);
7716 -       if (IS_ERR(clk)) {
7717 -               free_irq(utmi->irq, utmi);
7718 +       if (IS_ERR(clk))
7719                 kfree(utmi);
7720 -       }
7721  
7722         return clk;
7723  }
7724  
7725 -static void __init
7726 -of_at91_clk_utmi_setup(struct device_node *np, struct at91_pmc *pmc)
7727 +static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np)
7728  {
7729 -       unsigned int irq;
7730         struct clk *clk;
7731         const char *parent_name;
7732         const char *name = np->name;
7733 +       struct regmap *regmap;
7734  
7735         parent_name = of_clk_get_parent_name(np, 0);
7736  
7737         of_property_read_string(np, "clock-output-names", &name);
7738  
7739 -       irq = irq_of_parse_and_map(np, 0);
7740 -       if (!irq)
7741 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7742 +       if (IS_ERR(regmap))
7743                 return;
7744  
7745 -       clk = at91_clk_register_utmi(pmc, irq, name, parent_name);
7746 +       clk = at91_clk_register_utmi(regmap, name, parent_name);
7747         if (IS_ERR(clk))
7748                 return;
7749  
7750         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7751         return;
7752  }
7753 -
7754 -void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np,
7755 -                                        struct at91_pmc *pmc)
7756 -{
7757 -       of_at91_clk_utmi_setup(np, pmc);
7758 -}
7759 +CLK_OF_DECLARE(at91sam9x5_clk_utmi, "atmel,at91sam9x5-clk-utmi",
7760 +              of_at91sam9x5_clk_utmi_setup);
7761 diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
7762 index 8476b570779b..526df5ba042d 100644
7763 --- a/drivers/clk/at91/pmc.c
7764 +++ b/drivers/clk/at91/pmc.c
7765 @@ -12,36 +12,13 @@
7766  #include <linux/clkdev.h>
7767  #include <linux/clk/at91_pmc.h>
7768  #include <linux/of.h>
7769 -#include <linux/of_address.h>
7770 -#include <linux/io.h>
7771 -#include <linux/interrupt.h>
7772 -#include <linux/irq.h>
7773 -#include <linux/irqchip/chained_irq.h>
7774 -#include <linux/irqdomain.h>
7775 -#include <linux/of_irq.h>
7776 +#include <linux/mfd/syscon.h>
7777 +#include <linux/regmap.h>
7778  
7779  #include <asm/proc-fns.h>
7780  
7781  #include "pmc.h"
7782  
7783 -void __iomem *at91_pmc_base;
7784 -EXPORT_SYMBOL_GPL(at91_pmc_base);
7785 -
7786 -void at91rm9200_idle(void)
7787 -{
7788 -       /*
7789 -        * Disable the processor clock.  The processor will be automatically
7790 -        * re-enabled by an interrupt or by a reset.
7791 -        */
7792 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7793 -}
7794 -
7795 -void at91sam9_idle(void)
7796 -{
7797 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7798 -       cpu_do_idle();
7799 -}
7800 -
7801  int of_at91_get_clk_range(struct device_node *np, const char *propname,
7802                           struct clk_range *range)
7803  {
7804 @@ -64,402 +41,3 @@ int of_at91_get_clk_range(struct device_node *np, const char *propname,
7805         return 0;
7806  }
7807  EXPORT_SYMBOL_GPL(of_at91_get_clk_range);
7808 -
7809 -static void pmc_irq_mask(struct irq_data *d)
7810 -{
7811 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7812 -
7813 -       pmc_write(pmc, AT91_PMC_IDR, 1 << d->hwirq);
7814 -}
7815 -
7816 -static void pmc_irq_unmask(struct irq_data *d)
7817 -{
7818 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7819 -
7820 -       pmc_write(pmc, AT91_PMC_IER, 1 << d->hwirq);
7821 -}
7822 -
7823 -static int pmc_irq_set_type(struct irq_data *d, unsigned type)
7824 -{
7825 -       if (type != IRQ_TYPE_LEVEL_HIGH) {
7826 -               pr_warn("PMC: type not supported (support only IRQ_TYPE_LEVEL_HIGH type)\n");
7827 -               return -EINVAL;
7828 -       }
7829 -
7830 -       return 0;
7831 -}
7832 -
7833 -static void pmc_irq_suspend(struct irq_data *d)
7834 -{
7835 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7836 -
7837 -       pmc->imr = pmc_read(pmc, AT91_PMC_IMR);
7838 -       pmc_write(pmc, AT91_PMC_IDR, pmc->imr);
7839 -}
7840 -
7841 -static void pmc_irq_resume(struct irq_data *d)
7842 -{
7843 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7844 -
7845 -       pmc_write(pmc, AT91_PMC_IER, pmc->imr);
7846 -}
7847 -
7848 -static struct irq_chip pmc_irq = {
7849 -       .name = "PMC",
7850 -       .irq_disable = pmc_irq_mask,
7851 -       .irq_mask = pmc_irq_mask,
7852 -       .irq_unmask = pmc_irq_unmask,
7853 -       .irq_set_type = pmc_irq_set_type,
7854 -       .irq_suspend = pmc_irq_suspend,
7855 -       .irq_resume = pmc_irq_resume,
7856 -};
7857 -
7858 -static struct lock_class_key pmc_lock_class;
7859 -
7860 -static int pmc_irq_map(struct irq_domain *h, unsigned int virq,
7861 -                      irq_hw_number_t hw)
7862 -{
7863 -       struct at91_pmc *pmc = h->host_data;
7864 -
7865 -       irq_set_lockdep_class(virq, &pmc_lock_class);
7866 -
7867 -       irq_set_chip_and_handler(virq, &pmc_irq,
7868 -                                handle_level_irq);
7869 -       irq_set_chip_data(virq, pmc);
7870 -
7871 -       return 0;
7872 -}
7873 -
7874 -static int pmc_irq_domain_xlate(struct irq_domain *d,
7875 -                               struct device_node *ctrlr,
7876 -                               const u32 *intspec, unsigned int intsize,
7877 -                               irq_hw_number_t *out_hwirq,
7878 -                               unsigned int *out_type)
7879 -{
7880 -       struct at91_pmc *pmc = d->host_data;
7881 -       const struct at91_pmc_caps *caps = pmc->caps;
7882 -
7883 -       if (WARN_ON(intsize < 1))
7884 -               return -EINVAL;
7885 -
7886 -       *out_hwirq = intspec[0];
7887 -
7888 -       if (!(caps->available_irqs & (1 << *out_hwirq)))
7889 -               return -EINVAL;
7890 -
7891 -       *out_type = IRQ_TYPE_LEVEL_HIGH;
7892 -
7893 -       return 0;
7894 -}
7895 -
7896 -static const struct irq_domain_ops pmc_irq_ops = {
7897 -       .map    = pmc_irq_map,
7898 -       .xlate  = pmc_irq_domain_xlate,
7899 -};
7900 -
7901 -static irqreturn_t pmc_irq_handler(int irq, void *data)
7902 -{
7903 -       struct at91_pmc *pmc = (struct at91_pmc *)data;
7904 -       unsigned long sr;
7905 -       int n;
7906 -
7907 -       sr = pmc_read(pmc, AT91_PMC_SR) & pmc_read(pmc, AT91_PMC_IMR);
7908 -       if (!sr)
7909 -               return IRQ_NONE;
7910 -
7911 -       for_each_set_bit(n, &sr, BITS_PER_LONG)
7912 -               generic_handle_irq(irq_find_mapping(pmc->irqdomain, n));
7913 -
7914 -       return IRQ_HANDLED;
7915 -}
7916 -
7917 -static const struct at91_pmc_caps at91rm9200_caps = {
7918 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7919 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7920 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7921 -                         AT91_PMC_PCK3RDY,
7922 -};
7923 -
7924 -static const struct at91_pmc_caps at91sam9260_caps = {
7925 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7926 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7927 -                         AT91_PMC_PCK1RDY,
7928 -};
7929 -
7930 -static const struct at91_pmc_caps at91sam9g45_caps = {
7931 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7932 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7933 -                         AT91_PMC_PCK1RDY,
7934 -};
7935 -
7936 -static const struct at91_pmc_caps at91sam9n12_caps = {
7937 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7938 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7939 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7940 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7941 -};
7942 -
7943 -static const struct at91_pmc_caps at91sam9x5_caps = {
7944 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7945 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7946 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7947 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7948 -};
7949 -
7950 -static const struct at91_pmc_caps sama5d2_caps = {
7951 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7952 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7953 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7954 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7955 -                         AT91_PMC_CFDEV | AT91_PMC_GCKRDY,
7956 -};
7957 -
7958 -static const struct at91_pmc_caps sama5d3_caps = {
7959 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7960 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7961 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7962 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7963 -                         AT91_PMC_CFDEV,
7964 -};
7965 -
7966 -static struct at91_pmc *__init at91_pmc_init(struct device_node *np,
7967 -                                            void __iomem *regbase, int virq,
7968 -                                            const struct at91_pmc_caps *caps)
7969 -{
7970 -       struct at91_pmc *pmc;
7971 -
7972 -       if (!regbase || !virq ||  !caps)
7973 -               return NULL;
7974 -
7975 -       at91_pmc_base = regbase;
7976 -
7977 -       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
7978 -       if (!pmc)
7979 -               return NULL;
7980 -
7981 -       spin_lock_init(&pmc->lock);
7982 -       pmc->regbase = regbase;
7983 -       pmc->virq = virq;
7984 -       pmc->caps = caps;
7985 -
7986 -       pmc->irqdomain = irq_domain_add_linear(np, 32, &pmc_irq_ops, pmc);
7987 -
7988 -       if (!pmc->irqdomain)
7989 -               goto out_free_pmc;
7990 -
7991 -       pmc_write(pmc, AT91_PMC_IDR, 0xffffffff);
7992 -       if (request_irq(pmc->virq, pmc_irq_handler,
7993 -                       IRQF_SHARED | IRQF_COND_SUSPEND, "pmc", pmc))
7994 -               goto out_remove_irqdomain;
7995 -
7996 -       return pmc;
7997 -
7998 -out_remove_irqdomain:
7999 -       irq_domain_remove(pmc->irqdomain);
8000 -out_free_pmc:
8001 -       kfree(pmc);
8002 -
8003 -       return NULL;
8004 -}
8005 -
8006 -static const struct of_device_id pmc_clk_ids[] __initconst = {
8007 -       /* Slow oscillator */
8008 -       {
8009 -               .compatible = "atmel,at91sam9260-clk-slow",
8010 -               .data = of_at91sam9260_clk_slow_setup,
8011 -       },
8012 -       /* Main clock */
8013 -       {
8014 -               .compatible = "atmel,at91rm9200-clk-main-osc",
8015 -               .data = of_at91rm9200_clk_main_osc_setup,
8016 -       },
8017 -       {
8018 -               .compatible = "atmel,at91sam9x5-clk-main-rc-osc",
8019 -               .data = of_at91sam9x5_clk_main_rc_osc_setup,
8020 -       },
8021 -       {
8022 -               .compatible = "atmel,at91rm9200-clk-main",
8023 -               .data = of_at91rm9200_clk_main_setup,
8024 -       },
8025 -       {
8026 -               .compatible = "atmel,at91sam9x5-clk-main",
8027 -               .data = of_at91sam9x5_clk_main_setup,
8028 -       },
8029 -       /* PLL clocks */
8030 -       {
8031 -               .compatible = "atmel,at91rm9200-clk-pll",
8032 -               .data = of_at91rm9200_clk_pll_setup,
8033 -       },
8034 -       {
8035 -               .compatible = "atmel,at91sam9g45-clk-pll",
8036 -               .data = of_at91sam9g45_clk_pll_setup,
8037 -       },
8038 -       {
8039 -               .compatible = "atmel,at91sam9g20-clk-pllb",
8040 -               .data = of_at91sam9g20_clk_pllb_setup,
8041 -       },
8042 -       {
8043 -               .compatible = "atmel,sama5d3-clk-pll",
8044 -               .data = of_sama5d3_clk_pll_setup,
8045 -       },
8046 -       {
8047 -               .compatible = "atmel,at91sam9x5-clk-plldiv",
8048 -               .data = of_at91sam9x5_clk_plldiv_setup,
8049 -       },
8050 -       /* Master clock */
8051 -       {
8052 -               .compatible = "atmel,at91rm9200-clk-master",
8053 -               .data = of_at91rm9200_clk_master_setup,
8054 -       },
8055 -       {
8056 -               .compatible = "atmel,at91sam9x5-clk-master",
8057 -               .data = of_at91sam9x5_clk_master_setup,
8058 -       },
8059 -       /* System clocks */
8060 -       {
8061 -               .compatible = "atmel,at91rm9200-clk-system",
8062 -               .data = of_at91rm9200_clk_sys_setup,
8063 -       },
8064 -       /* Peripheral clocks */
8065 -       {
8066 -               .compatible = "atmel,at91rm9200-clk-peripheral",
8067 -               .data = of_at91rm9200_clk_periph_setup,
8068 -       },
8069 -       {
8070 -               .compatible = "atmel,at91sam9x5-clk-peripheral",
8071 -               .data = of_at91sam9x5_clk_periph_setup,
8072 -       },
8073 -       /* Programmable clocks */
8074 -       {
8075 -               .compatible = "atmel,at91rm9200-clk-programmable",
8076 -               .data = of_at91rm9200_clk_prog_setup,
8077 -       },
8078 -       {
8079 -               .compatible = "atmel,at91sam9g45-clk-programmable",
8080 -               .data = of_at91sam9g45_clk_prog_setup,
8081 -       },
8082 -       {
8083 -               .compatible = "atmel,at91sam9x5-clk-programmable",
8084 -               .data = of_at91sam9x5_clk_prog_setup,
8085 -       },
8086 -       /* UTMI clock */
8087 -#if defined(CONFIG_HAVE_AT91_UTMI)
8088 -       {
8089 -               .compatible = "atmel,at91sam9x5-clk-utmi",
8090 -               .data = of_at91sam9x5_clk_utmi_setup,
8091 -       },
8092 -#endif
8093 -       /* USB clock */
8094 -#if defined(CONFIG_HAVE_AT91_USB_CLK)
8095 -       {
8096 -               .compatible = "atmel,at91rm9200-clk-usb",
8097 -               .data = of_at91rm9200_clk_usb_setup,
8098 -       },
8099 -       {
8100 -               .compatible = "atmel,at91sam9x5-clk-usb",
8101 -               .data = of_at91sam9x5_clk_usb_setup,
8102 -       },
8103 -       {
8104 -               .compatible = "atmel,at91sam9n12-clk-usb",
8105 -               .data = of_at91sam9n12_clk_usb_setup,
8106 -       },
8107 -#endif
8108 -       /* SMD clock */
8109 -#if defined(CONFIG_HAVE_AT91_SMD)
8110 -       {
8111 -               .compatible = "atmel,at91sam9x5-clk-smd",
8112 -               .data = of_at91sam9x5_clk_smd_setup,
8113 -       },
8114 -#endif
8115 -#if defined(CONFIG_HAVE_AT91_H32MX)
8116 -       {
8117 -               .compatible = "atmel,sama5d4-clk-h32mx",
8118 -               .data = of_sama5d4_clk_h32mx_setup,
8119 -       },
8120 -#endif
8121 -#if defined(CONFIG_HAVE_AT91_GENERATED_CLK)
8122 -       {
8123 -               .compatible = "atmel,sama5d2-clk-generated",
8124 -               .data = of_sama5d2_clk_generated_setup,
8125 -       },
8126 -#endif
8127 -       { /*sentinel*/ }
8128 -};
8129 -
8130 -static void __init of_at91_pmc_setup(struct device_node *np,
8131 -                                    const struct at91_pmc_caps *caps)
8132 -{
8133 -       struct at91_pmc *pmc;
8134 -       struct device_node *childnp;
8135 -       void (*clk_setup)(struct device_node *, struct at91_pmc *);
8136 -       const struct of_device_id *clk_id;
8137 -       void __iomem *regbase = of_iomap(np, 0);
8138 -       int virq;
8139 -
8140 -       if (!regbase)
8141 -               return;
8142 -
8143 -       virq = irq_of_parse_and_map(np, 0);
8144 -       if (!virq)
8145 -               return;
8146 -
8147 -       pmc = at91_pmc_init(np, regbase, virq, caps);
8148 -       if (!pmc)
8149 -               return;
8150 -       for_each_child_of_node(np, childnp) {
8151 -               clk_id = of_match_node(pmc_clk_ids, childnp);
8152 -               if (!clk_id)
8153 -                       continue;
8154 -               clk_setup = clk_id->data;
8155 -               clk_setup(childnp, pmc);
8156 -       }
8157 -}
8158 -
8159 -static void __init of_at91rm9200_pmc_setup(struct device_node *np)
8160 -{
8161 -       of_at91_pmc_setup(np, &at91rm9200_caps);
8162 -}
8163 -CLK_OF_DECLARE(at91rm9200_clk_pmc, "atmel,at91rm9200-pmc",
8164 -              of_at91rm9200_pmc_setup);
8165 -
8166 -static void __init of_at91sam9260_pmc_setup(struct device_node *np)
8167 -{
8168 -       of_at91_pmc_setup(np, &at91sam9260_caps);
8169 -}
8170 -CLK_OF_DECLARE(at91sam9260_clk_pmc, "atmel,at91sam9260-pmc",
8171 -              of_at91sam9260_pmc_setup);
8172 -
8173 -static void __init of_at91sam9g45_pmc_setup(struct device_node *np)
8174 -{
8175 -       of_at91_pmc_setup(np, &at91sam9g45_caps);
8176 -}
8177 -CLK_OF_DECLARE(at91sam9g45_clk_pmc, "atmel,at91sam9g45-pmc",
8178 -              of_at91sam9g45_pmc_setup);
8179 -
8180 -static void __init of_at91sam9n12_pmc_setup(struct device_node *np)
8181 -{
8182 -       of_at91_pmc_setup(np, &at91sam9n12_caps);
8183 -}
8184 -CLK_OF_DECLARE(at91sam9n12_clk_pmc, "atmel,at91sam9n12-pmc",
8185 -              of_at91sam9n12_pmc_setup);
8186 -
8187 -static void __init of_at91sam9x5_pmc_setup(struct device_node *np)
8188 -{
8189 -       of_at91_pmc_setup(np, &at91sam9x5_caps);
8190 -}
8191 -CLK_OF_DECLARE(at91sam9x5_clk_pmc, "atmel,at91sam9x5-pmc",
8192 -              of_at91sam9x5_pmc_setup);
8193 -
8194 -static void __init of_sama5d2_pmc_setup(struct device_node *np)
8195 -{
8196 -       of_at91_pmc_setup(np, &sama5d2_caps);
8197 -}
8198 -CLK_OF_DECLARE(sama5d2_clk_pmc, "atmel,sama5d2-pmc",
8199 -              of_sama5d2_pmc_setup);
8200 -
8201 -static void __init of_sama5d3_pmc_setup(struct device_node *np)
8202 -{
8203 -       of_at91_pmc_setup(np, &sama5d3_caps);
8204 -}
8205 -CLK_OF_DECLARE(sama5d3_clk_pmc, "atmel,sama5d3-pmc",
8206 -              of_sama5d3_pmc_setup);
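With the per-driver CLK_OF_DECLARE() entries in place, pmc.c sheds the pmc_clk_ids dispatch table and the whole PMC irqchip/irqdomain; what survives is the of_at91_get_clk_range() helper, which parses a two-cell min/max property into a struct clk_range. A usage sketch, following how the PLL and master-clock drivers elsewhere in drivers/clk/at91 call it (the property name shown is the one those drivers use):

        #include <linux/of.h>
        #include "pmc.h"

        struct clk_range range = CLK_RANGE(0, 0);

        /* on success, range.min/range.max hold the two cells of the
         * property; on failure the preset range is left untouched */
        of_at91_get_clk_range(np, "atmel,clk-output-range", &range);
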
8207 diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h
8208 index f65739272779..5771fff0ee3f 100644
8209 --- a/drivers/clk/at91/pmc.h
8210 +++ b/drivers/clk/at91/pmc.h
8211 @@ -14,8 +14,11 @@
8212  
8213  #include <linux/io.h>
8214  #include <linux/irqdomain.h>
8215 +#include <linux/regmap.h>
8216  #include <linux/spinlock.h>
8217  
8218 +extern spinlock_t pmc_pcr_lock;
8219 +
8220  struct clk_range {
8221         unsigned long min;
8222         unsigned long max;
8223 @@ -23,102 +26,7 @@ struct clk_range {
8224  
8225  #define CLK_RANGE(MIN, MAX) {.min = MIN, .max = MAX,}
8226  
8227 -struct at91_pmc_caps {
8228 -       u32 available_irqs;
8229 -};
8230 -
8231 -struct at91_pmc {
8232 -       void __iomem *regbase;
8233 -       int virq;
8234 -       spinlock_t lock;
8235 -       const struct at91_pmc_caps *caps;
8236 -       struct irq_domain *irqdomain;
8237 -       u32 imr;
8238 -};
8239 -
8240 -static inline void pmc_lock(struct at91_pmc *pmc)
8241 -{
8242 -       spin_lock(&pmc->lock);
8243 -}
8244 -
8245 -static inline void pmc_unlock(struct at91_pmc *pmc)
8246 -{
8247 -       spin_unlock(&pmc->lock);
8248 -}
8249 -
8250 -static inline u32 pmc_read(struct at91_pmc *pmc, int offset)
8251 -{
8252 -       return readl(pmc->regbase + offset);
8253 -}
8254 -
8255 -static inline void pmc_write(struct at91_pmc *pmc, int offset, u32 value)
8256 -{
8257 -       writel(value, pmc->regbase + offset);
8258 -}
8259 -
8260  int of_at91_get_clk_range(struct device_node *np, const char *propname,
8261                           struct clk_range *range);
8262  
8263 -void of_at91sam9260_clk_slow_setup(struct device_node *np,
8264 -                                  struct at91_pmc *pmc);
8265 -
8266 -void of_at91rm9200_clk_main_osc_setup(struct device_node *np,
8267 -                                     struct at91_pmc *pmc);
8268 -void of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
8269 -                                        struct at91_pmc *pmc);
8270 -void of_at91rm9200_clk_main_setup(struct device_node *np,
8271 -                                 struct at91_pmc *pmc);
8272 -void of_at91sam9x5_clk_main_setup(struct device_node *np,
8273 -                                 struct at91_pmc *pmc);
8274 -
8275 -void of_at91rm9200_clk_pll_setup(struct device_node *np,
8276 -                                struct at91_pmc *pmc);
8277 -void of_at91sam9g45_clk_pll_setup(struct device_node *np,
8278 -                                 struct at91_pmc *pmc);
8279 -void of_at91sam9g20_clk_pllb_setup(struct device_node *np,
8280 -                                  struct at91_pmc *pmc);
8281 -void of_sama5d3_clk_pll_setup(struct device_node *np,
8282 -                             struct at91_pmc *pmc);
8283 -void of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
8284 -                                   struct at91_pmc *pmc);
8285 -
8286 -void of_at91rm9200_clk_master_setup(struct device_node *np,
8287 -                                   struct at91_pmc *pmc);
8288 -void of_at91sam9x5_clk_master_setup(struct device_node *np,
8289 -                                   struct at91_pmc *pmc);
8290 -
8291 -void of_at91rm9200_clk_sys_setup(struct device_node *np,
8292 -                                struct at91_pmc *pmc);
8293 -
8294 -void of_at91rm9200_clk_periph_setup(struct device_node *np,
8295 -                                   struct at91_pmc *pmc);
8296 -void of_at91sam9x5_clk_periph_setup(struct device_node *np,
8297 -                                   struct at91_pmc *pmc);
8298 -
8299 -void of_at91rm9200_clk_prog_setup(struct device_node *np,
8300 -                                 struct at91_pmc *pmc);
8301 -void of_at91sam9g45_clk_prog_setup(struct device_node *np,
8302 -                                  struct at91_pmc *pmc);
8303 -void of_at91sam9x5_clk_prog_setup(struct device_node *np,
8304 -                                 struct at91_pmc *pmc);
8305 -
8306 -void of_at91sam9x5_clk_utmi_setup(struct device_node *np,
8307 -                                 struct at91_pmc *pmc);
8308 -
8309 -void of_at91rm9200_clk_usb_setup(struct device_node *np,
8310 -                                struct at91_pmc *pmc);
8311 -void of_at91sam9x5_clk_usb_setup(struct device_node *np,
8312 -                                struct at91_pmc *pmc);
8313 -void of_at91sam9n12_clk_usb_setup(struct device_node *np,
8314 -                                 struct at91_pmc *pmc);
8315 -
8316 -void of_at91sam9x5_clk_smd_setup(struct device_node *np,
8317 -                                struct at91_pmc *pmc);
8318 -
8319 -void of_sama5d4_clk_h32mx_setup(struct device_node *np,
8320 -                               struct at91_pmc *pmc);
8321 -
8322 -void of_sama5d2_clk_generated_setup(struct device_node *np,
8323 -                                   struct at91_pmc *pmc);
8324 -
8325  #endif /* __PMC_H_ */
8326 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
8327 index 4da2af9694a2..5b6f57f500b8 100644
8328 --- a/drivers/clocksource/tcb_clksrc.c
8329 +++ b/drivers/clocksource/tcb_clksrc.c
8330 @@ -23,8 +23,7 @@
8331   *     this 32 bit free-running counter. the second channel is not used.
8332   *
8333   *   - The third channel may be used to provide a 16-bit clockevent
8334 - *     source, used in either periodic or oneshot mode.  This runs
8335 - *     at 32 KiHZ, and can handle delays of up to two seconds.
8336 + *     source, used in either periodic or oneshot mode.
8337   *
8338   * A boot clocksource and clockevent source are also currently needed,
8339   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
8340 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
8341  struct tc_clkevt_device {
8342         struct clock_event_device       clkevt;
8343         struct clk                      *clk;
8344 +       bool                            clk_enabled;
8345 +       u32                             freq;
8346         void __iomem                    *regs;
8347  };
8348  
8349 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
8350         return container_of(clkevt, struct tc_clkevt_device, clkevt);
8351  }
8352  
8353 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
8354 - * because using one of the divided clocks would usually mean the
8355 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
8356 - *
8357 - * A divided clock could be good for high resolution timers, since
8358 - * 30.5 usec resolution can seem "low".
8359 - */
8360  static u32 timer_clock;
8361  
8362 +static void tc_clk_disable(struct clock_event_device *d)
8363 +{
8364 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8365 +
8366 +       clk_disable(tcd->clk);
8367 +       tcd->clk_enabled = false;
8368 +}
8369 +
8370 +static void tc_clk_enable(struct clock_event_device *d)
8371 +{
8372 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8373 +
8374 +       if (tcd->clk_enabled)
8375 +               return;
8376 +       clk_enable(tcd->clk);
8377 +       tcd->clk_enabled = true;
8378 +}
8379 +
8380  static int tc_shutdown(struct clock_event_device *d)
8381  {
8382         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8383 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
8384  
8385         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
8386         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
8387 +       return 0;
8388 +}
8389 +
8390 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8391 +{
8392 +       tc_shutdown(d);
8393         if (!clockevent_state_detached(d))
8394 -               clk_disable(tcd->clk);
8395 +               tc_clk_disable(d);
8396  
8397         return 0;
8398  }
8399 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8400         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8401                 tc_shutdown(d);
8402  
8403 -       clk_enable(tcd->clk);
8404 +       tc_clk_enable(d);
8405  
8406 -       /* slow clock, count up to RC, then irq and stop */
8407 +       /* count up to RC, then irq and stop */
8408         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8409                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8410         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8411 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8412         /* By not making the gentime core emulate periodic mode on top
8413          * of oneshot, we get lower overhead and improved accuracy.
8414          */
8415 -       clk_enable(tcd->clk);
8416 +       tc_clk_enable(d);
8417  
8418 -       /* slow clock, count up to RC, then irq and restart */
8419 +       /* count up to RC, then irq and restart */
8420         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8421                      regs + ATMEL_TC_REG(2, CMR));
8422 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8423 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8424  
8425         /* Enable clock and interrupts on RC compare */
8426         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8427 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
8428                 .features               = CLOCK_EVT_FEAT_PERIODIC |
8429                                           CLOCK_EVT_FEAT_ONESHOT,
8430                 /* Should be lower than at91rm9200's system timer */
8431 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8432                 .rating                 = 125,
8433 +#else
8434 +               .rating                 = 200,
8435 +#endif
8436                 .set_next_event         = tc_next_event,
8437 -               .set_state_shutdown     = tc_shutdown,
8438 +               .set_state_shutdown     = tc_shutdown_clk_off,
8439                 .set_state_periodic     = tc_set_periodic,
8440                 .set_state_oneshot      = tc_set_oneshot,
8441         },
8442 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8443         return IRQ_NONE;
8444  }
8445  
8446 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8447 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8448  {
8449 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
8450         int ret;
8451         struct clk *t2_clk = tc->clk[2];
8452         int irq = tc->irq[2];
8453 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8454         clkevt.regs = tc->regs;
8455         clkevt.clk = t2_clk;
8456  
8457 -       timer_clock = clk32k_divisor_idx;
8458 +       timer_clock = divisor_idx;
8459 +       if (!divisor)
8460 +               clkevt.freq = 32768;
8461 +       else
8462 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
8463  
8464         clkevt.clkevt.cpumask = cpumask_of(0);
8465  
8466 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8467                 return ret;
8468         }
8469  
8470 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8471 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8472  
8473         return ret;
8474  }
8475 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
8476                 goto err_disable_t1;
8477  
8478         /* channel 2:  periodic and oneshot timer support */
8479 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8480         ret = setup_clkevents(tc, clk32k_divisor_idx);
8481 +#else
8482 +       ret = setup_clkevents(tc, best_divisor_idx);
8483 +#endif
8484         if (ret)
8485                 goto err_unregister_clksrc;
8486  
8487 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8488 index d911c5dca8f1..7a40f7e88468 100644
8489 --- a/drivers/clocksource/timer-atmel-pit.c
8490 +++ b/drivers/clocksource/timer-atmel-pit.c
8491 @@ -46,6 +46,7 @@ struct pit_data {
8492         u32             cycle;
8493         u32             cnt;
8494         unsigned int    irq;
8495 +       bool            irq_requested;
8496         struct clk      *mck;
8497  };
8498  
8499 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8500  
8501         /* disable irq, leaving the clocksource active */
8502         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8503 +       if (data->irq_requested) {
8504 +               free_irq(data->irq, data);
8505 +               data->irq_requested = false;
8506 +       }
8507         return 0;
8508  }
8509  
8510 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8511  /*
8512   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
8513   */
8514  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8515  {
8516         struct pit_data *data = clkevt_to_pit_data(dev);
8517 +       int ret;
8518 +
8519 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8520 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8521 +                         "at91_tick", data);
8522 +       if (ret)
8523 +               panic(pr_fmt("Unable to setup IRQ\n"));
8524 +
8525 +       data->irq_requested = true;
8526  
8527         /* update clocksource counter */
8528         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8529 @@ -181,7 +196,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8530  {
8531         unsigned long   pit_rate;
8532         unsigned        bits;
8533 -       int             ret;
8534  
8535         /*
8536          * Use our actual MCK to figure out how many MCK/16 ticks per
8537 @@ -206,13 +220,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8538         data->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
8539         clocksource_register_hz(&data->clksrc, pit_rate);
8540  
8541 -       /* Set up irq handler */
8542 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8543 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8544 -                         "at91_tick", data);
8545 -       if (ret)
8546 -               panic(pr_fmt("Unable to setup IRQ\n"));
8547 -
8548         /* Set up and register clockevents */
8549         data->clkevt.name = "pit";
8550         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
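The timer-atmel-pit.c change above defers request_irq() until the clockevent is actually programmed for periodic mode and releases the line again in shutdown, using irq_requested to make the release idempotent. A stand-alone sketch of that bookkeeping, with request_irq()/free_irq() replaced by stubs and an invented IRQ number so it compiles outside the kernel:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stubs standing in for request_irq()/free_irq(); only the irq_requested
     * bookkeeping added by the hunk above is of interest here. */
    static int  request_irq_stub(int irq) { printf("request irq %d\n", irq); return 0; }
    static void free_irq_stub(int irq)    { printf("free irq %d\n", irq); }

    struct pit_state {
        int  irq;
        bool irq_requested;
    };

    /* Mirrors pit_clkevt_set_periodic(): take the IRQ only when the clockevent
     * is actually programmed, and remember that we now hold it. */
    static int set_periodic(struct pit_state *s)
    {
        if (request_irq_stub(s->irq))
            return -1;
        s->irq_requested = true;
        return 0;
    }

    /* Mirrors pit_clkevt_shutdown(): give the IRQ back exactly once. */
    static void shutdown(struct pit_state *s)
    {
        if (s->irq_requested) {
            free_irq_stub(s->irq);
            s->irq_requested = false;
        }
    }

    int main(void)
    {
        struct pit_state s = { .irq = 18, .irq_requested = false };

        set_periodic(&s);
        shutdown(&s);
        shutdown(&s);   /* second call is a no-op thanks to the flag */
        return 0;
    }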
8551 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8552 index 29d21d68df5a..103d0fd70cc4 100644
8553 --- a/drivers/clocksource/timer-atmel-st.c
8554 +++ b/drivers/clocksource/timer-atmel-st.c
8555 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8556         last_crtr = read_CRTR();
8557  }
8558  
8559 +static int atmel_st_irq;
8560 +
8561  static int clkevt32k_shutdown(struct clock_event_device *evt)
8562  {
8563         clkdev32k_disable_and_flush_irq();
8564         irqmask = 0;
8565         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8566 +       free_irq(atmel_st_irq, regmap_st);
8567         return 0;
8568  }
8569  
8570  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8571  {
8572 +       int ret;
8573 +
8574         clkdev32k_disable_and_flush_irq();
8575  
8576 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8577 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8578 +                         "at91_tick", regmap_st);
8579 +       if (ret)
8580 +               panic(pr_fmt("Unable to setup IRQ\n"));
8581 +
8582         /*
8583          * ALM for oneshot irqs, set by next_event()
8584          * before 32 seconds have passed.
8585 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8586  
8587  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8588  {
8589 +       int ret;
8590 +
8591         clkdev32k_disable_and_flush_irq();
8592  
8593 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8594 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8595 +                         "at91_tick", regmap_st);
8596 +       if (ret)
8597 +               panic(pr_fmt("Unable to setup IRQ\n"));
8598 +
8599         /* PIT for periodic irqs; fixed rate of 1/HZ */
8600         irqmask = AT91_ST_PITS;
8601         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8602 @@ -198,7 +217,7 @@ static void __init atmel_st_timer_init(struct device_node *node)
8603  {
8604         struct clk *sclk;
8605         unsigned int sclk_rate, val;
8606 -       int irq, ret;
8607 +       int ret;
8608  
8609         regmap_st = syscon_node_to_regmap(node);
8610         if (IS_ERR(regmap_st))
8611 @@ -210,17 +229,10 @@ static void __init atmel_st_timer_init(struct device_node *node)
8612         regmap_read(regmap_st, AT91_ST_SR, &val);
8613  
8614         /* Get the interrupts property */
8615 -       irq  = irq_of_parse_and_map(node, 0);
8616 -       if (!irq)
8617 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8618 +       if (!atmel_st_irq)
8619                 panic(pr_fmt("Unable to get IRQ from DT\n"));
8620  
8621 -       /* Make IRQs happen for the system timer */
8622 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8623 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8624 -                         "at91_tick", regmap_st);
8625 -       if (ret)
8626 -               panic(pr_fmt("Unable to setup IRQ\n"));
8627 -
8628         sclk = of_clk_get(node, 0);
8629         if (IS_ERR(sclk))
8630                 panic(pr_fmt("Unable to get slow clock\n"));
8631 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8632 index c59bdcb83217..8f23161d80be 100644
8633 --- a/drivers/cpufreq/Kconfig.x86
8634 +++ b/drivers/cpufreq/Kconfig.x86
8635 @@ -123,7 +123,7 @@ config X86_POWERNOW_K7_ACPI
8636  
8637  config X86_POWERNOW_K8
8638         tristate "AMD Opteron/Athlon64 PowerNow!"
8639 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8640 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8641         help
8642           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8643           Support for K10 and newer processors is now in acpi-cpufreq.
8644 diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
8645 index 344058f8501a..d5657d50ac40 100644
8646 --- a/drivers/cpuidle/coupled.c
8647 +++ b/drivers/cpuidle/coupled.c
8648 @@ -119,7 +119,6 @@ struct cpuidle_coupled {
8649  
8650  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
8651  
8652 -static DEFINE_MUTEX(cpuidle_coupled_lock);
8653  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
8654  
8655  /*
8656 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8657 index 6ed7d63a0688..9da7482ad256 100644
8658 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8659 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8660 @@ -1264,7 +1264,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
8661         if (ret)
8662                 return ret;
8663  
8664 +#ifndef CONFIG_PREEMPT_RT_BASE
8665         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
8666 +#endif
8667  
8668         i915_gem_execbuffer_move_to_active(vmas, params->request);
8669         i915_gem_execbuffer_retire_commands(params);
8670 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8671 index c0a96f1ee18e..deb1e207fa3c 100644
8672 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
8673 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8674 @@ -39,7 +39,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
8675         if (!mutex_is_locked(mutex))
8676                 return false;
8677  
8678 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
8679 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
8680         return mutex->owner == task;
8681  #else
8682         /* Since UP may be pre-empted, we cannot assume that we own the lock */
8683 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8684 index 0f42a2782afc..80a1db09a379 100644
8685 --- a/drivers/gpu/drm/i915/i915_irq.c
8686 +++ b/drivers/gpu/drm/i915/i915_irq.c
8687 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8688         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8689  
8690         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8691 +       preempt_disable_rt();
8692  
8693         /* Get optional system timestamp before query. */
8694         if (stime)
8695 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8696                 *etime = ktime_get();
8697  
8698         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8699 +       preempt_enable_rt();
8700  
8701         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8702  
8703 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
8704 index 909d1d71d130..8688709b4ffa 100644
8705 --- a/drivers/gpu/drm/i915/intel_display.c
8706 +++ b/drivers/gpu/drm/i915/intel_display.c
8707 @@ -11400,7 +11400,7 @@ void intel_check_page_flip(struct drm_device *dev, int pipe)
8708         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
8709         struct intel_unpin_work *work;
8710  
8711 -       WARN_ON(!in_interrupt());
8712 +       WARN_ON_NONRT(!in_interrupt());
8713  
8714         if (crtc == NULL)
8715                 return;
8716 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8717 index 2cc6aa072f4c..b79d33f14868 100644
8718 --- a/drivers/gpu/drm/i915/intel_sprite.c
8719 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8720 @@ -38,6 +38,7 @@
8721  #include "intel_drv.h"
8722  #include <drm/i915_drm.h>
8723  #include "i915_drv.h"
8724 +#include <linux/locallock.h>
8725  
8726  static bool
8727  format_is_yuv(uint32_t format)
8728 @@ -64,6 +65,8 @@ static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8729                             1000 * adjusted_mode->crtc_htotal);
8730  }
8731  
8732 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8733 +
8734  /**
8735   * intel_pipe_update_start() - start update of a set of display registers
8736   * @crtc: the crtc of which the registers are going to be updated
8737 @@ -96,7 +99,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8738         min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
8739         max = vblank_start - 1;
8740  
8741 -       local_irq_disable();
8742 +       local_lock_irq(pipe_update_lock);
8743  
8744         if (min <= 0 || max <= 0)
8745                 return;
8746 @@ -126,11 +129,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8747                         break;
8748                 }
8749  
8750 -               local_irq_enable();
8751 +               local_unlock_irq(pipe_update_lock);
8752  
8753                 timeout = schedule_timeout(timeout);
8754  
8755 -               local_irq_disable();
8756 +               local_lock_irq(pipe_update_lock);
8757         }
8758  
8759         finish_wait(wq, &wait);
8760 @@ -164,7 +167,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8761  
8762         trace_i915_pipe_update_end(crtc, end_vbl_count, scanline_end);
8763  
8764 -       local_irq_enable();
8765 +       local_unlock_irq(pipe_update_lock);
8766  
8767         if (crtc->debug.start_vbl_count &&
8768             crtc->debug.start_vbl_count != end_vbl_count) {
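In the intel_sprite.c hunks the interrupt-disabled region around the vblank-evade loop becomes a local lock (DEFINE_LOCAL_IRQ_LOCK from the locallock header this patch introduces), so on PREEMPT_RT the section can sleep while still being dropped and retaken around the timeout. A user-space analogue of the lock/unlock shape only, with a pthread mutex standing in for the per-CPU local lock and the driver details reduced to placeholders:

    #include <pthread.h>
    #include <stdio.h>

    /* The mutex plays the role of pipe_update_lock; the real code uses
     * DEFINE_LOCAL_IRQ_LOCK() and local_lock_irq()/local_unlock_irq(). */
    static pthread_mutex_t pipe_update_lock = PTHREAD_MUTEX_INITIALIZER;

    static void wait_for_vblank_window(void)
    {
        pthread_mutex_lock(&pipe_update_lock);        /* was local_irq_disable() */

        for (int tries = 0; tries < 3; tries++) {
            /* ... check scanline, decide whether to keep waiting ... */

            pthread_mutex_unlock(&pipe_update_lock);  /* was local_irq_enable() */
            /* schedule_timeout() in the driver; nothing to do here */
            pthread_mutex_lock(&pipe_update_lock);    /* was local_irq_disable() */
        }

        pthread_mutex_unlock(&pipe_update_lock);      /* intel_pipe_update_end() */
    }

    int main(void)
    {
        wait_for_vblank_window();
        puts("done");
        return 0;
    }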
8769 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8770 index 3645b223aa37..642854b2ed2c 100644
8771 --- a/drivers/gpu/drm/radeon/radeon_display.c
8772 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8773 @@ -1862,6 +1862,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8774         struct radeon_device *rdev = dev->dev_private;
8775  
8776         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8777 +       preempt_disable_rt();
8778  
8779         /* Get optional system timestamp before query. */
8780         if (stime)
8781 @@ -1954,6 +1955,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8782                 *etime = ktime_get();
8783  
8784         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8785 +       preempt_enable_rt();
8786  
8787         /* Decode into vertical and horizontal scanout position. */
8788         *vpos = position & 0x1fff;
8789 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8790 index 509ed9731630..b2498b270f2c 100644
8791 --- a/drivers/hv/vmbus_drv.c
8792 +++ b/drivers/hv/vmbus_drv.c
8793 @@ -820,7 +820,7 @@ static void vmbus_isr(void)
8794                         tasklet_schedule(&msg_dpc);
8795         }
8796  
8797 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8798 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
8799  }
8800  
8801  
8802 diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
8803 index 08d26ba61ed3..46b89dd42b10 100644
8804 --- a/drivers/i2c/busses/i2c-omap.c
8805 +++ b/drivers/i2c/busses/i2c-omap.c
8806 @@ -995,15 +995,12 @@ omap_i2c_isr(int irq, void *dev_id)
8807         u16 mask;
8808         u16 stat;
8809  
8810 -       spin_lock(&omap->lock);
8811 -       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8812         stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
8813 +       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8814  
8815         if (stat & mask)
8816                 ret = IRQ_WAKE_THREAD;
8817  
8818 -       spin_unlock(&omap->lock);
8819 -
8820         return ret;
8821  }
8822  
8823 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8824 index 36f76e28a0bf..394f142f90c7 100644
8825 --- a/drivers/ide/alim15x3.c
8826 +++ b/drivers/ide/alim15x3.c
8827 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8828  
8829         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8830  
8831 -       local_irq_save(flags);
8832 +       local_irq_save_nort(flags);
8833  
8834         if (m5229_revision < 0xC2) {
8835                 /*
8836 @@ -325,7 +325,7 @@ out:
8837         }
8838         pci_dev_put(north);
8839         pci_dev_put(isa_dev);
8840 -       local_irq_restore(flags);
8841 +       local_irq_restore_nort(flags);
8842         return 0;
8843  }
8844  
8845 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8846 index 696b6c1ec940..0d0a96629b73 100644
8847 --- a/drivers/ide/hpt366.c
8848 +++ b/drivers/ide/hpt366.c
8849 @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8850  
8851         dma_old = inb(base + 2);
8852  
8853 -       local_irq_save(flags);
8854 +       local_irq_save_nort(flags);
8855  
8856         dma_new = dma_old;
8857         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8858 @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8859         if (dma_new != dma_old)
8860                 outb(dma_new, base + 2);
8861  
8862 -       local_irq_restore(flags);
8863 +       local_irq_restore_nort(flags);
8864  
8865         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8866                          hwif->name, base, base + 7);
8867 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8868 index 19763977568c..4169433faab5 100644
8869 --- a/drivers/ide/ide-io-std.c
8870 +++ b/drivers/ide/ide-io-std.c
8871 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8872                 unsigned long uninitialized_var(flags);
8873  
8874                 if ((io_32bit & 2) && !mmio) {
8875 -                       local_irq_save(flags);
8876 +                       local_irq_save_nort(flags);
8877                         ata_vlb_sync(io_ports->nsect_addr);
8878                 }
8879  
8880 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8881                         insl(data_addr, buf, words);
8882  
8883                 if ((io_32bit & 2) && !mmio)
8884 -                       local_irq_restore(flags);
8885 +                       local_irq_restore_nort(flags);
8886  
8887                 if (((len + 1) & 3) < 2)
8888                         return;
8889 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8890                 unsigned long uninitialized_var(flags);
8891  
8892                 if ((io_32bit & 2) && !mmio) {
8893 -                       local_irq_save(flags);
8894 +                       local_irq_save_nort(flags);
8895                         ata_vlb_sync(io_ports->nsect_addr);
8896                 }
8897  
8898 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8899                         outsl(data_addr, buf, words);
8900  
8901                 if ((io_32bit & 2) && !mmio)
8902 -                       local_irq_restore(flags);
8903 +                       local_irq_restore_nort(flags);
8904  
8905                 if (((len + 1) & 3) < 2)
8906                         return;
8907 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8908 index 669ea1e45795..e12e43e62245 100644
8909 --- a/drivers/ide/ide-io.c
8910 +++ b/drivers/ide/ide-io.c
8911 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
8912                 /* disable_irq_nosync ?? */
8913                 disable_irq(hwif->irq);
8914                 /* local CPU only, as if we were handling an interrupt */
8915 -               local_irq_disable();
8916 +               local_irq_disable_nort();
8917                 if (hwif->polling) {
8918                         startstop = handler(drive);
8919                 } else if (drive_is_ready(drive)) {
8920 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8921 index 376f2dc410c5..f014dd1b73dc 100644
8922 --- a/drivers/ide/ide-iops.c
8923 +++ b/drivers/ide/ide-iops.c
8924 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8925                                 if ((stat & ATA_BUSY) == 0)
8926                                         break;
8927  
8928 -                               local_irq_restore(flags);
8929 +                               local_irq_restore_nort(flags);
8930                                 *rstat = stat;
8931                                 return -EBUSY;
8932                         }
8933                 }
8934 -               local_irq_restore(flags);
8935 +               local_irq_restore_nort(flags);
8936         }
8937         /*
8938          * Allow status to settle, then read it again.
8939 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8940 index 0b63facd1d87..4ceba37afc0c 100644
8941 --- a/drivers/ide/ide-probe.c
8942 +++ b/drivers/ide/ide-probe.c
8943 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8944         int bswap = 1;
8945  
8946         /* local CPU only; some systems need this */
8947 -       local_irq_save(flags);
8948 +       local_irq_save_nort(flags);
8949         /* read 512 bytes of id info */
8950         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8951 -       local_irq_restore(flags);
8952 +       local_irq_restore_nort(flags);
8953  
8954         drive->dev_flags |= IDE_DFLAG_ID_READ;
8955  #ifdef DEBUG
8956 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8957 index a716693417a3..be0568c722d6 100644
8958 --- a/drivers/ide/ide-taskfile.c
8959 +++ b/drivers/ide/ide-taskfile.c
8960 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8961  
8962                 page_is_high = PageHighMem(page);
8963                 if (page_is_high)
8964 -                       local_irq_save(flags);
8965 +                       local_irq_save_nort(flags);
8966  
8967                 buf = kmap_atomic(page) + offset;
8968  
8969 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8970                 kunmap_atomic(buf);
8971  
8972                 if (page_is_high)
8973 -                       local_irq_restore(flags);
8974 +                       local_irq_restore_nort(flags);
8975  
8976                 len -= nr_bytes;
8977         }
8978 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
8979         }
8980  
8981         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8982 -               local_irq_disable();
8983 +               local_irq_disable_nort();
8984  
8985         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8986  
8987 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8988 index 87799de90a1d..66cdd37f8605 100644
8989 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8990 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8991 @@ -857,7 +857,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8992  
8993         ipoib_dbg_mcast(priv, "restarting multicast task\n");
8994  
8995 -       local_irq_save(flags);
8996 +       local_irq_save_nort(flags);
8997         netif_addr_lock(dev);
8998         spin_lock(&priv->lock);
8999  
9000 @@ -939,7 +939,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9001  
9002         spin_unlock(&priv->lock);
9003         netif_addr_unlock(dev);
9004 -       local_irq_restore(flags);
9005 +       local_irq_restore_nort(flags);
9006  
9007         /*
9008          * make sure the in-flight joins have finished before we attempt
9009 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
9010 index 4a2a9e370be7..e970d9afd179 100644
9011 --- a/drivers/input/gameport/gameport.c
9012 +++ b/drivers/input/gameport/gameport.c
9013 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
9014         tx = ~0;
9015  
9016         for (i = 0; i < 50; i++) {
9017 -               local_irq_save(flags);
9018 +               local_irq_save_nort(flags);
9019                 t1 = ktime_get_ns();
9020                 for (t = 0; t < 50; t++)
9021                         gameport_read(gameport);
9022                 t2 = ktime_get_ns();
9023                 t3 = ktime_get_ns();
9024 -               local_irq_restore(flags);
9025 +               local_irq_restore_nort(flags);
9026                 udelay(i * 10);
9027                 t = (t2 - t1) - (t3 - t2);
9028                 if (t < tx)
9029 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9030         tx = 1 << 30;
9031  
9032         for(i = 0; i < 50; i++) {
9033 -               local_irq_save(flags);
9034 +               local_irq_save_nort(flags);
9035                 GET_TIME(t1);
9036                 for (t = 0; t < 50; t++) gameport_read(gameport);
9037                 GET_TIME(t2);
9038                 GET_TIME(t3);
9039 -               local_irq_restore(flags);
9040 +               local_irq_restore_nort(flags);
9041                 udelay(i * 10);
9042                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
9043         }
9044 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9045         tx = 1 << 30;
9046  
9047         for(i = 0; i < 50; i++) {
9048 -               local_irq_save(flags);
9049 +               local_irq_save_nort(flags);
9050                 t1 = rdtsc();
9051                 for (t = 0; t < 50; t++) gameport_read(gameport);
9052                 t2 = rdtsc();
9053 -               local_irq_restore(flags);
9054 +               local_irq_restore_nort(flags);
9055                 udelay(i * 10);
9056                 if (t2 - t1 < tx) tx = t2 - t1;
9057         }
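The gameport, IDE and ipoib hunks in this region all make the same substitution: local_irq_save()/restore() become the *_nort() variants supplied elsewhere in this patch, which keep the old behaviour on a non-RT build and, on PREEMPT_RT, leave interrupts enabled. The toy program below only illustrates that compile-time split; the macro bodies are placeholders, not the ones defined by the patch:

    #include <stdio.h>

    /* Placeholder definitions purely to show the compile-time split;
     * the real *_nort() helpers are defined by the RT patch itself. */
    #ifdef PREEMPT_RT_FULL
    # define local_irq_save_nort(flags)     do { (flags) = 0; } while (0)
    # define local_irq_restore_nort(flags)  do { (void)(flags); } while (0)
    #else
    # define local_irq_save_nort(flags)     do { (flags) = 1; puts("irqs off"); } while (0)
    # define local_irq_restore_nort(flags)  do { (void)(flags); puts("irqs on"); } while (0)
    #endif

    int main(void)
    {
        unsigned long flags;

        local_irq_save_nort(flags);
        /* a timing loop such as gameport_measure_speed() would run here */
        local_irq_restore_nort(flags);
        return 0;
    }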
9058 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
9059 index 0397985a2601..bc0e7d2c5cff 100644
9060 --- a/drivers/iommu/amd_iommu.c
9061 +++ b/drivers/iommu/amd_iommu.c
9062 @@ -2019,10 +2019,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
9063         int ret;
9064  
9065         /*
9066 -        * Must be called with IRQs disabled. Warn here to detect early
9067 -        * when its not.
9068 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9069 +        * detect early when it's not.
9070          */
9071 -       WARN_ON(!irqs_disabled());
9072 +       WARN_ON_NONRT(!irqs_disabled());
9073  
9074         /* lock domain */
9075         spin_lock(&domain->lock);
9076 @@ -2185,10 +2185,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
9077         struct protection_domain *domain;
9078  
9079         /*
9080 -        * Must be called with IRQs disabled. Warn here to detect early
9081 -        * when its not.
9082 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9083 +        * detect early when it's not.
9084          */
9085 -       WARN_ON(!irqs_disabled());
9086 +       WARN_ON_NONRT(!irqs_disabled());
9087  
9088         if (WARN_ON(!dev_data->domain))
9089                 return;
9090 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9091 index 5bda6a9b56bb..d6286584c807 100644
9092 --- a/drivers/leds/trigger/Kconfig
9093 +++ b/drivers/leds/trigger/Kconfig
9094 @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT
9095  
9096  config LEDS_TRIGGER_CPU
9097         bool "LED CPU Trigger"
9098 -       depends on LEDS_TRIGGERS
9099 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9100         help
9101           This allows LEDs to be controlled by active CPUs. This shows
9102           the active CPUs across an array of LEDs so you can see which
9103 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9104 index 4d200883c505..98b64ed5cb81 100644
9105 --- a/drivers/md/bcache/Kconfig
9106 +++ b/drivers/md/bcache/Kconfig
9107 @@ -1,6 +1,7 @@
9108  
9109  config BCACHE
9110         tristate "Block device as cache"
9111 +       depends on !PREEMPT_RT_FULL
9112         ---help---
9113         Allows a block device to be used as cache for other devices; uses
9114         a btree for indexing and the layout is optimized for SSDs.
9115 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9116 index a42729ebf272..c717ec464459 100644
9117 --- a/drivers/md/dm.c
9118 +++ b/drivers/md/dm.c
9119 @@ -2127,7 +2127,7 @@ static void dm_request_fn(struct request_queue *q)
9120                 /* Establish tio->ti before queuing work (map_tio_request) */
9121                 tio->ti = ti;
9122                 queue_kthread_work(&md->kworker, &tio->work);
9123 -               BUG_ON(!irqs_disabled());
9124 +               BUG_ON_NONRT(!irqs_disabled());
9125         }
9126  
9127         goto out;
9128 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9129 index 10ce885445f6..76f71791361c 100644
9130 --- a/drivers/md/raid5.c
9131 +++ b/drivers/md/raid5.c
9132 @@ -1920,8 +1920,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9133         struct raid5_percpu *percpu;
9134         unsigned long cpu;
9135  
9136 -       cpu = get_cpu();
9137 +       cpu = get_cpu_light();
9138         percpu = per_cpu_ptr(conf->percpu, cpu);
9139 +       spin_lock(&percpu->lock);
9140         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9141                 ops_run_biofill(sh);
9142                 overlap_clear++;
9143 @@ -1977,7 +1978,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9144                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9145                                 wake_up(&sh->raid_conf->wait_for_overlap);
9146                 }
9147 -       put_cpu();
9148 +       spin_unlock(&percpu->lock);
9149 +       put_cpu_light();
9150  }
9151  
9152  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
9153 @@ -6414,6 +6416,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9154                                __func__, cpu);
9155                         break;
9156                 }
9157 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9158         }
9159         put_online_cpus();
9160  
9161 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9162 index 517d4b68a1be..efe91887ecd7 100644
9163 --- a/drivers/md/raid5.h
9164 +++ b/drivers/md/raid5.h
9165 @@ -504,6 +504,7 @@ struct r5conf {
9166         int                     recovery_disabled;
9167         /* per cpu variables */
9168         struct raid5_percpu {
9169 +               spinlock_t      lock;           /* Protection for -RT */
9170                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9171                 struct flex_array *scribble;   /* space for constructing buffer
9172                                               * lists and performing address
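raid5.c now pairs get_cpu_light() with the per-CPU spinlock declared in raid5.h: plain get_cpu() relies on disabled preemption to keep raid_run_ops() exclusive on a CPU, and since the _light variant (as used throughout the RT patch) only pins the task to the CPU, the explicit lock initialized in raid5_alloc_percpu() takes over that job for the per-CPU scribble buffers. A user-space analogue of the per-slot locking, with pthread mutexes in place of spinlocks and an invented scratch structure:

    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    /* Each per-CPU slot gets its own lock so two preemptible users of the
     * same slot serialize, matching the raid5_percpu change above. */
    struct percpu_scratch {
        pthread_mutex_t lock;           /* plays the role of percpu->lock */
        char            scribble[64];   /* plays the role of the scribble area */
    };

    static struct percpu_scratch scratch[NR_CPUS];

    static void percpu_init(void)
    {
        for (int i = 0; i < NR_CPUS; i++)
            pthread_mutex_init(&scratch[i].lock, NULL);   /* spin_lock_init() */
    }

    static void run_ops(int cpu)
    {
        struct percpu_scratch *p = &scratch[cpu];

        pthread_mutex_lock(&p->lock);                     /* spin_lock(&percpu->lock) */
        snprintf(p->scribble, sizeof(p->scribble), "ops ran on cpu %d", cpu);
        printf("%s\n", p->scribble);
        pthread_mutex_unlock(&p->lock);                   /* spin_unlock(&percpu->lock) */
    }

    int main(void)
    {
        percpu_init();
        run_ops(0);
        run_ops(1);
        return 0;
    }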
9173 diff --git a/drivers/media/platform/vsp1/vsp1_video.c b/drivers/media/platform/vsp1/vsp1_video.c
9174 index 5ce88e1f5d71..b4f8cd74ecb8 100644
9175 --- a/drivers/media/platform/vsp1/vsp1_video.c
9176 +++ b/drivers/media/platform/vsp1/vsp1_video.c
9177 @@ -520,7 +520,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
9178         bool stopped;
9179  
9180         spin_lock_irqsave(&pipe->irqlock, flags);
9181 -       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
9182 +       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
9183         spin_unlock_irqrestore(&pipe->irqlock, flags);
9184  
9185         return stopped;
9186 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9187 index 4bf7d50b1bc7..6f7e99ad6e29 100644
9188 --- a/drivers/misc/Kconfig
9189 +++ b/drivers/misc/Kconfig
9190 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9191  config ATMEL_TCLIB
9192         bool "Atmel AT32/AT91 Timer/Counter Library"
9193         depends on (AVR32 || ARCH_AT91)
9194 +       default y if PREEMPT_RT_FULL
9195         help
9196           Select this if you want a library to allocate the Timer/Counter
9197           blocks found on many Atmel processors.  This facilitates using
9198 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9199           are combined to make a single 32-bit timer.
9200  
9201           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9202 -         may be used as a clock event device supporting oneshot mode
9203 -         (delays of up to two seconds) based on the 32 KiHz clock.
9204 +         may be used as a clock event device supporting oneshot mode.
9205  
9206  config ATMEL_TCB_CLKSRC_BLOCK
9207         int
9208 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9209           TC can be used for other purposes, such as PWM generation and
9210           interval timing.
9211  
9212 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9213 +       bool "TC Block use 32 KiHz clock"
9214 +       depends on ATMEL_TCB_CLKSRC
9215 +       default y if !PREEMPT_RT_FULL
9216 +       help
9217 +         Select this to use the 32 KiHz base clock rate as the TC block
9218 +         clock source for clock events.
9219 +
9220 +
9221  config DUMMY_IRQ
9222         tristate "Dummy IRQ handler"
9223         default n
9224 @@ -113,6 +122,35 @@ config IBM_ASM
9225           for information on the specific driver level and support statement
9226           for your IBM server.
9227  
9228 +config HWLAT_DETECTOR
9229 +       tristate "Testing module to detect hardware-induced latencies"
9230 +       depends on DEBUG_FS
9231 +       depends on RING_BUFFER
9232 +       default m
9233 +       ---help---
9234 +         A simple hardware latency detector. Use this module to detect
9235 +         large latencies introduced by the behavior of the underlying
9236 +         system firmware external to Linux. We do this by periodically
9237 +         using stop_machine to grab all available CPUs and measure
9238 +         for unexplainable gaps in the CPU timestamp counter(s). By
9239 +         default, the module is not enabled until the "enable" file
9240 +         within the "hwlat_detector" debugfs directory is toggled.
9241 +
9242 +         This module is often used to detect SMI (System Management
9243 +         Interrupts) on x86 systems, though it is not x86 specific. To
9244 +         this end, we default to using a sample window of 1 second,
9245 +         during which we will sample for 0.5 seconds. If an SMI or
9246 +         similar event occurs during that time, it is recorded
9247 +         into an 8K-sample global ring buffer until retrieved.
9248 +
9249 +         WARNING: This software should never be enabled (it can be built
9250 +         but should not be turned on after it is loaded) in a production
9251 +         environment where high latencies are a concern since the
9252 +         sampling mechanism actually introduces latencies for
9253 +         regular tasks while the CPU(s) are being held.
9254 +
9255 +         If unsure, say N.
9256 +
9257  config PHANTOM
9258         tristate "Sensable PHANToM (PCI)"
9259         depends on PCI
9260 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
9261 index 537d7f3b78da..ec4aecba0656 100644
9262 --- a/drivers/misc/Makefile
9263 +++ b/drivers/misc/Makefile
9264 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
9265  obj-$(CONFIG_HMC6352)          += hmc6352.o
9266  obj-y                          += eeprom/
9267  obj-y                          += cb710/
9268 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
9269  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
9270  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
9271  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
9272 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
9273 new file mode 100644
9274 index 000000000000..52f5ad5fd9c0
9275 --- /dev/null
9276 +++ b/drivers/misc/hwlat_detector.c
9277 @@ -0,0 +1,1240 @@
9278 +/*
9279 + * hwlat_detector.c - A simple Hardware Latency detector.
9280 + *
9281 + * Use this module to detect large system latencies induced by the behavior of
9282 + * certain underlying system hardware or firmware, independent of Linux itself.
9283 + * The code was developed originally to detect the presence of SMIs on Intel
9284 + * and AMD systems, although there is no dependency upon x86 herein.
9285 + *
9286 + * The classical example usage of this module is in detecting the presence of
9287 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
9288 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
9289 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
9290 + * LPC (or other device) to generate a special interrupt under certain
9291 + * circumstances, for example, upon expiration of a special SMI timer device,
9292 + * due to certain external thermal readings, on certain I/O address accesses,
9293 + * and other situations. An SMI hits a special CPU pin, triggers a special
9294 + * SMI mode (complete with special memory map), and the OS is unaware.
9295 + *
9296 + * Although certain latency-inducing hardware mechanisms are necessary (for example,
9297 + * a modern system often requires an SMI handler for correct thermal control
9298 + * and remote management) they can wreak havoc upon any OS-level performance
9299 + * guarantees toward low-latency, especially when the OS is not even made
9300 + * aware of the presence of these interrupts. For this reason, we need a
9301 + * somewhat brute force mechanism to detect these interrupts. In this case,
9302 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
9303 + * sampling the built-in CPU timer, looking for discontiguous readings.
9304 + *
9305 + * WARNING: This implementation necessarily introduces latencies. Therefore,
9306 + *          you should NEVER use this module in a production environment
9307 + *          requiring any kind of low-latency performance guarantee(s).
9308 + *
9309 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
9310 + *
9311 + * Includes useful feedback from Clark Williams <clark@redhat.com>
9312 + *
9313 + * This file is licensed under the terms of the GNU General Public
9314 + * License version 2. This program is licensed "as is" without any
9315 + * warranty of any kind, whether express or implied.
9316 + */
9317 +
9318 +#include <linux/module.h>
9319 +#include <linux/init.h>
9320 +#include <linux/ring_buffer.h>
9321 +#include <linux/time.h>
9322 +#include <linux/hrtimer.h>
9323 +#include <linux/kthread.h>
9324 +#include <linux/debugfs.h>
9325 +#include <linux/seq_file.h>
9326 +#include <linux/uaccess.h>
9327 +#include <linux/version.h>
9328 +#include <linux/delay.h>
9329 +#include <linux/slab.h>
9330 +#include <linux/trace_clock.h>
9331 +
9332 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
9333 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
9334 +#define U64STR_SIZE            22                      /* 20 digits max */
9335 +
9336 +#define VERSION                        "1.0.0"
9337 +#define BANNER                 "hwlat_detector: "
9338 +#define DRVNAME                        "hwlat_detector"
9339 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
9340 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
9341 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
9342 +
9343 +/* Module metadata */
9344 +
9345 +MODULE_LICENSE("GPL");
9346 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
9347 +MODULE_DESCRIPTION("A simple hardware latency detector");
9348 +MODULE_VERSION(VERSION);
9349 +
9350 +/* Module parameters */
9351 +
9352 +static int debug;
9353 +static int enabled;
9354 +static int threshold;
9355 +
9356 +module_param(debug, int, 0);                   /* enable debug */
9357 +module_param(enabled, int, 0);                 /* enable detector */
9358 +module_param(threshold, int, 0);               /* latency threshold */
9359 +
9360 +/* Buffering and sampling */
9361 +
9362 +static struct ring_buffer *ring_buffer;                /* sample buffer */
9363 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
9364 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
9365 +static struct task_struct *kthread;            /* sampling thread */
9366 +
9367 +/* DebugFS filesystem entries */
9368 +
9369 +static struct dentry *debug_dir;               /* debugfs directory */
9370 +static struct dentry *debug_max;               /* maximum TSC delta */
9371 +static struct dentry *debug_count;             /* total detect count */
9372 +static struct dentry *debug_sample_width;      /* sample width us */
9373 +static struct dentry *debug_sample_window;     /* sample window us */
9374 +static struct dentry *debug_sample;            /* raw samples us */
9375 +static struct dentry *debug_threshold;         /* threshold us */
9376 +static struct dentry *debug_enable;            /* enable/disable */
9377 +
9378 +/* Individual samples and global state */
9379 +
9380 +struct sample;                                 /* latency sample */
9381 +struct data;                                   /* Global state */
9382 +
9383 +/* Sampling functions */
9384 +static int __buffer_add_sample(struct sample *sample);
9385 +static struct sample *buffer_get_sample(struct sample *sample);
9386 +
9387 +/* Threading and state */
9388 +static int kthread_fn(void *unused);
9389 +static int start_kthread(void);
9390 +static int stop_kthread(void);
9391 +static void __reset_stats(void);
9392 +static int init_stats(void);
9393 +
9394 +/* Debugfs interface */
9395 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9396 +                               size_t cnt, loff_t *ppos, const u64 *entry);
9397 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9398 +                                size_t cnt, loff_t *ppos, u64 *entry);
9399 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
9400 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
9401 +                                 size_t cnt, loff_t *ppos);
9402 +static int debug_sample_release(struct inode *inode, struct file *filp);
9403 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
9404 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9405 +                                 size_t cnt, loff_t *ppos);
9406 +static ssize_t debug_enable_fwrite(struct file *file,
9407 +                                  const char __user *user_buffer,
9408 +                                  size_t user_size, loff_t *offset);
9409 +
9410 +/* Initialization functions */
9411 +static int init_debugfs(void);
9412 +static void free_debugfs(void);
9413 +static int detector_init(void);
9414 +static void detector_exit(void);
9415 +
9416 +/* Individual latency samples are stored here when detected and packed into
9417 + * the ring_buffer circular buffer, where they are overwritten when
9418 + * more than buf_size/sizeof(sample) samples are received. */
9419 +struct sample {
9420 +       u64             seqnum;         /* unique sequence */
9421 +       u64             duration;       /* ktime delta */
9422 +       u64             outer_duration; /* ktime delta (outer loop) */
9423 +       struct timespec timestamp;      /* wall time */
9424 +       unsigned long   lost;
9425 +};
9426 +
9427 +/* keep the global state somewhere. */
9428 +static struct data {
9429 +
9430 +       struct mutex lock;              /* protect changes */
9431 +
9432 +       u64     count;                  /* total since reset */
9433 +       u64     max_sample;             /* max hardware latency */
9434 +       u64     threshold;              /* sample threshold level */
9435 +
9436 +       u64     sample_window;          /* total sampling window (on+off) */
9437 +       u64     sample_width;           /* active sampling portion of window */
9438 +
9439 +       atomic_t sample_open;           /* whether the sample file is open */
9440 +
9441 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
9442 +
9443 +} data;
9444 +
9445 +/**
9446 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
9447 + * @sample: The new latency sample value
9448 + *
9449 + * This receives a new latency sample and records it in a global ring buffer.
9450 + * No additional locking is used in this case.
9451 + */
9452 +static int __buffer_add_sample(struct sample *sample)
9453 +{
9454 +       return ring_buffer_write(ring_buffer,
9455 +                                sizeof(struct sample), sample);
9456 +}
9457 +
9458 +/**
9459 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
9460 + * @sample: Pre-allocated storage for the sample
9461 + *
9462 + * This retrieves a hardware latency sample from the global circular buffer
9463 + */
9464 +static struct sample *buffer_get_sample(struct sample *sample)
9465 +{
9466 +       struct ring_buffer_event *e = NULL;
9467 +       struct sample *s = NULL;
9468 +       unsigned int cpu = 0;
9469 +
9470 +       if (!sample)
9471 +               return NULL;
9472 +
9473 +       mutex_lock(&ring_buffer_mutex);
9474 +       for_each_online_cpu(cpu) {
9475 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
9476 +               if (e)
9477 +                       break;
9478 +       }
9479 +
9480 +       if (e) {
9481 +               s = ring_buffer_event_data(e);
9482 +               memcpy(sample, s, sizeof(struct sample));
9483 +       } else
9484 +               sample = NULL;
9485 +       mutex_unlock(&ring_buffer_mutex);
9486 +
9487 +       return sample;
9488 +}
9489 +
9490 +#ifndef CONFIG_TRACING
9491 +#define time_type      ktime_t
9492 +#define time_get()     ktime_get()
9493 +#define time_to_us(x)  ktime_to_us(x)
9494 +#define time_sub(a, b) ktime_sub(a, b)
9495 +#define init_time(a, b)        (a).tv64 = b
9496 +#define time_u64(a)    ((a).tv64)
9497 +#else
9498 +#define time_type      u64
9499 +#define time_get()     trace_clock_local()
9500 +#define time_to_us(x)  div_u64(x, 1000)
9501 +#define time_sub(a, b) ((a) - (b))
9502 +#define init_time(a, b)        (a = b)
9503 +#define time_u64(a)    a
9504 +#endif
9505 +/**
9506 + * get_sample - sample the CPU TSC and look for likely hardware latencies
9507 + *
9508 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
9509 + * hardware-induced latency. Called with interrupts disabled and with
9510 + * data.lock held.
9511 + */
9512 +static int get_sample(void)
9513 +{
9514 +       time_type start, t1, t2, last_t2;
9515 +       s64 diff, total = 0;
9516 +       u64 sample = 0;
9517 +       u64 outer_sample = 0;
9518 +       int ret = -1;
9519 +
9520 +       init_time(last_t2, 0);
9521 +       start = time_get(); /* start timestamp */
9522 +
9523 +       do {
9524 +
9525 +               t1 = time_get();        /* we'll look for a discontinuity */
9526 +               t2 = time_get();
9527 +
9528 +               if (time_u64(last_t2)) {
9529 +                       /* Check the delta from outer loop (t2 to next t1) */
9530 +                       diff = time_to_us(time_sub(t1, last_t2));
9531 +                       /* This shouldn't happen */
9532 +                       if (diff < 0) {
9533 +                               pr_err(BANNER "time running backwards\n");
9534 +                               goto out;
9535 +                       }
9536 +                       if (diff > outer_sample)
9537 +                               outer_sample = diff;
9538 +               }
9539 +               last_t2 = t2;
9540 +
9541 +               total = time_to_us(time_sub(t2, start)); /* sample width */
9542 +
9543 +               /* This checks the inner loop (t1 to t2) */
9544 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
9545 +
9546 +               /* This shouldn't happen */
9547 +               if (diff < 0) {
9548 +                       pr_err(BANNER "time running backwards\n");
9549 +                       goto out;
9550 +               }
9551 +
9552 +               if (diff > sample)
9553 +                       sample = diff; /* only want highest value */
9554 +
9555 +       } while (total <= data.sample_width);
9556 +
9557 +       ret = 0;
9558 +
9559 +       /* If we exceed the threshold value, we have found a hardware latency */
9560 +       if (sample > data.threshold || outer_sample > data.threshold) {
9561 +               struct sample s;
9562 +
9563 +               ret = 1;
9564 +
9565 +               data.count++;
9566 +               s.seqnum = data.count;
9567 +               s.duration = sample;
9568 +               s.outer_duration = outer_sample;
9569 +               s.timestamp = CURRENT_TIME;
9570 +               __buffer_add_sample(&s);
9571 +
9572 +               /* Keep a running maximum ever recorded hardware latency */
9573 +               if (sample > data.max_sample)
9574 +                       data.max_sample = sample;
9575 +       }
9576 +
9577 +out:
9578 +       return ret;
9579 +}
9580 +
9581 +/*
9582 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
9583 + * @unused: A required part of the kthread API.
9584 + *
9585 + * Used to periodically sample the CPU TSC via a call to get_sample. We
9586 + * disable interrupts, which does (intentionally) introduce latency since we
9587 + * need to ensure nothing else might be running (and thus pre-empting).
9588 + * Obviously this should never be used in production environments.
9589 + *
9590 + * Currently this runs on whichever CPU it was scheduled on, but most
9591 + * real-world hardware latency situations occur across several CPUs,
9592 + * though we might later generalize this if we find there are any actual
9593 + * systems with alternate SMI delivery or other hardware latencies.
9594 + */
9595 +static int kthread_fn(void *unused)
9596 +{
9597 +       int ret;
9598 +       u64 interval;
9599 +
9600 +       while (!kthread_should_stop()) {
9601 +
9602 +               mutex_lock(&data.lock);
9603 +
9604 +               local_irq_disable();
9605 +               ret = get_sample();
9606 +               local_irq_enable();
9607 +
9608 +               if (ret > 0)
9609 +                       wake_up(&data.wq); /* wake up reader(s) */
9610 +
9611 +               interval = data.sample_window - data.sample_width;
9612 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
9613 +
9614 +               mutex_unlock(&data.lock);
9615 +
9616 +               if (msleep_interruptible(interval))
9617 +                       break;
9618 +       }
9619 +
9620 +       return 0;
9621 +}
9622 +
9623 +/**
9624 + * start_kthread - Kick off the hardware latency sampling/detector kthread
9625 + *
9626 + * This starts a kernel thread that will sit and sample the CPU timestamp
9627 + * counter (TSC or similar) and look for potential hardware latencies.
9628 + */
9629 +static int start_kthread(void)
9630 +{
9631 +       kthread = kthread_run(kthread_fn, NULL,
9632 +                                       DRVNAME);
9633 +       if (IS_ERR(kthread)) {
9634 +               pr_err(BANNER "could not start sampling thread\n");
9635 +               enabled = 0;
9636 +               return -ENOMEM;
9637 +       }
9638 +
9639 +       return 0;
9640 +}
9641 +
9642 +/**
9643 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
9644 + *
9645 + * This kicks the running hardware latency sampling/detector kernel thread and
9646 + * tells it to stop sampling now. Use this on unload and at system shutdown.
9647 + */
9648 +static int stop_kthread(void)
9649 +{
9650 +       int ret;
9651 +
9652 +       ret = kthread_stop(kthread);
9653 +
9654 +       return ret;
9655 +}
9656 +
9657 +/**
9658 + * __reset_stats - Reset statistics for the hardware latency detector
9659 + *
9660 + * We use data to store various statistics and global state. We call this
9661 + * function in order to reset those when "enable" is toggled on or off, and
9662 + * also at initialization. Should be called with data.lock held.
9663 + */
9664 +static void __reset_stats(void)
9665 +{
9666 +       data.count = 0;
9667 +       data.max_sample = 0;
9668 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
9669 +}
9670 +
9671 +/**
9672 + * init_stats - Setup global state statistics for the hardware latency detector
9673 + *
9674 + * We use data to store various statistics and global state. We also use
9675 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
9676 + * induced system latencies. This function initializes these structures and
9677 + * allocates the global ring buffer also.
9678 + */
9679 +static int init_stats(void)
9680 +{
9681 +       int ret = -ENOMEM;
9682 +
9683 +       mutex_init(&data.lock);
9684 +       init_waitqueue_head(&data.wq);
9685 +       atomic_set(&data.sample_open, 0);
9686 +
9687 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
9688 +
9689 +       if (WARN(!ring_buffer, KERN_ERR BANNER
9690 +                              "failed to allocate ring buffer!\n"))
9691 +               goto out;
9692 +
9693 +       __reset_stats();
9694 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
9695 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
9696 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
9697 +
9698 +       ret = 0;
9699 +
9700 +out:
9701 +       return ret;
9702 +
9703 +}
9704 +
9705 +/*
9706 + * simple_data_read - Wrapper read function for global state debugfs entries
9707 + * @filp: The active open file structure for the debugfs "file"
9708 + * @ubuf: The userspace provided buffer to read value into
9709 + * @cnt: The maximum number of bytes to read
9710 + * @ppos: The current "file" position
9711 + * @entry: The entry to read from
9712 + *
9713 + * This function provides a generic read implementation for the global state
9714 + * "data" structure debugfs filesystem entries. It would be nice to use
9715 + * simple_attr_read directly, but we need to make sure that the data.lock
9716 + * is held during the actual read.
9717 + */
9718 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9719 +                               size_t cnt, loff_t *ppos, const u64 *entry)
9720 +{
9721 +       char buf[U64STR_SIZE];
9722 +       u64 val = 0;
9723 +       int len = 0;
9724 +
9725 +       memset(buf, 0, sizeof(buf));
9726 +
9727 +       if (!entry)
9728 +               return -EFAULT;
9729 +
9730 +       mutex_lock(&data.lock);
9731 +       val = *entry;
9732 +       mutex_unlock(&data.lock);
9733 +
9734 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
9735 +
9736 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
9737 +
9738 +}
9739 +
9740 +/*
9741 + * simple_data_write - Wrapper write function for global state debugfs entries
9742 + * @filp: The active open file structure for the debugfs "file"
9743 + * @ubuf: The userspace provided buffer to write value from
9744 + * @cnt: The maximum number of bytes to write
9745 + * @ppos: The current "file" position
9746 + * @entry: The entry to write to
9747 + *
9748 + * This function provides a generic write implementation for the global state
9749 + * "data" structure debugfs filesystem entries. It would be nice to use
9750 + * simple_attr_write directly, but we need to make sure that the data.lock
9751 + * is held during the actual write.
9752 + */
9753 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9754 +                                size_t cnt, loff_t *ppos, u64 *entry)
9755 +{
9756 +       char buf[U64STR_SIZE];
9757 +       int csize = min(cnt, sizeof(buf));
9758 +       u64 val = 0;
9759 +       int err = 0;
9760 +
9761 +       memset(buf, '\0', sizeof(buf));
9762 +       if (copy_from_user(buf, ubuf, csize))
9763 +               return -EFAULT;
9764 +
9765 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
9766 +       err = kstrtoull(buf, 10, &val);
9767 +       if (err)
9768 +               return -EINVAL;
9769 +
9770 +       mutex_lock(&data.lock);
9771 +       *entry = val;
9772 +       mutex_unlock(&data.lock);
9773 +
9774 +       return csize;
9775 +}
9776 +
9777 +/**
9778 + * debug_count_fopen - Open function for "count" debugfs entry
9779 + * @inode: The in-kernel inode representation of the debugfs "file"
9780 + * @filp: The active open file structure for the debugfs "file"
9781 + *
9782 + * This function provides an open implementation for the "count" debugfs
9783 + * interface to the hardware latency detector.
9784 + */
9785 +static int debug_count_fopen(struct inode *inode, struct file *filp)
9786 +{
9787 +       return 0;
9788 +}
9789 +
9790 +/**
9791 + * debug_count_fread - Read function for "count" debugfs entry
9792 + * @filp: The active open file structure for the debugfs "file"
9793 + * @ubuf: The userspace provided buffer to read value into
9794 + * @cnt: The maximum number of bytes to read
9795 + * @ppos: The current "file" position
9796 + *
9797 + * This function provides a read implementation for the "count" debugfs
9798 + * interface to the hardware latency detector. Can be used to read the
9799 + * number of latency readings exceeding the configured threshold since
9800 + * the detector was last reset (e.g. by writing a zero into "count").
9801 + */
9802 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
9803 +                                    size_t cnt, loff_t *ppos)
9804 +{
9805 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
9806 +}
9807 +
9808 +/**
9809 + * debug_count_fwrite - Write function for "count" debugfs entry
9810 + * @filp: The active open file structure for the debugfs "file"
9811 + * @ubuf: The user buffer that contains the value to write
9812 + * @cnt: The maximum number of bytes to write to "file"
9813 + * @ppos: The current position in the debugfs "file"
9814 + *
9815 + * This function provides a write implementation for the "count" debugfs
9816 + * interface to the hardware latency detector. Can be used to write a
9817 + * desired value, especially to zero the total count.
9818 + */
9819 +static ssize_t  debug_count_fwrite(struct file *filp,
9820 +                                      const char __user *ubuf,
9821 +                                      size_t cnt,
9822 +                                      loff_t *ppos)
9823 +{
9824 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
9825 +}
9826 +
9827 +/**
9828 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
9829 + * @inode: The in-kernel inode representation of the debugfs "file"
9830 + * @filp: The active open file structure for the debugfs "file"
9831 + *
9832 + * This function provides an open implementation for the "enable" debugfs
9833 + * interface to the hardware latency detector.
9834 + */
9835 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
9836 +{
9837 +       return 0;
9838 +}
9839 +
9840 +/**
9841 + * debug_enable_fread - Read function for "enable" debugfs interface
9842 + * @filp: The active open file structure for the debugfs "file"
9843 + * @ubuf: The userspace provided buffer to read value into
9844 + * @cnt: The maximum number of bytes to read
9845 + * @ppos: The current "file" position
9846 + *
9847 + * This function provides a read implementation for the "enable" debugfs
9848 + * interface to the hardware latency detector. Can be used to determine
9849 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
9850 + */
9851 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9852 +                                     size_t cnt, loff_t *ppos)
9853 +{
9854 +       char buf[4];
9855 +
9856 +       if ((cnt < sizeof(buf)) || (*ppos))
9857 +               return 0;
9858 +
9859 +       buf[0] = enabled ? '1' : '0';
9860 +       buf[1] = '\n';
9861 +       buf[2] = '\0';
9862 +       if (copy_to_user(ubuf, buf, strlen(buf)))
9863 +               return -EFAULT;
9864 +       return *ppos = strlen(buf);
9865 +}
9866 +
9867 +/**
9868 + * debug_enable_fwrite - Write function for "enable" debugfs interface
9869 + * @filp: The active open file structure for the debugfs "file"
9870 + * @ubuf: The user buffer that contains the value to write
9871 + * @cnt: The maximum number of bytes to write to "file"
9872 + * @ppos: The current position in the debugfs "file"
9873 + *
9874 + * This function provides a write implementation for the "enable" debugfs
9875 + * interface to the hardware latency detector. Can be used to enable or
9876 + * disable the detector, which will have the side-effect of possibly
9877 + * also resetting the global stats and kicking off the measuring
9878 + * kthread (on an enable) or the converse (upon a disable).
9879 + */
9880 +static ssize_t  debug_enable_fwrite(struct file *filp,
9881 +                                       const char __user *ubuf,
9882 +                                       size_t cnt,
9883 +                                       loff_t *ppos)
9884 +{
9885 +       char buf[4];
9886 +       int csize = min(cnt, sizeof(buf));
9887 +       long val = 0;
9888 +       int err = 0;
9889 +
9890 +       memset(buf, '\0', sizeof(buf));
9891 +       if (copy_from_user(buf, ubuf, csize))
9892 +               return -EFAULT;
9893 +
9894 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
9895 +       err = kstrtoul(buf, 10, &val);
9896 +       if (err)
9897 +               return -EINVAL;
9898 +
9899 +       if (val) {
9900 +               if (enabled)
9901 +                       goto unlock;
9902 +               enabled = 1;
9903 +               __reset_stats();
9904 +               if (start_kthread())
9905 +                       return -EFAULT;
9906 +       } else {
9907 +               if (!enabled)
9908 +                       goto unlock;
9909 +               enabled = 0;
9910 +               err = stop_kthread();
9911 +               if (err) {
9912 +                       pr_err(BANNER "cannot stop kthread\n");
9913 +                       return -EFAULT;
9914 +               }
9915 +               wake_up(&data.wq);              /* reader(s) should return */
9916 +       }
9917 +unlock:
9918 +       return csize;
9919 +}
9920 +
9921 +/**
9922 + * debug_max_fopen - Open function for "max" debugfs entry
9923 + * @inode: The in-kernel inode representation of the debugfs "file"
9924 + * @filp: The active open file structure for the debugfs "file"
9925 + *
9926 + * This function provides an open implementation for the "max" debugfs
9927 + * interface to the hardware latency detector.
9928 + */
9929 +static int debug_max_fopen(struct inode *inode, struct file *filp)
9930 +{
9931 +       return 0;
9932 +}
9933 +
9934 +/**
9935 + * debug_max_fread - Read function for "max" debugfs entry
9936 + * @filp: The active open file structure for the debugfs "file"
9937 + * @ubuf: The userspace provided buffer to read value into
9938 + * @cnt: The maximum number of bytes to read
9939 + * @ppos: The current "file" position
9940 + *
9941 + * This function provides a read implementation for the "max" debugfs
9942 + * interface to the hardware latency detector. Can be used to determine
9943 + * the maximum latency value observed since it was last reset.
9944 + */
9945 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
9946 +                                  size_t cnt, loff_t *ppos)
9947 +{
9948 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
9949 +}
9950 +
9951 +/**
9952 + * debug_max_fwrite - Write function for "max" debugfs entry
9953 + * @filp: The active open file structure for the debugfs "file"
9954 + * @ubuf: The user buffer that contains the value to write
9955 + * @cnt: The maximum number of bytes to write to "file"
9956 + * @ppos: The current position in the debugfs "file"
9957 + *
9958 + * This function provides a write implementation for the "max" debugfs
9959 + * interface to the hardware latency detector. Can be used to reset the
9960 + * maximum or set it to some other desired value - if subsequent
9961 + * measurements exceed this value, the maximum will be updated.
9962 + */
9963 +static ssize_t  debug_max_fwrite(struct file *filp,
9964 +                                    const char __user *ubuf,
9965 +                                    size_t cnt,
9966 +                                    loff_t *ppos)
9967 +{
9968 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
9969 +}
9970 +
9971 +
9972 +/**
9973 + * debug_sample_fopen - An open function for "sample" debugfs interface
9974 + * @inode: The in-kernel inode representation of this debugfs "file"
9975 + * @filp: The active open file structure for the debugfs "file"
9976 + *
9977 + * This function handles opening the "sample" file within the hardware
9978 + * latency detector debugfs directory interface. This file is used to read
9979 + * raw samples from the global ring_buffer and allows the user to see a
9980 + * running latency history. Can be opened blocking or non-blocking,
9981 + * affecting whether it behaves as a buffer read pipe, or does not.
9982 + * Implements simple locking to prevent multiple simultaneous use.
9983 + */
9984 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
9985 +{
9986 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
9987 +               return -EBUSY;
9988 +       else
9989 +               return 0;
9990 +}
9991 +
9992 +/**
9993 + * debug_sample_fread - A read function for "sample" debugfs interface
9994 + * @filp: The active open file structure for the debugfs "file"
9995 + * @ubuf: The user buffer that will contain the samples read
9996 + * @cnt: The maximum bytes to read from the debugfs "file"
9997 + * @ppos: The current position in the debugfs "file"
9998 + *
9999 + * This function handles reading from the "sample" file within the hardware
10000 + * latency detector debugfs directory interface. This file is used to read
10001 + * raw samples from the global ring_buffer and allows the user to see a
10002 + * running latency history. By default this will block pending a new
10003 + * value written into the sample buffer, unless there are already a
10004 + * number of value(s) waiting in the buffer, or the sample file was
10005 + * previously opened in a non-blocking mode of operation.
10006 + */
10007 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
10008 +                                       size_t cnt, loff_t *ppos)
10009 +{
10010 +       int len = 0;
10011 +       char buf[64];
10012 +       struct sample *sample = NULL;
10013 +
10014 +       if (!enabled)
10015 +               return 0;
10016 +
10017 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
10018 +       if (!sample)
10019 +               return -ENOMEM;
10020 +
10021 +       while (!buffer_get_sample(sample)) {
10022 +
10023 +               DEFINE_WAIT(wait);
10024 +
10025 +               if (filp->f_flags & O_NONBLOCK) {
10026 +                       len = -EAGAIN;
10027 +                       goto out;
10028 +               }
10029 +
10030 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
10031 +               schedule();
10032 +               finish_wait(&data.wq, &wait);
10033 +
10034 +               if (signal_pending(current)) {
10035 +                       len = -EINTR;
10036 +                       goto out;
10037 +               }
10038 +
10039 +               if (!enabled) {                 /* enable was toggled */
10040 +                       len = 0;
10041 +                       goto out;
10042 +               }
10043 +       }
10044 +
10045 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
10046 +                      sample->timestamp.tv_sec,
10047 +                      sample->timestamp.tv_nsec,
10048 +                      sample->duration,
10049 +                      sample->outer_duration);
10050 +
10051 +
10052 +       /* handling partial reads is more trouble than it's worth */
10053 +       if (len > cnt)
10054 +               goto out;
10055 +
10056 +       if (copy_to_user(ubuf, buf, len))
10057 +               len = -EFAULT;
10058 +
10059 +out:
10060 +       kfree(sample);
10061 +       return len;
10062 +}
10063 +
10064 +/**
10065 + * debug_sample_release - Release function for "sample" debugfs interface
10066 + * @inode: The in-kernel inode representation of the debugfs "file"
10067 + * @filp: The active open file structure for the debugfs "file"
10068 + *
10069 + * This function completes the close of the debugfs interface "sample" file.
10070 + * Frees the sample_open "lock" so that other users may open the interface.
10071 + */
10072 +static int debug_sample_release(struct inode *inode, struct file *filp)
10073 +{
10074 +       atomic_dec(&data.sample_open);
10075 +
10076 +       return 0;
10077 +}
10078 +
10079 +/**
10080 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
10081 + * @inode: The in-kernel inode representation of the debugfs "file"
10082 + * @filp: The active open file structure for the debugfs "file"
10083 + *
10084 + * This function provides an open implementation for the "threshold" debugfs
10085 + * interface to the hardware latency detector.
10086 + */
10087 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
10088 +{
10089 +       return 0;
10090 +}
10091 +
10092 +/**
10093 + * debug_threshold_fread - Read function for "threshold" debugfs entry
10094 + * @filp: The active open file structure for the debugfs "file"
10095 + * @ubuf: The userspace provided buffer to read value into
10096 + * @cnt: The maximum number of bytes to read
10097 + * @ppos: The current "file" position
10098 + *
10099 + * This function provides a read implementation for the "threshold" debugfs
10100 + * interface to the hardware latency detector. It can be used to determine
10101 + * the current threshold level at which a latency will be recorded in the
10102 + * global ring buffer, typically on the order of 10us.
10103 + */
10104 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
10105 +                                        size_t cnt, loff_t *ppos)
10106 +{
10107 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
10108 +}
10109 +
10110 +/**
10111 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
10112 + * @filp: The active open file structure for the debugfs "file"
10113 + * @ubuf: The user buffer that contains the value to write
10114 + * @cnt: The maximum number of bytes to write to "file"
10115 + * @ppos: The current position in the debugfs "file"
10116 + *
10117 + * This function provides a write implementation for the "threshold" debugfs
10118 + * interface to the hardware latency detector. It can be used to configure
10119 + * the threshold level at which any subsequently detected latencies will
10120 + * be recorded into the global ring buffer.
10121 + */
10122 +static ssize_t  debug_threshold_fwrite(struct file *filp,
10123 +                                       const char __user *ubuf,
10124 +                                       size_t cnt,
10125 +                                       loff_t *ppos)
10126 +{
10127 +       int ret;
10128 +
10129 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
10130 +
10131 +       if (enabled)
10132 +               wake_up_process(kthread);
10133 +
10134 +       return ret;
10135 +}
10136 +
10137 +/**
10138 + * debug_width_fopen - Open function for "width" debugfs entry
10139 + * @inode: The in-kernel inode representation of the debugfs "file"
10140 + * @filp: The active open file structure for the debugfs "file"
10141 + *
10142 + * This function provides an open implementation for the "width" debugfs
10143 + * interface to the hardware latency detector.
10144 + */
10145 +static int debug_width_fopen(struct inode *inode, struct file *filp)
10146 +{
10147 +       return 0;
10148 +}
10149 +
10150 +/**
10151 + * debug_width_fread - Read function for "width" debugfs entry
10152 + * @filp: The active open file structure for the debugfs "file"
10153 + * @ubuf: The userspace provided buffer to read value into
10154 + * @cnt: The maximum number of bytes to read
10155 + * @ppos: The current "file" position
10156 + *
10157 + * This function provides a read implementation for the "width" debugfs
10158 + * interface to the hardware latency detector. It can be used to determine
10159 + * for how many us of the total window we will actively sample for any
10160 + * hardware-induced latency periods. Obviously, it is not possible to
10161 + * sample constantly and have the system respond to a sample reader, or,
10162 + * worse, without having the system appear to have gone out to lunch.
10163 + */
10164 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
10165 +                                    size_t cnt, loff_t *ppos)
10166 +{
10167 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
10168 +}
10169 +
10170 +/**
10171 + * debug_width_fwrite - Write function for "width" debugfs entry
10172 + * @filp: The active open file structure for the debugfs "file"
10173 + * @ubuf: The user buffer that contains the value to write
10174 + * @cnt: The maximum number of bytes to write to "file"
10175 + * @ppos: The current position in the debugfs "file"
10176 + *
10177 + * This function provides a write implementation for the "width" debugfs
10178 + * interface to the hardware latency detector. It can be used to configure
10179 + * for how many us of the total window we will actively sample for any
10180 + * hardware-induced latency periods. Obviously, it is not possible to
10181 + * sample constantly and have the system respond to a sample reader, or,
10182 + * worse, without having the system appear to have gone out to lunch. It
10183 + * is enforced that width is less than the total window size.
10184 + */
10185 +static ssize_t  debug_width_fwrite(struct file *filp,
10186 +                                      const char __user *ubuf,
10187 +                                      size_t cnt,
10188 +                                      loff_t *ppos)
10189 +{
10190 +       char buf[U64STR_SIZE];
10191 +       int csize = min(cnt, sizeof(buf));
10192 +       u64 val = 0;
10193 +       int err = 0;
10194 +
10195 +       memset(buf, '\0', sizeof(buf));
10196 +       if (copy_from_user(buf, ubuf, csize))
10197 +               return -EFAULT;
10198 +
10199 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10200 +       err = kstrtoull(buf, 10, &val);
10201 +       if (err)
10202 +               return -EINVAL;
10203 +
10204 +       mutex_lock(&data.lock);
10205 +       if (val < data.sample_window)
10206 +               data.sample_width = val;
10207 +       else {
10208 +               mutex_unlock(&data.lock);
10209 +               return -EINVAL;
10210 +       }
10211 +       mutex_unlock(&data.lock);
10212 +
10213 +       if (enabled)
10214 +               wake_up_process(kthread);
10215 +
10216 +       return csize;
10217 +}
10218 +
10219 +/**
10220 + * debug_window_fopen - Open function for "window" debugfs entry
10221 + * @inode: The in-kernel inode representation of the debugfs "file"
10222 + * @filp: The active open file structure for the debugfs "file"
10223 + *
10224 + * This function provides an open implementation for the "window" debugfs
10225 + * interface to the hardware latency detector. The window is the total time
10226 + * in us that will be considered one sample period. Conceptually, windows
10227 + * occur back-to-back and contain a sample width period during which
10228 + * actual sampling occurs.
10229 + */
10230 +static int debug_window_fopen(struct inode *inode, struct file *filp)
10231 +{
10232 +       return 0;
10233 +}
10234 +
10235 +/**
10236 + * debug_window_fread - Read function for "window" debugfs entry
10237 + * @filp: The active open file structure for the debugfs "file"
10238 + * @ubuf: The userspace provided buffer to read value into
10239 + * @cnt: The maximum number of bytes to read
10240 + * @ppos: The current "file" position
10241 + *
10242 + * This function provides a read implementation for the "window" debugfs
10243 + * interface to the hardware latency detector. The window is the total time
10244 + * in us that will be considered one sample period. Conceptually, windows
10245 + * occur back-to-back and contain a sample width period during which
10246 + * actual sampling occurs. Can be used to read the total window size.
10247 + */
10248 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
10249 +                                     size_t cnt, loff_t *ppos)
10250 +{
10251 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
10252 +}
10253 +
10254 +/**
10255 + * debug_window_fwrite - Write function for "window" debugfs entry
10256 + * @filp: The active open file structure for the debugfs "file"
10257 + * @ubuf: The user buffer that contains the value to write
10258 + * @cnt: The maximum number of bytes to write to "file"
10259 + * @ppos: The current position in the debugfs "file"
10260 + *
10261 + * This function provides a write implementation for the "window" debugfs
10262 + * interface to the hardware latency detector. The window is the total time
10263 + * in us that will be considered one sample period. Conceptually, windows
10264 + * occur back-to-back and contain a sample width period during which
10265 + * actual sampling occurs. Can be used to write a new total window size. It
10266 + * is enforced that any value written must be greater than the sample width
10267 + * size, or an error results.
10268 + */
10269 +static ssize_t  debug_window_fwrite(struct file *filp,
10270 +                                       const char __user *ubuf,
10271 +                                       size_t cnt,
10272 +                                       loff_t *ppos)
10273 +{
10274 +       char buf[U64STR_SIZE];
10275 +       int csize = min(cnt, sizeof(buf));
10276 +       u64 val = 0;
10277 +       int err = 0;
10278 +
10279 +       memset(buf, '\0', sizeof(buf));
10280 +       if (copy_from_user(buf, ubuf, csize))
10281 +               return -EFAULT;
10282 +
10283 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10284 +       err = kstrtoull(buf, 10, &val);
10285 +       if (err)
10286 +               return -EINVAL;
10287 +
10288 +       mutex_lock(&data.lock);
10289 +       if (data.sample_width < val)
10290 +               data.sample_window = val;
10291 +       else {
10292 +               mutex_unlock(&data.lock);
10293 +               return -EINVAL;
10294 +       }
10295 +       mutex_unlock(&data.lock);
10296 +
10297 +       return csize;
10298 +}
10299 +
10300 +/*
10301 + * Function pointers for the "count" debugfs file operations
10302 + */
10303 +static const struct file_operations count_fops = {
10304 +       .open           = debug_count_fopen,
10305 +       .read           = debug_count_fread,
10306 +       .write          = debug_count_fwrite,
10307 +       .owner          = THIS_MODULE,
10308 +};
10309 +
10310 +/*
10311 + * Function pointers for the "enable" debugfs file operations
10312 + */
10313 +static const struct file_operations enable_fops = {
10314 +       .open           = debug_enable_fopen,
10315 +       .read           = debug_enable_fread,
10316 +       .write          = debug_enable_fwrite,
10317 +       .owner          = THIS_MODULE,
10318 +};
10319 +
10320 +/*
10321 + * Function pointers for the "max" debugfs file operations
10322 + */
10323 +static const struct file_operations max_fops = {
10324 +       .open           = debug_max_fopen,
10325 +       .read           = debug_max_fread,
10326 +       .write          = debug_max_fwrite,
10327 +       .owner          = THIS_MODULE,
10328 +};
10329 +
10330 +/*
10331 + * Function pointers for the "sample" debugfs file operations
10332 + */
10333 +static const struct file_operations sample_fops = {
10334 +       .open           = debug_sample_fopen,
10335 +       .read           = debug_sample_fread,
10336 +       .release        = debug_sample_release,
10337 +       .owner          = THIS_MODULE,
10338 +};
10339 +
10340 +/*
10341 + * Function pointers for the "threshold" debugfs file operations
10342 + */
10343 +static const struct file_operations threshold_fops = {
10344 +       .open           = debug_threshold_fopen,
10345 +       .read           = debug_threshold_fread,
10346 +       .write          = debug_threshold_fwrite,
10347 +       .owner          = THIS_MODULE,
10348 +};
10349 +
10350 +/*
10351 + * Function pointers for the "width" debugfs file operations
10352 + */
10353 +static const struct file_operations width_fops = {
10354 +       .open           = debug_width_fopen,
10355 +       .read           = debug_width_fread,
10356 +       .write          = debug_width_fwrite,
10357 +       .owner          = THIS_MODULE,
10358 +};
10359 +
10360 +/*
10361 + * Function pointers for the "window" debugfs file operations
10362 + */
10363 +static const struct file_operations window_fops = {
10364 +       .open           = debug_window_fopen,
10365 +       .read           = debug_window_fread,
10366 +       .write          = debug_window_fwrite,
10367 +       .owner          = THIS_MODULE,
10368 +};
10369 +
10370 +/**
10371 + * init_debugfs - A function to initialize the debugfs interface files
10372 + *
10373 + * This function creates entries in debugfs for "hwlat_detector", including
10374 + * files to read values from the detector, current samples, and the
10375 + * maximum sample that has been captured since the hardware latency
10376 + * detector was started.
10377 + */
10378 +static int init_debugfs(void)
10379 +{
10380 +       int ret = -ENOMEM;
10381 +
10382 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
10383 +       if (!debug_dir)
10384 +               goto err_debug_dir;
10385 +
10386 +       debug_sample = debugfs_create_file("sample", 0444,
10387 +                                              debug_dir, NULL,
10388 +                                              &sample_fops);
10389 +       if (!debug_sample)
10390 +               goto err_sample;
10391 +
10392 +       debug_count = debugfs_create_file("count", 0444,
10393 +                                             debug_dir, NULL,
10394 +                                             &count_fops);
10395 +       if (!debug_count)
10396 +               goto err_count;
10397 +
10398 +       debug_max = debugfs_create_file("max", 0444,
10399 +                                           debug_dir, NULL,
10400 +                                           &max_fops);
10401 +       if (!debug_max)
10402 +               goto err_max;
10403 +
10404 +       debug_sample_window = debugfs_create_file("window", 0644,
10405 +                                                     debug_dir, NULL,
10406 +                                                     &window_fops);
10407 +       if (!debug_sample_window)
10408 +               goto err_window;
10409 +
10410 +       debug_sample_width = debugfs_create_file("width", 0644,
10411 +                                                    debug_dir, NULL,
10412 +                                                    &width_fops);
10413 +       if (!debug_sample_width)
10414 +               goto err_width;
10415 +
10416 +       debug_threshold = debugfs_create_file("threshold", 0644,
10417 +                                                 debug_dir, NULL,
10418 +                                                 &threshold_fops);
10419 +       if (!debug_threshold)
10420 +               goto err_threshold;
10421 +
10422 +       debug_enable = debugfs_create_file("enable", 0644,
10423 +                                              debug_dir, &enabled,
10424 +                                              &enable_fops);
10425 +       if (!debug_enable)
10426 +               goto err_enable;
10427 +
10428 +       else {
10429 +               ret = 0;
10430 +               goto out;
10431 +       }
10432 +
10433 +err_enable:
10434 +       debugfs_remove(debug_threshold);
10435 +err_threshold:
10436 +       debugfs_remove(debug_sample_width);
10437 +err_width:
10438 +       debugfs_remove(debug_sample_window);
10439 +err_window:
10440 +       debugfs_remove(debug_max);
10441 +err_max:
10442 +       debugfs_remove(debug_count);
10443 +err_count:
10444 +       debugfs_remove(debug_sample);
10445 +err_sample:
10446 +       debugfs_remove(debug_dir);
10447 +err_debug_dir:
10448 +out:
10449 +       return ret;
10450 +}
10451 +
10452 +/**
10453 + * free_debugfs - A function to cleanup the debugfs file interface
10454 + */
10455 +static void free_debugfs(void)
10456 +{
10457 +       /* could also use a debugfs_remove_recursive */
10458 +       debugfs_remove(debug_enable);
10459 +       debugfs_remove(debug_threshold);
10460 +       debugfs_remove(debug_sample_width);
10461 +       debugfs_remove(debug_sample_window);
10462 +       debugfs_remove(debug_max);
10463 +       debugfs_remove(debug_count);
10464 +       debugfs_remove(debug_sample);
10465 +       debugfs_remove(debug_dir);
10466 +}
10467 +
10468 +/**
10469 + * detector_init - Standard module initialization code
10470 + */
10471 +static int detector_init(void)
10472 +{
10473 +       int ret = -ENOMEM;
10474 +
10475 +       pr_info(BANNER "version %s\n", VERSION);
10476 +
10477 +       ret = init_stats();
10478 +       if (ret)
10479 +               goto out;
10480 +
10481 +       ret = init_debugfs();
10482 +       if (ret)
10483 +               goto err_stats;
10484 +
10485 +       if (enabled)
10486 +               ret = start_kthread();
10487 +
10488 +       goto out;
10489 +
10490 +err_stats:
10491 +       ring_buffer_free(ring_buffer);
10492 +out:
10493 +       return ret;
10494 +
10495 +}
10496 +
10497 +/**
10498 + * detector_exit - Standard module cleanup code
10499 + */
10500 +static void detector_exit(void)
10501 +{
10502 +       int err;
10503 +
10504 +       if (enabled) {
10505 +               enabled = 0;
10506 +               err = stop_kthread();
10507 +               if (err)
10508 +                       pr_err(BANNER "cannot stop kthread\n");
10509 +       }
10510 +
10511 +       free_debugfs();
10512 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
10513 +
10514 +}
10515 +
10516 +module_init(detector_init);
10517 +module_exit(detector_exit);
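
The debugfs files created above can be exercised from user space with ordinary open()/read() calls. Below is a minimal sketch, assuming debugfs is mounted at /sys/kernel/debug and the module was loaded with enabled=1; the directory name, file names and sample format follow the code above and may differ on a given system. A blocking read of "sample" behaves like a pipe; opening it with O_NONBLOCK returns -EAGAIN when no sample is pending.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define HWLAT_DIR "/sys/kernel/debug/hwlat_detector"   /* assumed debugfs mount point */

/* Read one decimal u64 from a debugfs file such as "count" or "max". */
static unsigned long long read_u64(const char *name)
{
        char path[256], buf[32];
        ssize_t n;
        int fd;

        snprintf(path, sizeof(path), HWLAT_DIR "/%s", name);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        n = read(fd, buf, sizeof(buf) - 1);
        close(fd);
        if (n <= 0)
                return 0;
        buf[n] = '\0';
        return strtoull(buf, NULL, 10);
}

int main(void)
{
        char sample[128];
        ssize_t n;
        int fd;

        printf("count: %llu  max: %llu usecs\n",
               read_u64("count"), read_u64("max"));

        /* A blocking read on "sample" waits for the next latency record. */
        fd = open(HWLAT_DIR "/sample", O_RDONLY);
        if (fd < 0) {
                perror(HWLAT_DIR "/sample");
                return EXIT_FAILURE;
        }
        n = read(fd, sample, sizeof(sample) - 1);
        if (n > 0) {
                sample[n] = '\0';
                fputs(sample, stdout);  /* "<sec>.<nsec>\t<inner>\t<outer>\n" */
        }
        close(fd);
        return 0;
}
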
10518 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
10519 index acece3299756..58ea04a03fa9 100644
10520 --- a/drivers/mmc/host/mmci.c
10521 +++ b/drivers/mmc/host/mmci.c
10522 @@ -1155,15 +1155,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10523         struct sg_mapping_iter *sg_miter = &host->sg_miter;
10524         struct variant_data *variant = host->variant;
10525         void __iomem *base = host->base;
10526 -       unsigned long flags;
10527         u32 status;
10528  
10529         status = readl(base + MMCISTATUS);
10530  
10531         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
10532  
10533 -       local_irq_save(flags);
10534 -
10535         do {
10536                 unsigned int remain, len;
10537                 char *buffer;
10538 @@ -1203,8 +1200,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10539  
10540         sg_miter_stop(sg_miter);
10541  
10542 -       local_irq_restore(flags);
10543 -
10544         /*
10545          * If we have less than the fifo 'half-full' threshold to transfer,
10546          * trigger a PIO interrupt as soon as any data is available.
10547 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
10548 index 2839af00f20c..4348b9c850d3 100644
10549 --- a/drivers/net/ethernet/3com/3c59x.c
10550 +++ b/drivers/net/ethernet/3com/3c59x.c
10551 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
10552  {
10553         struct vortex_private *vp = netdev_priv(dev);
10554         unsigned long flags;
10555 -       local_irq_save(flags);
10556 +       local_irq_save_nort(flags);
10557         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
10558 -       local_irq_restore(flags);
10559 +       local_irq_restore_nort(flags);
10560  }
10561  #endif
10562  
10563 @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev)
10564                          * Block interrupts because vortex_interrupt does a bare spin_lock()
10565                          */
10566                         unsigned long flags;
10567 -                       local_irq_save(flags);
10568 +                       local_irq_save_nort(flags);
10569                         if (vp->full_bus_master_tx)
10570                                 boomerang_interrupt(dev->irq, dev);
10571                         else
10572                                 vortex_interrupt(dev->irq, dev);
10573 -                       local_irq_restore(flags);
10574 +                       local_irq_restore_nort(flags);
10575                 }
10576         }
10577  
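
The local_irq_save_nort()/local_irq_restore_nort() pair used in this and several later hunks is introduced elsewhere in the RT series and is not shown in this excerpt. The following is only a sketch of the assumed semantics, not the authoritative definition:

/*
 * Illustrative definitions of the *_nort helpers used above. On a
 * stock kernel they are plain local_irq_save()/local_irq_restore();
 * on PREEMPT_RT_FULL they only record the flags and leave interrupts
 * enabled, because the protected code may acquire sleeping locks.
 */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)     do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)  (void)(flags)
#else
# define local_irq_save_nort(flags)     local_irq_save(flags)
# define local_irq_restore_nort(flags)  local_irq_restore(flags)
#endif
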
10578 diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10579 index 8b5988e210d5..cf9928ccdd7e 100644
10580 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10581 +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10582 @@ -2221,11 +2221,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
10583         }
10584  
10585         tpd_req = atl1c_cal_tpd_req(skb);
10586 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
10587 -               if (netif_msg_pktdata(adapter))
10588 -                       dev_info(&adapter->pdev->dev, "tx locked\n");
10589 -               return NETDEV_TX_LOCKED;
10590 -       }
10591 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10592  
10593         if (atl1c_tpd_avail(adapter, type) < tpd_req) {
10594                 /* no enough descriptor, just stop queue */
10595 diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10596 index 59a03a193e83..734f7a7ad2c3 100644
10597 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10598 +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10599 @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
10600                 return NETDEV_TX_OK;
10601         }
10602         tpd_req = atl1e_cal_tdp_req(skb);
10603 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
10604 -               return NETDEV_TX_LOCKED;
10605 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10606  
10607         if (atl1e_tpd_avail(adapter) < tpd_req) {
10608                 /* no enough descriptor, just stop queue */
10609 diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
10610 index 526ea74e82d9..86f467a2c485 100644
10611 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c
10612 +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
10613 @@ -1664,8 +1664,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
10614         struct cmdQ *q = &sge->cmdQ[qid];
10615         unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
10616  
10617 -       if (!spin_trylock(&q->lock))
10618 -               return NETDEV_TX_LOCKED;
10619 +       spin_lock(&q->lock);
10620  
10621         reclaim_completed_tx(sge, q);
10622  
10623 diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
10624 index 9ba975853ec6..813cfa698160 100644
10625 --- a/drivers/net/ethernet/neterion/s2io.c
10626 +++ b/drivers/net/ethernet/neterion/s2io.c
10627 @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
10628                         [skb->priority & (MAX_TX_FIFOS - 1)];
10629         fifo = &mac_control->fifos[queue];
10630  
10631 -       if (do_spin_lock)
10632 -               spin_lock_irqsave(&fifo->tx_lock, flags);
10633 -       else {
10634 -               if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
10635 -                       return NETDEV_TX_LOCKED;
10636 -       }
10637 +       spin_lock_irqsave(&fifo->tx_lock, flags);
10638  
10639         if (sp->config.multiq) {
10640                 if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
10641 diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10642 index 3b98b263bad0..ca4add749410 100644
10643 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10644 +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10645 @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
10646         struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
10647         unsigned long flags;
10648  
10649 -       if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
10650 -               /* Collision - tell upper layer to requeue */
10651 -               return NETDEV_TX_LOCKED;
10652 -       }
10653 +       spin_lock_irqsave(&tx_ring->tx_lock, flags);
10654 +
10655         if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
10656                 netif_stop_queue(netdev);
10657                 spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
10658 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
10659 index ef668d300800..d987d571fdd6 100644
10660 --- a/drivers/net/ethernet/realtek/8139too.c
10661 +++ b/drivers/net/ethernet/realtek/8139too.c
10662 @@ -2229,7 +2229,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
10663         struct rtl8139_private *tp = netdev_priv(dev);
10664         const int irq = tp->pci_dev->irq;
10665  
10666 -       disable_irq(irq);
10667 +       disable_irq_nosync(irq);
10668         rtl8139_interrupt(irq, dev);
10669         enable_irq(irq);
10670  }
10671 diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
10672 index 14c9d1baa85c..e1a5305418a8 100644
10673 --- a/drivers/net/ethernet/tehuti/tehuti.c
10674 +++ b/drivers/net/ethernet/tehuti/tehuti.c
10675 @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
10676         unsigned long flags;
10677  
10678         ENTER;
10679 -       local_irq_save(flags);
10680 -       if (!spin_trylock(&priv->tx_lock)) {
10681 -               local_irq_restore(flags);
10682 -               DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
10683 -                   BDX_DRV_NAME, ndev->name);
10684 -               return NETDEV_TX_LOCKED;
10685 -       }
10686 +
10687 +       spin_lock_irqsave(&priv->tx_lock, flags);
10688  
10689         /* build tx descriptor */
10690         BDX_ASSERT(f->m.wptr >= f->m.memsz);    /* started with valid wptr */
10691 diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
10692 index e7034c55e796..2e4ee0f912bf 100644
10693 --- a/drivers/net/rionet.c
10694 +++ b/drivers/net/rionet.c
10695 @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
10696         unsigned long flags;
10697         int add_num = 1;
10698  
10699 -       local_irq_save(flags);
10700 -       if (!spin_trylock(&rnet->tx_lock)) {
10701 -               local_irq_restore(flags);
10702 -               return NETDEV_TX_LOCKED;
10703 -       }
10704 +       spin_lock_irqsave(&rnet->tx_lock, flags);
10705  
10706         if (is_multicast_ether_addr(eth->h_dest))
10707                 add_num = nets[rnet->mport->id].nact;
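
The atl1c, atl1e, cxgb, s2io, pch_gbe, tehuti and rionet hunks above all apply the same conversion: drop the trylock-and-return-NETDEV_TX_LOCKED pattern and take the TX lock unconditionally. A condensed before/after sketch for a hypothetical driver (foo_priv and foo_xmit_* are illustrative names only, not part of this patch):

#include <linux/netdevice.h>
#include <linux/spinlock.h>

struct foo_priv {                       /* hypothetical driver state */
        spinlock_t tx_lock;
};

/* Before: contention is reported back to the core as NETDEV_TX_LOCKED. */
static netdev_tx_t foo_xmit_old(struct sk_buff *skb, struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);
        unsigned long flags;

        if (!spin_trylock_irqsave(&priv->tx_lock, flags))
                return NETDEV_TX_LOCKED;        /* upper layer must retry */
        /* ... queue the descriptor ... */
        spin_unlock_irqrestore(&priv->tx_lock, flags);
        return NETDEV_TX_OK;
}

/* After: simply take the lock; on RT a contended spinlock may sleep. */
static netdev_tx_t foo_xmit_new(struct sk_buff *skb, struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);
        unsigned long flags;

        spin_lock_irqsave(&priv->tx_lock, flags);
        /* ... queue the descriptor ... */
        spin_unlock_irqrestore(&priv->tx_lock, flags);
        return NETDEV_TX_OK;
}
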
10708 diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c
10709 index f2cd513d54b2..6c0f4c9638a2 100644
10710 --- a/drivers/net/wireless/orinoco/orinoco_usb.c
10711 +++ b/drivers/net/wireless/orinoco/orinoco_usb.c
10712 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
10713                         while (!ctx->done.done && msecs--)
10714                                 udelay(1000);
10715                 } else {
10716 -                       wait_event_interruptible(ctx->done.wait,
10717 +                       swait_event_interruptible(ctx->done.wait,
10718                                                  ctx->done.done);
10719                 }
10720                 break;
10721 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
10722 index 59ac36fe7c42..7a45a20af78a 100644
10723 --- a/drivers/pci/access.c
10724 +++ b/drivers/pci/access.c
10725 @@ -561,7 +561,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
10726         WARN_ON(!dev->block_cfg_access);
10727  
10728         dev->block_cfg_access = 0;
10729 -       wake_up_all(&pci_cfg_wait);
10730 +       wake_up_all_locked(&pci_cfg_wait);
10731         raw_spin_unlock_irqrestore(&pci_lock, flags);
10732  }
10733  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
10734 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10735 index f4424063b860..cbbbebd86c6e 100644
10736 --- a/drivers/scsi/fcoe/fcoe.c
10737 +++ b/drivers/scsi/fcoe/fcoe.c
10738 @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10739         struct sk_buff *skb;
10740  #ifdef CONFIG_SMP
10741         struct fcoe_percpu_s *p0;
10742 -       unsigned targ_cpu = get_cpu();
10743 +       unsigned targ_cpu = get_cpu_light();
10744  #endif /* CONFIG_SMP */
10745  
10746         FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
10747 @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10748                         kfree_skb(skb);
10749                 spin_unlock_bh(&p->fcoe_rx_list.lock);
10750         }
10751 -       put_cpu();
10752 +       put_cpu_light();
10753  #else
10754         /*
10755          * This a non-SMP scenario where the singular Rx thread is
10756 @@ -1566,11 +1566,11 @@ err2:
10757  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10758  {
10759         struct fcoe_percpu_s *fps;
10760 -       int rc;
10761 +       int rc, cpu = get_cpu_light();
10762  
10763 -       fps = &get_cpu_var(fcoe_percpu);
10764 +       fps = &per_cpu(fcoe_percpu, cpu);
10765         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10766 -       put_cpu_var(fcoe_percpu);
10767 +       put_cpu_light();
10768  
10769         return rc;
10770  }
10771 @@ -1766,11 +1766,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10772                 return 0;
10773         }
10774  
10775 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10776 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10777         stats->InvalidCRCCount++;
10778         if (stats->InvalidCRCCount < 5)
10779                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10780 -       put_cpu();
10781 +       put_cpu_light();
10782         return -EINVAL;
10783  }
10784  
10785 @@ -1814,7 +1814,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10786          */
10787         hp = (struct fcoe_hdr *) skb_network_header(skb);
10788  
10789 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10790 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10791         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10792                 if (stats->ErrorFrames < 5)
10793                         printk(KERN_WARNING "fcoe: FCoE version "
10794 @@ -1846,13 +1846,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10795                 goto drop;
10796  
10797         if (!fcoe_filter_frames(lport, fp)) {
10798 -               put_cpu();
10799 +               put_cpu_light();
10800                 fc_exch_recv(lport, fp);
10801                 return;
10802         }
10803  drop:
10804         stats->ErrorFrames++;
10805 -       put_cpu();
10806 +       put_cpu_light();
10807         kfree_skb(skb);
10808  }
10809  
10810 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10811 index 34a1b1f333b4..d91131210695 100644
10812 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
10813 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10814 @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10815  
10816         INIT_LIST_HEAD(&del_list);
10817  
10818 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10819 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10820  
10821         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10822                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10823 @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10824                                 sel_time = fcf->time;
10825                 }
10826         }
10827 -       put_cpu();
10828 +       put_cpu_light();
10829  
10830         list_for_each_entry_safe(fcf, next, &del_list, list) {
10831                 /* Removes fcf from current list */
10832 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10833 index 30f9ef0c0d4f..6c686bc01a82 100644
10834 --- a/drivers/scsi/libfc/fc_exch.c
10835 +++ b/drivers/scsi/libfc/fc_exch.c
10836 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10837         }
10838         memset(ep, 0, sizeof(*ep));
10839  
10840 -       cpu = get_cpu();
10841 +       cpu = get_cpu_light();
10842         pool = per_cpu_ptr(mp->pool, cpu);
10843         spin_lock_bh(&pool->lock);
10844 -       put_cpu();
10845 +       put_cpu_light();
10846  
10847         /* peek cache of free slot */
10848         if (pool->left != FC_XID_UNKNOWN) {
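
The fcoe, fcoe_ctlr and fc_exch hunks above replace get_cpu()/put_cpu() with get_cpu_light()/put_cpu_light(), which the RT series defines elsewhere (not in this excerpt). As a sketch of the assumed semantics: on mainline they degenerate to get_cpu()/put_cpu(); on PREEMPT_RT_FULL they only disable migration, so the per-CPU section stays preemptible.

/*
 * Illustrative definitions for the helpers used in the fcoe/libfc
 * hunks above; the authoritative definitions are added elsewhere in
 * this series.
 */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()        migrate_enable()
#else
# define get_cpu_light()        get_cpu()
# define put_cpu_light()        put_cpu()
#endif
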
10849 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10850 index 9c706d8c1441..d968ffc79c08 100644
10851 --- a/drivers/scsi/libsas/sas_ata.c
10852 +++ b/drivers/scsi/libsas/sas_ata.c
10853 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10854         /* TODO: audit callers to ensure they are ready for qc_issue to
10855          * unconditionally re-enable interrupts
10856          */
10857 -       local_irq_save(flags);
10858 +       local_irq_save_nort(flags);
10859         spin_unlock(ap->lock);
10860  
10861         /* If the device fell off, no sense in issuing commands */
10862 @@ -255,7 +255,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10863  
10864   out:
10865         spin_lock(ap->lock);
10866 -       local_irq_restore(flags);
10867 +       local_irq_restore_nort(flags);
10868         return ret;
10869  }
10870  
10871 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10872 index fee9eb7c8a60..b42d4adc42dc 100644
10873 --- a/drivers/scsi/qla2xxx/qla_inline.h
10874 +++ b/drivers/scsi/qla2xxx/qla_inline.h
10875 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10876  {
10877         unsigned long flags;
10878         struct qla_hw_data *ha = rsp->hw;
10879 -       local_irq_save(flags);
10880 +       local_irq_save_nort(flags);
10881         if (IS_P3P_TYPE(ha))
10882                 qla82xx_poll(0, rsp);
10883         else
10884                 ha->isp_ops->intr_handler(0, rsp);
10885 -       local_irq_restore(flags);
10886 +       local_irq_restore_nort(flags);
10887  }
10888  
10889  static inline uint8_t *
10890 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10891 index 7fc919f7da4d..e03fa17b8670 100644
10892 --- a/drivers/thermal/x86_pkg_temp_thermal.c
10893 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
10894 @@ -29,6 +29,7 @@
10895  #include <linux/pm.h>
10896  #include <linux/thermal.h>
10897  #include <linux/debugfs.h>
10898 +#include <linux/swork.h>
10899  #include <asm/cpu_device_id.h>
10900  #include <asm/mce.h>
10901  
10902 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
10903         }
10904  }
10905  
10906 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10907 +static void platform_thermal_notify_work(struct swork_event *event)
10908  {
10909         unsigned long flags;
10910         int cpu = smp_processor_id();
10911 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10912                         pkg_work_scheduled[phy_id]) {
10913                 disable_pkg_thres_interrupt();
10914                 spin_unlock_irqrestore(&pkg_work_lock, flags);
10915 -               return -EINVAL;
10916 +               return;
10917         }
10918         pkg_work_scheduled[phy_id] = 1;
10919         spin_unlock_irqrestore(&pkg_work_lock, flags);
10920 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10921         schedule_delayed_work_on(cpu,
10922                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
10923                                 msecs_to_jiffies(notify_delay_ms));
10924 +}
10925 +
10926 +#ifdef CONFIG_PREEMPT_RT_FULL
10927 +static struct swork_event notify_work;
10928 +
10929 +static int thermal_notify_work_init(void)
10930 +{
10931 +       int err;
10932 +
10933 +       err = swork_get();
10934 +       if (err)
10935 +               return err;
10936 +
10937 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
10938         return 0;
10939  }
10940  
10941 +static void thermal_notify_work_cleanup(void)
10942 +{
10943 +       swork_put();
10944 +}
10945 +
10946 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10947 +{
10948 +       swork_queue(&notify_work);
10949 +       return 0;
10950 +}
10951 +
10952 +#else  /* !CONFIG_PREEMPT_RT_FULL */
10953 +
10954 +static int thermal_notify_work_init(void) { return 0; }
10955 +
10956 +static void thermal_notify_work_cleanup(void) {  }
10957 +
10958 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10959 +{
10960 +       platform_thermal_notify_work(NULL);
10961 +
10962 +       return 0;
10963 +}
10964 +#endif /* CONFIG_PREEMPT_RT_FULL */
10965 +
10966  static int find_siblings_cpu(int cpu)
10967  {
10968         int i;
10969 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
10970         if (!x86_match_cpu(pkg_temp_thermal_ids))
10971                 return -ENODEV;
10972  
10973 +       if (!thermal_notify_work_init())
10974 +               return -ENODEV;
10975 +
10976         spin_lock_init(&pkg_work_lock);
10977         platform_thermal_package_notify =
10978                         pkg_temp_thermal_platform_thermal_notify;
10979 @@ -608,7 +651,7 @@ err_ret:
10980         kfree(pkg_work_scheduled);
10981         platform_thermal_package_notify = NULL;
10982         platform_thermal_package_rate_control = NULL;
10983 -
10984 +       thermal_notify_work_cleanup();
10985         return -ENODEV;
10986  }
10987  
10988 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
10989         mutex_unlock(&phy_dev_list_mutex);
10990         platform_thermal_package_notify = NULL;
10991         platform_thermal_package_rate_control = NULL;
10992 +       thermal_notify_work_cleanup();
10993         for_each_online_cpu(i)
10994                 cancel_delayed_work_sync(
10995                         &per_cpu(pkg_temp_thermal_threshold_work, i));
10996 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
10997 index 39126460c1f5..af7701ca4d48 100644
10998 --- a/drivers/tty/serial/8250/8250_core.c
10999 +++ b/drivers/tty/serial/8250/8250_core.c
11000 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
11001  
11002  static unsigned int skip_txen_test; /* force skip of txen test at init time */
11003  
11004 -#define PASS_LIMIT     512
11005 +/*
11006 + * On -rt we can have more delays, and legitimately
11007 + * so - so don't drop work spuriously and spam the
11008 + * syslog:
11009 + */
11010 +#ifdef CONFIG_PREEMPT_RT_FULL
11011 +# define PASS_LIMIT    1000000
11012 +#else
11013 +# define PASS_LIMIT    512
11014 +#endif
11015  
11016  #include <asm/serial.h>
11017  /*
11018 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
11019 index 56ccbcefdd85..a0b9e854672c 100644
11020 --- a/drivers/tty/serial/8250/8250_port.c
11021 +++ b/drivers/tty/serial/8250/8250_port.c
11022 @@ -35,6 +35,7 @@
11023  #include <linux/nmi.h>
11024  #include <linux/mutex.h>
11025  #include <linux/slab.h>
11026 +#include <linux/kdb.h>
11027  #include <linux/uaccess.h>
11028  #include <linux/pm_runtime.h>
11029  
11030 @@ -2843,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
11031  
11032         serial8250_rpm_get(up);
11033  
11034 -       if (port->sysrq)
11035 +       if (port->sysrq || oops_in_progress)
11036                 locked = 0;
11037 -       else if (oops_in_progress)
11038 +       else if (in_kdb_printk())
11039                 locked = spin_trylock_irqsave(&port->lock, flags);
11040         else
11041                 spin_lock_irqsave(&port->lock, flags);
11042 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
11043 index 899a77187bde..3ff6363b3751 100644
11044 --- a/drivers/tty/serial/amba-pl011.c
11045 +++ b/drivers/tty/serial/amba-pl011.c
11046 @@ -2067,13 +2067,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11047  
11048         clk_enable(uap->clk);
11049  
11050 -       local_irq_save(flags);
11051 +       /*
11052 +        * local_irq_save(flags);
11053 +        *
11054 +        * This local_irq_save() is nonsense. If we come in via sysrq
11055 +        * handling then interrupts are already disabled. Aside from
11056 +        * that the port.sysrq check is racy on SMP regardless.
11057 +       */
11058         if (uap->port.sysrq)
11059                 locked = 0;
11060         else if (oops_in_progress)
11061 -               locked = spin_trylock(&uap->port.lock);
11062 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
11063         else
11064 -               spin_lock(&uap->port.lock);
11065 +               spin_lock_irqsave(&uap->port.lock, flags);
11066  
11067         /*
11068          *      First save the CR then disable the interrupts
11069 @@ -2098,8 +2104,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11070                 writew(old_cr, uap->port.membase + UART011_CR);
11071  
11072         if (locked)
11073 -               spin_unlock(&uap->port.lock);
11074 -       local_irq_restore(flags);
11075 +               spin_unlock_irqrestore(&uap->port.lock, flags);
11076  
11077         clk_disable(uap->clk);
11078  }
11079 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
11080 index 24280d9a05e9..9745fb8b7abb 100644
11081 --- a/drivers/tty/serial/omap-serial.c
11082 +++ b/drivers/tty/serial/omap-serial.c
11083 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
11084  
11085         pm_runtime_get_sync(up->dev);
11086  
11087 -       local_irq_save(flags);
11088 -       if (up->port.sysrq)
11089 -               locked = 0;
11090 -       else if (oops_in_progress)
11091 -               locked = spin_trylock(&up->port.lock);
11092 +       if (up->port.sysrq || oops_in_progress)
11093 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
11094         else
11095 -               spin_lock(&up->port.lock);
11096 +               spin_lock_irqsave(&up->port.lock, flags);
11097  
11098         /*
11099          * First save the IER then disable the interrupts
11100 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
11101         pm_runtime_mark_last_busy(up->dev);
11102         pm_runtime_put_autosuspend(up->dev);
11103         if (locked)
11104 -               spin_unlock(&up->port.lock);
11105 -       local_irq_restore(flags);
11106 +               spin_unlock_irqrestore(&up->port.lock, flags);
11107  }
11108  
11109  static int __init
11110 diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
11111 index edb5305b9d4d..7d5ee8a13ac6 100644
11112 --- a/drivers/tty/serial/sc16is7xx.c
11113 +++ b/drivers/tty/serial/sc16is7xx.c
11114 @@ -1230,7 +1230,7 @@ static int sc16is7xx_probe(struct device *dev,
11115  
11116         /* Setup interrupt */
11117         ret = devm_request_irq(dev, irq, sc16is7xx_irq,
11118 -                              IRQF_ONESHOT | flags, dev_name(dev), s);
11119 +                              flags, dev_name(dev), s);
11120         if (!ret)
11121                 return 0;
11122  
11123 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
11124 index f44ce09367bc..5fc9a14721bd 100644
11125 --- a/drivers/usb/core/hcd.c
11126 +++ b/drivers/usb/core/hcd.c
11127 @@ -1735,9 +1735,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
11128          * and no one may trigger the above deadlock situation when
11129          * running complete() in tasklet.
11130          */
11131 -       local_irq_save(flags);
11132 +       local_irq_save_nort(flags);
11133         urb->complete(urb);
11134 -       local_irq_restore(flags);
11135 +       local_irq_restore_nort(flags);
11136  
11137         usb_anchor_resume_wakeups(anchor);
11138         atomic_dec(&urb->use_count);
11139 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
11140 index 803c503a2e3d..8dd2720aab64 100644
11141 --- a/drivers/usb/gadget/function/f_fs.c
11142 +++ b/drivers/usb/gadget/function/f_fs.c
11143 @@ -1404,7 +1404,7 @@ static void ffs_data_put(struct ffs_data *ffs)
11144                 pr_info("%s(): freeing\n", __func__);
11145                 ffs_data_clear(ffs);
11146                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
11147 -                      waitqueue_active(&ffs->ep0req_completion.wait));
11148 +                      swait_active(&ffs->ep0req_completion.wait));
11149                 kfree(ffs->dev_name);
11150                 kfree(ffs);
11151         }
11152 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
11153 index e57f48f9528f..7544a54056e4 100644
11154 --- a/drivers/usb/gadget/legacy/inode.c
11155 +++ b/drivers/usb/gadget/legacy/inode.c
11156 @@ -345,7 +345,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11157         spin_unlock_irq (&epdata->dev->lock);
11158  
11159         if (likely (value == 0)) {
11160 -               value = wait_event_interruptible (done.wait, done.done);
11161 +               value = swait_event_interruptible (done.wait, done.done);
11162                 if (value != 0) {
11163                         spin_lock_irq (&epdata->dev->lock);
11164                         if (likely (epdata->ep != NULL)) {
11165 @@ -354,7 +354,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11166                                 usb_ep_dequeue (epdata->ep, epdata->req);
11167                                 spin_unlock_irq (&epdata->dev->lock);
11168  
11169 -                               wait_event (done.wait, done.done);
11170 +                               swait_event (done.wait, done.done);
11171                                 if (epdata->status == -ECONNRESET)
11172                                         epdata->status = -EINTR;
11173                         } else {
11174 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
11175 index f92f5aff0dd5..f9bba26e3655 100644
11176 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c
11177 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
11178 @@ -17,7 +17,9 @@
11179  #include <linux/device.h>
11180  #include <linux/dma-mapping.h>
11181  #include <linux/list.h>
11182 +#include <linux/mfd/syscon.h>
11183  #include <linux/platform_device.h>
11184 +#include <linux/regmap.h>
11185  #include <linux/usb/ch9.h>
11186  #include <linux/usb/gadget.h>
11187  #include <linux/usb/atmel_usba_udc.h>
11188 @@ -1888,20 +1890,15 @@ static int atmel_usba_stop(struct usb_gadget *gadget)
11189  #ifdef CONFIG_OF
11190  static void at91sam9rl_toggle_bias(struct usba_udc *udc, int is_on)
11191  {
11192 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11193 -
11194 -       if (is_on)
11195 -               at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11196 -       else
11197 -               at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11198 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11199 +                          is_on ? AT91_PMC_BIASEN : 0);
11200  }
11201  
11202  static void at91sam9g45_pulse_bias(struct usba_udc *udc)
11203  {
11204 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11205 -
11206 -       at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11207 -       at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11208 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN, 0);
11209 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11210 +                          AT91_PMC_BIASEN);
11211  }
11212  
11213  static const struct usba_udc_errata at91sam9rl_errata = {
11214 @@ -1938,6 +1935,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev,
11215                 return ERR_PTR(-EINVAL);
11216  
11217         udc->errata = match->data;
11218 +       udc->pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
11219 +       if (udc->errata && IS_ERR(udc->pmc))
11220 +               return ERR_CAST(udc->pmc);
11221  
11222         udc->num_ep = 0;
11223  
11224 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h
11225 index ea448a344767..3e1c9d589dfa 100644
11226 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h
11227 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h
11228 @@ -354,6 +354,8 @@ struct usba_udc {
11229         struct dentry *debugfs_root;
11230         struct dentry *debugfs_regs;
11231  #endif
11232 +
11233 +       struct regmap *pmc;
11234  };
11235  
11236  static inline struct usba_ep *to_usba_ep(struct usb_ep *ep)
11237 diff --git a/fs/aio.c b/fs/aio.c
11238 index 155f84253f33..dd8d6f234a0b 100644
11239 --- a/fs/aio.c
11240 +++ b/fs/aio.c
11241 @@ -40,6 +40,7 @@
11242  #include <linux/ramfs.h>
11243  #include <linux/percpu-refcount.h>
11244  #include <linux/mount.h>
11245 +#include <linux/swork.h>
11246  
11247  #include <asm/kmap_types.h>
11248  #include <asm/uaccess.h>
11249 @@ -115,7 +116,7 @@ struct kioctx {
11250         struct page             **ring_pages;
11251         long                    nr_pages;
11252  
11253 -       struct work_struct      free_work;
11254 +       struct swork_event      free_work;
11255  
11256         /*
11257          * signals when all in-flight requests are done
11258 @@ -253,6 +254,7 @@ static int __init aio_setup(void)
11259                 .mount          = aio_mount,
11260                 .kill_sb        = kill_anon_super,
11261         };
11262 +       BUG_ON(swork_get());
11263         aio_mnt = kern_mount(&aio_fs);
11264         if (IS_ERR(aio_mnt))
11265                 panic("Failed to create aio fs mount.");
11266 @@ -568,9 +570,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
11267         return cancel(&kiocb->common);
11268  }
11269  
11270 -static void free_ioctx(struct work_struct *work)
11271 +static void free_ioctx(struct swork_event *sev)
11272  {
11273 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
11274 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11275  
11276         pr_debug("freeing %p\n", ctx);
11277  
11278 @@ -589,8 +591,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11279         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
11280                 complete(&ctx->rq_wait->comp);
11281  
11282 -       INIT_WORK(&ctx->free_work, free_ioctx);
11283 -       schedule_work(&ctx->free_work);
11284 +       INIT_SWORK(&ctx->free_work, free_ioctx);
11285 +       swork_queue(&ctx->free_work);
11286  }
11287  
11288  /*
11289 @@ -598,9 +600,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11290   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
11291   * now it's safe to cancel any that need to be.
11292   */
11293 -static void free_ioctx_users(struct percpu_ref *ref)
11294 +static void free_ioctx_users_work(struct swork_event *sev)
11295  {
11296 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11297 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11298         struct aio_kiocb *req;
11299  
11300         spin_lock_irq(&ctx->ctx_lock);
11301 @@ -619,6 +621,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
11302         percpu_ref_put(&ctx->reqs);
11303  }
11304  
11305 +static void free_ioctx_users(struct percpu_ref *ref)
11306 +{
11307 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11308 +
11309 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
11310 +       swork_queue(&ctx->free_work);
11311 +}
11312 +
11313  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
11314  {
11315         unsigned i, new_nr;
11316 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
11317 index 502d3892d8a4..05af8d3e6e88 100644
11318 --- a/fs/autofs4/autofs_i.h
11319 +++ b/fs/autofs4/autofs_i.h
11320 @@ -34,6 +34,7 @@
11321  #include <linux/sched.h>
11322  #include <linux/mount.h>
11323  #include <linux/namei.h>
11324 +#include <linux/delay.h>
11325  #include <asm/current.h>
11326  #include <asm/uaccess.h>
11327  
11328 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
11329 index 7a5a598a2d94..d08bcdc30566 100644
11330 --- a/fs/autofs4/expire.c
11331 +++ b/fs/autofs4/expire.c
11332 @@ -150,7 +150,7 @@ again:
11333                         parent = p->d_parent;
11334                         if (!spin_trylock(&parent->d_lock)) {
11335                                 spin_unlock(&p->d_lock);
11336 -                               cpu_relax();
11337 +                               cpu_chill();
11338                                 goto relock;
11339                         }
11340                         spin_unlock(&p->d_lock);
11341 diff --git a/fs/buffer.c b/fs/buffer.c
11342 index 4f4cd959da7c..72b27e17b907 100644
11343 --- a/fs/buffer.c
11344 +++ b/fs/buffer.c
11345 @@ -305,8 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11346          * decide that the page is now completely done.
11347          */
11348         first = page_buffers(page);
11349 -       local_irq_save(flags);
11350 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11351 +       flags = bh_uptodate_lock_irqsave(first);
11352         clear_buffer_async_read(bh);
11353         unlock_buffer(bh);
11354         tmp = bh;
11355 @@ -319,8 +318,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11356                 }
11357                 tmp = tmp->b_this_page;
11358         } while (tmp != bh);
11359 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11360 -       local_irq_restore(flags);
11361 +       bh_uptodate_unlock_irqrestore(first, flags);
11362  
11363         /*
11364          * If none of the buffers had errors and they are all
11365 @@ -332,9 +330,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11366         return;
11367  
11368  still_busy:
11369 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11370 -       local_irq_restore(flags);
11371 -       return;
11372 +       bh_uptodate_unlock_irqrestore(first, flags);
11373  }
11374  
11375  /*
11376 @@ -362,8 +358,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11377         }
11378  
11379         first = page_buffers(page);
11380 -       local_irq_save(flags);
11381 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11382 +       flags = bh_uptodate_lock_irqsave(first);
11383  
11384         clear_buffer_async_write(bh);
11385         unlock_buffer(bh);
11386 @@ -375,15 +370,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11387                 }
11388                 tmp = tmp->b_this_page;
11389         }
11390 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11391 -       local_irq_restore(flags);
11392 +       bh_uptodate_unlock_irqrestore(first, flags);
11393         end_page_writeback(page);
11394         return;
11395  
11396  still_busy:
11397 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11398 -       local_irq_restore(flags);
11399 -       return;
11400 +       bh_uptodate_unlock_irqrestore(first, flags);
11401  }
11402  EXPORT_SYMBOL(end_buffer_async_write);
11403  
11404 @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
11405         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
11406         if (ret) {
11407                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
11408 +               buffer_head_init_locks(ret);
11409                 preempt_disable();
11410                 __this_cpu_inc(bh_accounting.nr);
11411                 recalc_bh_state();
11412 diff --git a/fs/dcache.c b/fs/dcache.c
11413 index 71b6056ad35d..e80471cbfc19 100644
11414 --- a/fs/dcache.c
11415 +++ b/fs/dcache.c
11416 @@ -19,6 +19,7 @@
11417  #include <linux/mm.h>
11418  #include <linux/fs.h>
11419  #include <linux/fsnotify.h>
11420 +#include <linux/delay.h>
11421  #include <linux/slab.h>
11422  #include <linux/init.h>
11423  #include <linux/hash.h>
11424 @@ -747,6 +748,8 @@ static inline bool fast_dput(struct dentry *dentry)
11425   */
11426  void dput(struct dentry *dentry)
11427  {
11428 +       struct dentry *parent;
11429 +
11430         if (unlikely(!dentry))
11431                 return;
11432  
11433 @@ -783,9 +786,18 @@ repeat:
11434         return;
11435  
11436  kill_it:
11437 -       dentry = dentry_kill(dentry);
11438 -       if (dentry) {
11439 -               cond_resched();
11440 +       parent = dentry_kill(dentry);
11441 +       if (parent) {
11442 +               int r;
11443 +
11444 +               if (parent == dentry) {
11445 +                       /* the task with the highest priority won't schedule */
11446 +                       r = cond_resched();
11447 +                       if (!r)
11448 +                               cpu_chill();
11449 +               } else {
11450 +                       dentry = parent;
11451 +               }
11452                 goto repeat;
11453         }
11454  }
11455 @@ -2394,7 +2406,7 @@ again:
11456         if (dentry->d_lockref.count == 1) {
11457                 if (!spin_trylock(&inode->i_lock)) {
11458                         spin_unlock(&dentry->d_lock);
11459 -                       cpu_relax();
11460 +                       cpu_chill();
11461                         goto again;
11462                 }
11463                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
11464 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
11465 index 1e009cad8d5c..d0c12504d3b4 100644
11466 --- a/fs/eventpoll.c
11467 +++ b/fs/eventpoll.c
11468 @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
11469   */
11470  static void ep_poll_safewake(wait_queue_head_t *wq)
11471  {
11472 -       int this_cpu = get_cpu();
11473 +       int this_cpu = get_cpu_light();
11474  
11475         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
11476                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
11477  
11478 -       put_cpu();
11479 +       put_cpu_light();
11480  }
11481  
11482  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
11483 diff --git a/fs/exec.c b/fs/exec.c
11484 index b06623a9347f..e7760b7b692c 100644
11485 --- a/fs/exec.c
11486 +++ b/fs/exec.c
11487 @@ -865,12 +865,14 @@ static int exec_mmap(struct mm_struct *mm)
11488                 }
11489         }
11490         task_lock(tsk);
11491 +       preempt_disable_rt();
11492         active_mm = tsk->active_mm;
11493         tsk->mm = mm;
11494         tsk->active_mm = mm;
11495         activate_mm(active_mm, mm);
11496         tsk->mm->vmacache_seqnum = 0;
11497         vmacache_flush(tsk);
11498 +       preempt_enable_rt();
11499         task_unlock(tsk);
11500         if (old_mm) {
11501                 up_read(&old_mm->mmap_sem);
11502 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
11503 index 9db5500d63d9..5951c495d124 100644
11504 --- a/fs/f2fs/f2fs.h
11505 +++ b/fs/f2fs/f2fs.h
11506 @@ -24,7 +24,6 @@
11507  
11508  #ifdef CONFIG_F2FS_CHECK_FS
11509  #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
11510 -#define f2fs_down_write(x, y)  down_write_nest_lock(x, y)
11511  #else
11512  #define f2fs_bug_on(sbi, condition)                                    \
11513         do {                                                            \
11514 @@ -33,7 +32,6 @@
11515                         set_sbi_flag(sbi, SBI_NEED_FSCK);               \
11516                 }                                                       \
11517         } while (0)
11518 -#define f2fs_down_write(x, y)  down_write(x)
11519  #endif
11520  
11521  /*
11522 @@ -959,7 +957,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
11523  
11524  static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
11525  {
11526 -       f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
11527 +       down_write(&sbi->cp_rwsem);
11528  }
11529  
11530  static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
11531 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
11532 index 684996c8a3a4..6e18a06aaabe 100644
11533 --- a/fs/jbd2/checkpoint.c
11534 +++ b/fs/jbd2/checkpoint.c
11535 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
11536         nblocks = jbd2_space_needed(journal);
11537         while (jbd2_log_space_left(journal) < nblocks) {
11538                 write_unlock(&journal->j_state_lock);
11539 +               if (current->plug)
11540 +                       io_schedule();
11541                 mutex_lock(&journal->j_checkpoint_mutex);
11542  
11543                 /*
11544 diff --git a/fs/namespace.c b/fs/namespace.c
11545 index 5be02a0635be..1f3725bbd04b 100644
11546 --- a/fs/namespace.c
11547 +++ b/fs/namespace.c
11548 @@ -14,6 +14,7 @@
11549  #include <linux/mnt_namespace.h>
11550  #include <linux/user_namespace.h>
11551  #include <linux/namei.h>
11552 +#include <linux/delay.h>
11553  #include <linux/security.h>
11554  #include <linux/idr.h>
11555  #include <linux/init.h>                /* init_rootfs */
11556 @@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
11557          * incremented count after it has set MNT_WRITE_HOLD.
11558          */
11559         smp_mb();
11560 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11561 -               cpu_relax();
11562 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11563 +               preempt_enable();
11564 +               cpu_chill();
11565 +               preempt_disable();
11566 +       }
11567         /*
11568          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11569          * be set to match its requirements. So we must not load that until
11570 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11571 index 7521e11db728..f0de4b6b8bf3 100644
11572 --- a/fs/ntfs/aops.c
11573 +++ b/fs/ntfs/aops.c
11574 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11575                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11576         }
11577         first = page_buffers(page);
11578 -       local_irq_save(flags);
11579 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11580 +       flags = bh_uptodate_lock_irqsave(first);
11581         clear_buffer_async_read(bh);
11582         unlock_buffer(bh);
11583         tmp = bh;
11584 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11585                 }
11586                 tmp = tmp->b_this_page;
11587         } while (tmp != bh);
11588 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11589 -       local_irq_restore(flags);
11590 +       bh_uptodate_unlock_irqrestore(first, flags);
11591         /*
11592          * If none of the buffers had errors then we can set the page uptodate,
11593          * but we first have to perform the post read mst fixups, if the
11594 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11595                 recs = PAGE_CACHE_SIZE / rec_size;
11596                 /* Should have been verified before we got here... */
11597                 BUG_ON(!recs);
11598 -               local_irq_save(flags);
11599 +               local_irq_save_nort(flags);
11600                 kaddr = kmap_atomic(page);
11601                 for (i = 0; i < recs; i++)
11602                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11603                                         i * rec_size), rec_size);
11604                 kunmap_atomic(kaddr);
11605 -               local_irq_restore(flags);
11606 +               local_irq_restore_nort(flags);
11607                 flush_dcache_page(page);
11608                 if (likely(page_uptodate && !PageError(page)))
11609                         SetPageUptodate(page);
11610 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11611         unlock_page(page);
11612         return;
11613  still_busy:
11614 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11615 -       local_irq_restore(flags);
11616 -       return;
11617 +       bh_uptodate_unlock_irqrestore(first, flags);
11618  }
11619  
11620  /**
11621 diff --git a/fs/timerfd.c b/fs/timerfd.c
11622 index 053818dd6c18..c4bc14fe0085 100644
11623 --- a/fs/timerfd.c
11624 +++ b/fs/timerfd.c
11625 @@ -450,7 +450,10 @@ static int do_timerfd_settime(int ufd, int flags,
11626                                 break;
11627                 }
11628                 spin_unlock_irq(&ctx->wqh.lock);
11629 -               cpu_relax();
11630 +               if (isalarm(ctx))
11631 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11632 +               else
11633 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11634         }
11635  
11636         /*
11637 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11638 index 323e5daece54..cc5fbd534fd4 100644
11639 --- a/include/acpi/platform/aclinux.h
11640 +++ b/include/acpi/platform/aclinux.h
11641 @@ -127,6 +127,7 @@
11642  
11643  #define acpi_cache_t                        struct kmem_cache
11644  #define acpi_spinlock                       spinlock_t *
11645 +#define acpi_raw_spinlock              raw_spinlock_t *
11646  #define acpi_cpu_flags                      unsigned long
11647  
11648  /* Use native linux version of acpi_os_allocate_zeroed */
11649 @@ -145,6 +146,20 @@
11650  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11651  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11652  
11653 +#define acpi_os_create_raw_lock(__handle)                      \
11654 +({                                                             \
11655 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11656 +                                                               \
11657 +        if (lock) {                                            \
11658 +               *(__handle) = lock;                             \
11659 +               raw_spin_lock_init(*(__handle));                \
11660 +        }                                                      \
11661 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11662 + })
11663 +
11664 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
11665 +
11666 +
11667  /*
11668   * OSL interfaces used by debugger/disassembler
11669   */
11670 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11671 index 630dd2372238..850e4d993a88 100644
11672 --- a/include/asm-generic/bug.h
11673 +++ b/include/asm-generic/bug.h
11674 @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line);
11675  # define WARN_ON_SMP(x)                        ({0;})
11676  #endif
11677  
11678 +#ifdef CONFIG_PREEMPT_RT_BASE
11679 +# define BUG_ON_RT(c)                  BUG_ON(c)
11680 +# define BUG_ON_NONRT(c)               do { } while (0)
11681 +# define WARN_ON_RT(condition)         WARN_ON(condition)
11682 +# define WARN_ON_NONRT(condition)      do { } while (0)
11683 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11684 +#else
11685 +# define BUG_ON_RT(c)                  do { } while (0)
11686 +# define BUG_ON_NONRT(c)               BUG_ON(c)
11687 +# define WARN_ON_RT(condition)         do { } while (0)
11688 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
11689 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11690 +#endif
11691 +
11692  #endif /* __ASSEMBLY__ */
11693  
11694  #endif
11695 diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
11696 index 5d8ffa3e6f8c..c1cde3577551 100644
11697 --- a/include/asm-generic/preempt.h
11698 +++ b/include/asm-generic/preempt.h
11699 @@ -7,10 +7,10 @@
11700  
11701  static __always_inline int preempt_count(void)
11702  {
11703 -       return current_thread_info()->preempt_count;
11704 +       return READ_ONCE(current_thread_info()->preempt_count);
11705  }
11706  
11707 -static __always_inline int *preempt_count_ptr(void)
11708 +static __always_inline volatile int *preempt_count_ptr(void)
11709  {
11710         return &current_thread_info()->preempt_count;
11711  }
11712 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11713 index daf17d70aeca..463df8954255 100644
11714 --- a/include/linux/blk-mq.h
11715 +++ b/include/linux/blk-mq.h
11716 @@ -212,6 +212,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11717  
11718  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
11719  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
11720 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11721  
11722  int blk_mq_request_started(struct request *rq);
11723  void blk_mq_start_request(struct request *rq);
11724 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11725 index fe14382f9664..a82143ad6702 100644
11726 --- a/include/linux/blkdev.h
11727 +++ b/include/linux/blkdev.h
11728 @@ -89,6 +89,7 @@ struct request {
11729         struct list_head queuelist;
11730         union {
11731                 struct call_single_data csd;
11732 +               struct work_struct work;
11733                 unsigned long fifo_time;
11734         };
11735  
11736 @@ -455,7 +456,7 @@ struct request_queue {
11737         struct throtl_data *td;
11738  #endif
11739         struct rcu_head         rcu_head;
11740 -       wait_queue_head_t       mq_freeze_wq;
11741 +       struct swait_queue_head mq_freeze_wq;
11742         struct percpu_ref       q_usage_counter;
11743         struct list_head        all_q_node;
11744  
11745 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
11746 index 8fdcb783197d..d07dbeec7bc1 100644
11747 --- a/include/linux/bottom_half.h
11748 +++ b/include/linux/bottom_half.h
11749 @@ -3,6 +3,39 @@
11750  
11751  #include <linux/preempt.h>
11752  
11753 +#ifdef CONFIG_PREEMPT_RT_FULL
11754 +
11755 +extern void __local_bh_disable(void);
11756 +extern void _local_bh_enable(void);
11757 +extern void __local_bh_enable(void);
11758 +
11759 +static inline void local_bh_disable(void)
11760 +{
11761 +       __local_bh_disable();
11762 +}
11763 +
11764 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11765 +{
11766 +       __local_bh_disable();
11767 +}
11768 +
11769 +static inline void local_bh_enable(void)
11770 +{
11771 +       __local_bh_enable();
11772 +}
11773 +
11774 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11775 +{
11776 +       __local_bh_enable();
11777 +}
11778 +
11779 +static inline void local_bh_enable_ip(unsigned long ip)
11780 +{
11781 +       __local_bh_enable();
11782 +}
11783 +
11784 +#else
11785 +
11786  #ifdef CONFIG_TRACE_IRQFLAGS
11787  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11788  #else
11789 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
11790  {
11791         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11792  }
11793 +#endif
11794  
11795  #endif /* _LINUX_BH_H */
11796 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
11797 index 89d9aa9e79bf..4a201008b02d 100644
11798 --- a/include/linux/buffer_head.h
11799 +++ b/include/linux/buffer_head.h
11800 @@ -75,8 +75,50 @@ struct buffer_head {
11801         struct address_space *b_assoc_map;      /* mapping this buffer is
11802                                                    associated with */
11803         atomic_t b_count;               /* users using this buffer_head */
11804 +#ifdef CONFIG_PREEMPT_RT_BASE
11805 +       spinlock_t b_uptodate_lock;
11806 +#if IS_ENABLED(CONFIG_JBD2)
11807 +       spinlock_t b_state_lock;
11808 +       spinlock_t b_journal_head_lock;
11809 +#endif
11810 +#endif
11811  };
11812  
11813 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11814 +{
11815 +       unsigned long flags;
11816 +
11817 +#ifndef CONFIG_PREEMPT_RT_BASE
11818 +       local_irq_save(flags);
11819 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11820 +#else
11821 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11822 +#endif
11823 +       return flags;
11824 +}
11825 +
11826 +static inline void
11827 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11828 +{
11829 +#ifndef CONFIG_PREEMPT_RT_BASE
11830 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11831 +       local_irq_restore(flags);
11832 +#else
11833 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11834 +#endif
11835 +}
11836 +
11837 +static inline void buffer_head_init_locks(struct buffer_head *bh)
11838 +{
11839 +#ifdef CONFIG_PREEMPT_RT_BASE
11840 +       spin_lock_init(&bh->b_uptodate_lock);
11841 +#if IS_ENABLED(CONFIG_JBD2)
11842 +       spin_lock_init(&bh->b_state_lock);
11843 +       spin_lock_init(&bh->b_journal_head_lock);
11844 +#endif
11845 +#endif
11846 +}
11847 +
11848  /*
11849   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11850   * and buffer_foo() functions.
11851 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
11852 index 8da263299754..0cc474291e08 100644
11853 --- a/include/linux/cgroup-defs.h
11854 +++ b/include/linux/cgroup-defs.h
11855 @@ -16,6 +16,7 @@
11856  #include <linux/percpu-refcount.h>
11857  #include <linux/percpu-rwsem.h>
11858  #include <linux/workqueue.h>
11859 +#include <linux/swork.h>
11860  
11861  #ifdef CONFIG_CGROUPS
11862  
11863 @@ -142,6 +143,7 @@ struct cgroup_subsys_state {
11864         /* percpu_ref killing and RCU release */
11865         struct rcu_head rcu_head;
11866         struct work_struct destroy_work;
11867 +       struct swork_event destroy_swork;
11868  };
11869  
11870  /*
11871 diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
11872 index 1e6932222e11..17f413bbbedf 100644
11873 --- a/include/linux/clk/at91_pmc.h
11874 +++ b/include/linux/clk/at91_pmc.h
11875 @@ -16,18 +16,6 @@
11876  #ifndef AT91_PMC_H
11877  #define AT91_PMC_H
11878  
11879 -#ifndef __ASSEMBLY__
11880 -extern void __iomem *at91_pmc_base;
11881 -
11882 -#define at91_pmc_read(field) \
11883 -       readl_relaxed(at91_pmc_base + field)
11884 -
11885 -#define at91_pmc_write(field, value) \
11886 -       writel_relaxed(value, at91_pmc_base + field)
11887 -#else
11888 -.extern at91_pmc_base
11889 -#endif
11890 -
11891  #define        AT91_PMC_SCER           0x00                    /* System Clock Enable Register */
11892  #define        AT91_PMC_SCDR           0x04                    /* System Clock Disable Register */
11893  
11894 diff --git a/include/linux/completion.h b/include/linux/completion.h
11895 index 5d5aaae3af43..3bca1590e29f 100644
11896 --- a/include/linux/completion.h
11897 +++ b/include/linux/completion.h
11898 @@ -7,8 +7,7 @@
11899   * Atomic wait-for-completion handler data structures.
11900   * See kernel/sched/completion.c for details.
11901   */
11902 -
11903 -#include <linux/wait.h>
11904 +#include <linux/swait.h>
11905  
11906  /*
11907   * struct completion - structure used to maintain state for a "completion"
11908 @@ -24,11 +23,11 @@
11909   */
11910  struct completion {
11911         unsigned int done;
11912 -       wait_queue_head_t wait;
11913 +       struct swait_queue_head wait;
11914  };
11915  
11916  #define COMPLETION_INITIALIZER(work) \
11917 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11918 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11919  
11920  #define COMPLETION_INITIALIZER_ONSTACK(work) \
11921         ({ init_completion(&work); work; })
11922 @@ -73,7 +72,7 @@ struct completion {
11923  static inline void init_completion(struct completion *x)
11924  {
11925         x->done = 0;
11926 -       init_waitqueue_head(&x->wait);
11927 +       init_swait_queue_head(&x->wait);
11928  }
11929  
11930  /**
11931 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
11932 index d2ca8c38f9c4..94041d803d0b 100644
11933 --- a/include/linux/cpu.h
11934 +++ b/include/linux/cpu.h
11935 @@ -231,6 +231,8 @@ extern void get_online_cpus(void);
11936  extern void put_online_cpus(void);
11937  extern void cpu_hotplug_disable(void);
11938  extern void cpu_hotplug_enable(void);
11939 +extern void pin_current_cpu(void);
11940 +extern void unpin_current_cpu(void);
11941  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
11942  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
11943  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
11944 @@ -248,6 +250,8 @@ static inline void cpu_hotplug_done(void) {}
11945  #define put_online_cpus()      do { } while (0)
11946  #define cpu_hotplug_disable()  do { } while (0)
11947  #define cpu_hotplug_enable()   do { } while (0)
11948 +static inline void pin_current_cpu(void) { }
11949 +static inline void unpin_current_cpu(void) { }
11950  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
11951  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
11952  /* These aren't inline functions due to a GCC bug. */
11953 diff --git a/include/linux/delay.h b/include/linux/delay.h
11954 index a6ecb34cf547..37caab306336 100644
11955 --- a/include/linux/delay.h
11956 +++ b/include/linux/delay.h
11957 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
11958         msleep(seconds * 1000);
11959  }
11960  
11961 +#ifdef CONFIG_PREEMPT_RT_FULL
11962 +extern void cpu_chill(void);
11963 +#else
11964 +# define cpu_chill()   cpu_relax()
11965 +#endif
11966 +
11967  #endif /* defined(_LINUX_DELAY_H) */
11968 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
11969 index 60048c50404e..f2cd67624f18 100644
11970 --- a/include/linux/ftrace.h
11971 +++ b/include/linux/ftrace.h
11972 @@ -694,6 +694,18 @@ static inline void __ftrace_enabled_restore(int enabled)
11973  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
11974  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
11975  
11976 +static inline unsigned long get_lock_parent_ip(void)
11977 +{
11978 +       unsigned long addr = CALLER_ADDR0;
11979 +
11980 +       if (!in_lock_functions(addr))
11981 +               return addr;
11982 +       addr = CALLER_ADDR1;
11983 +       if (!in_lock_functions(addr))
11984 +               return addr;
11985 +       return CALLER_ADDR2;
11986 +}
11987 +
11988  #ifdef CONFIG_IRQSOFF_TRACER
11989    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
11990    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
11991 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
11992 index bb3f3297062a..a117a33ef72c 100644
11993 --- a/include/linux/highmem.h
11994 +++ b/include/linux/highmem.h
11995 @@ -7,6 +7,7 @@
11996  #include <linux/mm.h>
11997  #include <linux/uaccess.h>
11998  #include <linux/hardirq.h>
11999 +#include <linux/sched.h>
12000  
12001  #include <asm/cacheflush.h>
12002  
12003 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
12004  
12005  static inline void *kmap_atomic(struct page *page)
12006  {
12007 -       preempt_disable();
12008 +       preempt_disable_nort();
12009         pagefault_disable();
12010         return page_address(page);
12011  }
12012 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
12013  static inline void __kunmap_atomic(void *addr)
12014  {
12015         pagefault_enable();
12016 -       preempt_enable();
12017 +       preempt_enable_nort();
12018  }
12019  
12020  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
12021 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
12022  
12023  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12024  
12025 +#ifndef CONFIG_PREEMPT_RT_FULL
12026  DECLARE_PER_CPU(int, __kmap_atomic_idx);
12027 +#endif
12028  
12029  static inline int kmap_atomic_idx_push(void)
12030  {
12031 +#ifndef CONFIG_PREEMPT_RT_FULL
12032         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12033  
12034 -#ifdef CONFIG_DEBUG_HIGHMEM
12035 +# ifdef CONFIG_DEBUG_HIGHMEM
12036         WARN_ON_ONCE(in_irq() && !irqs_disabled());
12037         BUG_ON(idx >= KM_TYPE_NR);
12038 -#endif
12039 +# endif
12040         return idx;
12041 +#else
12042 +       current->kmap_idx++;
12043 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
12044 +       return current->kmap_idx - 1;
12045 +#endif
12046  }
12047  
12048  static inline int kmap_atomic_idx(void)
12049  {
12050 +#ifndef CONFIG_PREEMPT_RT_FULL
12051         return __this_cpu_read(__kmap_atomic_idx) - 1;
12052 +#else
12053 +       return current->kmap_idx - 1;
12054 +#endif
12055  }
12056  
12057  static inline void kmap_atomic_idx_pop(void)
12058  {
12059 -#ifdef CONFIG_DEBUG_HIGHMEM
12060 +#ifndef CONFIG_PREEMPT_RT_FULL
12061 +# ifdef CONFIG_DEBUG_HIGHMEM
12062         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12063  
12064         BUG_ON(idx < 0);
12065 -#else
12066 +# else
12067         __this_cpu_dec(__kmap_atomic_idx);
12068 +# endif
12069 +#else
12070 +       current->kmap_idx--;
12071 +# ifdef CONFIG_DEBUG_HIGHMEM
12072 +       BUG_ON(current->kmap_idx < 0);
12073 +# endif
12074  #endif
12075  }
12076  
12077 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12078 index 2ead22dd74a0..8fbcdfa5dc77 100644
12079 --- a/include/linux/hrtimer.h
12080 +++ b/include/linux/hrtimer.h
12081 @@ -87,6 +87,9 @@ enum hrtimer_restart {
12082   * @function:  timer expiry callback function
12083   * @base:      pointer to the timer base (per cpu and per clock)
12084   * @state:     state information (See bit values above)
12085 + * @cb_entry:  list entry to defer timers from hardirq context
12086 + * @irqsafe:   timer can run in hardirq context
12087 + * @praecox:   timer expiry time if expired at the time of programming
12088   * @is_rel:    Set if the timer was armed relative
12089   * @start_pid:  timer statistics field to store the pid of the task which
12090   *             started the timer
12091 @@ -103,6 +106,11 @@ struct hrtimer {
12092         enum hrtimer_restart            (*function)(struct hrtimer *);
12093         struct hrtimer_clock_base       *base;
12094         u8                              state;
12095 +       struct list_head                cb_entry;
12096 +       int                             irqsafe;
12097 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
12098 +       ktime_t                         praecox;
12099 +#endif
12100         u8                              is_rel;
12101  #ifdef CONFIG_TIMER_STATS
12102         int                             start_pid;
12103 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
12104         struct task_struct *task;
12105  };
12106  
12107 -#ifdef CONFIG_64BIT
12108  # define HRTIMER_CLOCK_BASE_ALIGN      64
12109 -#else
12110 -# define HRTIMER_CLOCK_BASE_ALIGN      32
12111 -#endif
12112  
12113  /**
12114   * struct hrtimer_clock_base - the timer base for a specific clock
12115 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
12116   *                     timer to a base on another cpu.
12117   * @clockid:           clock id for per_cpu support
12118   * @active:            red black tree root node for the active timers
12119 + * @expired:           list head for deferred timers.
12120   * @get_time:          function to retrieve the current time of the clock
12121   * @offset:            offset of this clock to the monotonic base
12122   */
12123 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
12124         int                     index;
12125         clockid_t               clockid;
12126         struct timerqueue_head  active;
12127 +       struct list_head        expired;
12128         ktime_t                 (*get_time)(void);
12129         ktime_t                 offset;
12130  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12131 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
12132         raw_spinlock_t                  lock;
12133         seqcount_t                      seq;
12134         struct hrtimer                  *running;
12135 +       struct hrtimer                  *running_soft;
12136         unsigned int                    cpu;
12137         unsigned int                    active_bases;
12138         unsigned int                    clock_was_set_seq;
12139 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
12140         unsigned int                    nr_hangs;
12141         unsigned int                    max_hang_time;
12142  #endif
12143 +#ifdef CONFIG_PREEMPT_RT_BASE
12144 +       wait_queue_head_t               wait;
12145 +#endif
12146         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
12147  } ____cacheline_aligned;
12148  
12149 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12150         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12151  }
12152  
12153 +/* Softirq preemption could deadlock timer removal */
12154 +#ifdef CONFIG_PREEMPT_RT_BASE
12155 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12156 +#else
12157 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12158 +#endif
12159 +
12160  /* Query timers: */
12161  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12162  
12163 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12164   * Helper function to check, whether the timer is running the callback
12165   * function
12166   */
12167 -static inline int hrtimer_callback_running(struct hrtimer *timer)
12168 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
12169  {
12170         return timer->base->cpu_base->running == timer;
12171  }
12172 diff --git a/include/linux/idr.h b/include/linux/idr.h
12173 index 013fd9bc4cb6..f62be0aec911 100644
12174 --- a/include/linux/idr.h
12175 +++ b/include/linux/idr.h
12176 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
12177   * Each idr_preload() should be matched with an invocation of this
12178   * function.  See idr_preload() for details.
12179   */
12180 +#ifdef CONFIG_PREEMPT_RT_FULL
12181 +void idr_preload_end(void);
12182 +#else
12183  static inline void idr_preload_end(void)
12184  {
12185         preempt_enable();
12186  }
12187 +#endif
12188  
12189  /**
12190   * idr_find - return pointer for given id
12191 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12192 index 1c1ff7e4faa4..60fadde71a44 100644
12193 --- a/include/linux/init_task.h
12194 +++ b/include/linux/init_task.h
12195 @@ -148,9 +148,15 @@ extern struct task_group root_task_group;
12196  # define INIT_PERF_EVENTS(tsk)
12197  #endif
12198  
12199 +#ifdef CONFIG_PREEMPT_RT_BASE
12200 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12201 +#else
12202 +# define INIT_TIMER_LIST
12203 +#endif
12204 +
12205  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12206  # define INIT_VTIME(tsk)                                               \
12207 -       .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
12208 +       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
12209         .vtime_snap = 0,                                \
12210         .vtime_snap_whence = VTIME_SYS,
12211  #else
12212 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
12213         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
12214         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12215         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12216 +       INIT_TIMER_LIST                                                 \
12217         .pids = {                                                       \
12218                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12219                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12220 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12221 index ad16809c8596..655cee096aed 100644
12222 --- a/include/linux/interrupt.h
12223 +++ b/include/linux/interrupt.h
12224 @@ -61,6 +61,7 @@
12225   *                interrupt handler after suspending interrupts. For system
12226   *                wakeup devices users need to implement wakeup detection in
12227   *                their interrupt handlers.
12228 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12229   */
12230  #define IRQF_SHARED            0x00000080
12231  #define IRQF_PROBE_SHARED      0x00000100
12232 @@ -74,6 +75,7 @@
12233  #define IRQF_NO_THREAD         0x00010000
12234  #define IRQF_EARLY_RESUME      0x00020000
12235  #define IRQF_COND_SUSPEND      0x00040000
12236 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12237  
12238  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12239  
12240 @@ -186,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12241  #ifdef CONFIG_LOCKDEP
12242  # define local_irq_enable_in_hardirq() do { } while (0)
12243  #else
12244 -# define local_irq_enable_in_hardirq() local_irq_enable()
12245 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12246  #endif
12247  
12248  extern void disable_irq_nosync(unsigned int irq);
12249 @@ -206,6 +208,7 @@ extern void resume_device_irqs(void);
12250   * @irq:               Interrupt to which notification applies
12251   * @kref:              Reference count, for internal use
12252   * @work:              Work item, for internal use
12253 + * @list:              List item for deferred callbacks
12254   * @notify:            Function to be called on change.  This will be
12255   *                     called in process context.
12256   * @release:           Function to be called on release.  This will be
12257 @@ -217,6 +220,7 @@ struct irq_affinity_notify {
12258         unsigned int irq;
12259         struct kref kref;
12260         struct work_struct work;
12261 +       struct list_head list;
12262         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12263         void (*release)(struct kref *ref);
12264  };
12265 @@ -379,9 +383,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12266                                  bool state);
12267  
12268  #ifdef CONFIG_IRQ_FORCED_THREADING
12269 +# ifndef CONFIG_PREEMPT_RT_BASE
12270  extern bool force_irqthreads;
12271 +# else
12272 +#  define force_irqthreads     (true)
12273 +# endif
12274  #else
12275 -#define force_irqthreads       (0)
12276 +#define force_irqthreads       (false)
12277  #endif
12278  
12279  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12280 @@ -438,9 +446,10 @@ struct softirq_action
12281         void    (*action)(struct softirq_action *);
12282  };
12283  
12284 +#ifndef CONFIG_PREEMPT_RT_FULL
12285  asmlinkage void do_softirq(void);
12286  asmlinkage void __do_softirq(void);
12287 -
12288 +static inline void thread_do_softirq(void) { do_softirq(); }
12289  #ifdef __ARCH_HAS_DO_SOFTIRQ
12290  void do_softirq_own_stack(void);
12291  #else
12292 @@ -449,13 +458,25 @@ static inline void do_softirq_own_stack(void)
12293         __do_softirq();
12294  }
12295  #endif
12296 +#else
12297 +extern void thread_do_softirq(void);
12298 +#endif
12299  
12300  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12301  extern void softirq_init(void);
12302  extern void __raise_softirq_irqoff(unsigned int nr);
12303 +#ifdef CONFIG_PREEMPT_RT_FULL
12304 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12305 +#else
12306 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12307 +{
12308 +       __raise_softirq_irqoff(nr);
12309 +}
12310 +#endif
12311  
12312  extern void raise_softirq_irqoff(unsigned int nr);
12313  extern void raise_softirq(unsigned int nr);
12314 +extern void softirq_check_pending_idle(void);
12315  
12316  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12317  
12318 @@ -477,8 +498,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12319       to be executed on some cpu at least once after this.
12320     * If the tasklet is already scheduled, but its execution is still not
12321       started, it will be executed only once.
12322 -   * If this tasklet is already running on another CPU (or schedule is called
12323 -     from tasklet itself), it is rescheduled for later.
12324 +   * If this tasklet is already running on another CPU, it is rescheduled
12325 +     for later.
12326 +   * Schedule must not be called from within the tasklet itself (a lockup would occur).
12327     * Tasklet is strictly serialized wrt itself, but not
12328       wrt another tasklets. If client needs some intertask synchronization,
12329       he makes it with spinlocks.
12330 @@ -503,27 +525,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12331  enum
12332  {
12333         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12334 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12335 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12336 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12337  };
12338  
12339 -#ifdef CONFIG_SMP
12340 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12341 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12342 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12343 +
12344 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12345  static inline int tasklet_trylock(struct tasklet_struct *t)
12346  {
12347         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12348  }
12349  
12350 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12351 +{
12352 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12353 +}
12354 +
12355  static inline void tasklet_unlock(struct tasklet_struct *t)
12356  {
12357         smp_mb__before_atomic();
12358         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12359  }
12360  
12361 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12362 -{
12363 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12364 -}
12365 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12366 +
12367  #else
12368  #define tasklet_trylock(t) 1
12369 +#define tasklet_tryunlock(t)   1
12370  #define tasklet_unlock_wait(t) do { } while (0)
12371  #define tasklet_unlock(t) do { } while (0)
12372  #endif
12373 @@ -572,12 +603,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12374         smp_mb();
12375  }
12376  
12377 -static inline void tasklet_enable(struct tasklet_struct *t)
12378 -{
12379 -       smp_mb__before_atomic();
12380 -       atomic_dec(&t->count);
12381 -}
12382 -
12383 +extern void tasklet_enable(struct tasklet_struct *t);
12384  extern void tasklet_kill(struct tasklet_struct *t);
12385  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12386  extern void tasklet_init(struct tasklet_struct *t,
12387 @@ -608,6 +634,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12388         tasklet_kill(&ttimer->tasklet);
12389  }
12390  
12391 +#ifdef CONFIG_PREEMPT_RT_FULL
12392 +extern void softirq_early_init(void);
12393 +#else
12394 +static inline void softirq_early_init(void) { }
12395 +#endif
12396 +
12397  /*
12398   * Autoprobing for irqs:
12399   *
12400 diff --git a/include/linux/irq.h b/include/linux/irq.h
12401 index f7cade00c525..dac9e11ba037 100644
12402 --- a/include/linux/irq.h
12403 +++ b/include/linux/irq.h
12404 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
12405   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12406   *                               it from the spurious interrupt detection
12407   *                               mechanism and from core side polling.
12408 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12409   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12410   */
12411  enum {
12412 @@ -99,13 +100,14 @@ enum {
12413         IRQ_PER_CPU_DEVID       = (1 << 17),
12414         IRQ_IS_POLLED           = (1 << 18),
12415         IRQ_DISABLE_UNLAZY      = (1 << 19),
12416 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12417  };
12418  
12419  #define IRQF_MODIFY_MASK       \
12420         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12421          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12422          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12423 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12424 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12425  
12426  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12427  
12428 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12429 index 47b9ebd4a74f..2543aab05daa 100644
12430 --- a/include/linux/irq_work.h
12431 +++ b/include/linux/irq_work.h
12432 @@ -16,6 +16,7 @@
12433  #define IRQ_WORK_BUSY          2UL
12434  #define IRQ_WORK_FLAGS         3UL
12435  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12436 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12437  
12438  struct irq_work {
12439         unsigned long flags;
12440 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12441  static inline void irq_work_run(void) { }
12442  #endif
12443  
12444 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12445 +void irq_work_tick_soft(void);
12446 +#else
12447 +static inline void irq_work_tick_soft(void) { }
12448 +#endif
12449 +
12450  #endif /* _LINUX_IRQ_WORK_H */
12451 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12452 index a587a33363c7..ad57402a242d 100644
12453 --- a/include/linux/irqdesc.h
12454 +++ b/include/linux/irqdesc.h
12455 @@ -61,6 +61,7 @@ struct irq_desc {
12456         unsigned int            irqs_unhandled;
12457         atomic_t                threads_handled;
12458         int                     threads_handled_last;
12459 +       u64                     random_ip;
12460         raw_spinlock_t          lock;
12461         struct cpumask          *percpu_enabled;
12462  #ifdef CONFIG_SMP
12463 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12464 index 5dd1272d1ab2..9b77034f7c5e 100644
12465 --- a/include/linux/irqflags.h
12466 +++ b/include/linux/irqflags.h
12467 @@ -25,8 +25,6 @@
12468  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
12469  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
12470  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
12471 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
12472 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
12473  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12474  #else
12475  # define trace_hardirqs_on()           do { } while (0)
12476 @@ -39,9 +37,15 @@
12477  # define trace_softirqs_enabled(p)     0
12478  # define trace_hardirq_enter()         do { } while (0)
12479  # define trace_hardirq_exit()          do { } while (0)
12480 +# define INIT_TRACE_IRQFLAGS
12481 +#endif
12482 +
12483 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12484 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
12485 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
12486 +#else
12487  # define lockdep_softirq_enter()       do { } while (0)
12488  # define lockdep_softirq_exit()                do { } while (0)
12489 -# define INIT_TRACE_IRQFLAGS
12490  #endif
12491  
12492  #if defined(CONFIG_IRQSOFF_TRACER) || \
12493 @@ -148,4 +152,23 @@
12494  
12495  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12496  
12497 +/*
12498 + * local_irq* variants depending on RT/!RT
12499 + */
12500 +#ifdef CONFIG_PREEMPT_RT_FULL
12501 +# define local_irq_disable_nort()      do { } while (0)
12502 +# define local_irq_enable_nort()       do { } while (0)
12503 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12504 +# define local_irq_restore_nort(flags) (void)(flags)
12505 +# define local_irq_disable_rt()                local_irq_disable()
12506 +# define local_irq_enable_rt()         local_irq_enable()
12507 +#else
12508 +# define local_irq_disable_nort()      local_irq_disable()
12509 +# define local_irq_enable_nort()       local_irq_enable()
12510 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12511 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12512 +# define local_irq_disable_rt()                do { } while (0)
12513 +# define local_irq_enable_rt()         do { } while (0)
12514 +#endif
12515 +
12516  #endif
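
Illustrative usage sketch for the _nort/_rt irqflags helpers added above (not taken from the patch; the function and counter names are made-up): a section that must run with interrupts off on a stock kernel, but may stay interruptible on PREEMPT_RT_FULL because a sleeping lock already provides the exclusion there.

	static unsigned long example_counter;

	static void example_update(void)
	{
		unsigned long flags;

		/* local_irq_save() on !RT; on RT only the flags word is saved */
		local_irq_save_nort(flags);
		example_counter++;
		local_irq_restore_nort(flags);
	}
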
12517 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12518 index 65407f6c9120..eb5aabe4e18c 100644
12519 --- a/include/linux/jbd2.h
12520 +++ b/include/linux/jbd2.h
12521 @@ -352,32 +352,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12522  
12523  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12524  {
12525 +#ifndef CONFIG_PREEMPT_RT_BASE
12526         bit_spin_lock(BH_State, &bh->b_state);
12527 +#else
12528 +       spin_lock(&bh->b_state_lock);
12529 +#endif
12530  }
12531  
12532  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12533  {
12534 +#ifndef CONFIG_PREEMPT_RT_BASE
12535         return bit_spin_trylock(BH_State, &bh->b_state);
12536 +#else
12537 +       return spin_trylock(&bh->b_state_lock);
12538 +#endif
12539  }
12540  
12541  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12542  {
12543 +#ifndef CONFIG_PREEMPT_RT_BASE
12544         return bit_spin_is_locked(BH_State, &bh->b_state);
12545 +#else
12546 +       return spin_is_locked(&bh->b_state_lock);
12547 +#endif
12548  }
12549  
12550  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12551  {
12552 +#ifndef CONFIG_PREEMPT_RT_BASE
12553         bit_spin_unlock(BH_State, &bh->b_state);
12554 +#else
12555 +       spin_unlock(&bh->b_state_lock);
12556 +#endif
12557  }
12558  
12559  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12560  {
12561 +#ifndef CONFIG_PREEMPT_RT_BASE
12562         bit_spin_lock(BH_JournalHead, &bh->b_state);
12563 +#else
12564 +       spin_lock(&bh->b_journal_head_lock);
12565 +#endif
12566  }
12567  
12568  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12569  {
12570 +#ifndef CONFIG_PREEMPT_RT_BASE
12571         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12572 +#else
12573 +       spin_unlock(&bh->b_journal_head_lock);
12574 +#endif
12575  }
12576  
12577  #define J_ASSERT(assert)       BUG_ON(!(assert))
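
A minimal sketch of how jbd2 callers take the BH_State lock through the helpers above (the function body is made-up); with PREEMPT_RT_BASE the same call now acquires the sleeping b_state_lock in the buffer_head instead of spinning on the BH_State bit.

	static void example_touch_journal_head(struct buffer_head *bh)
	{
		jbd_lock_bh_state(bh);
		/* ... examine or modify the journal_head attached to bh ... */
		jbd_unlock_bh_state(bh);
	}
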
12578 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12579 index a19bcf9e762e..897495386446 100644
12580 --- a/include/linux/kdb.h
12581 +++ b/include/linux/kdb.h
12582 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12583  extern __printf(1, 2) int kdb_printf(const char *, ...);
12584  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12585  
12586 +#define in_kdb_printk()        (kdb_trap_printk)
12587  extern void kdb_init(int level);
12588  
12589  /* Access to kdb specific polling devices */
12590 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12591  extern int kdb_unregister(char *);
12592  #else /* ! CONFIG_KGDB_KDB */
12593  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12594 +#define in_kdb_printk() (0)
12595  static inline void kdb_init(int level) {}
12596  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12597                                char *help, short minlen) { return 0; }
12598 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12599 index e571e592e53a..8004feb91175 100644
12600 --- a/include/linux/kernel.h
12601 +++ b/include/linux/kernel.h
12602 @@ -188,6 +188,9 @@ extern int _cond_resched(void);
12603   */
12604  # define might_sleep() \
12605         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12606 +
12607 +# define might_sleep_no_state_check() \
12608 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12609  # define sched_annotate_sleep()        (current->task_state_change = 0)
12610  #else
12611    static inline void ___might_sleep(const char *file, int line,
12612 @@ -195,6 +198,7 @@ extern int _cond_resched(void);
12613    static inline void __might_sleep(const char *file, int line,
12614                                    int preempt_offset) { }
12615  # define might_sleep() do { might_resched(); } while (0)
12616 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12617  # define sched_annotate_sleep() do { } while (0)
12618  #endif
12619  
12620 @@ -255,6 +259,7 @@ extern long (*panic_blink)(int state);
12621  __printf(1, 2)
12622  void panic(const char *fmt, ...)
12623         __noreturn __cold;
12624 +void nmi_panic(struct pt_regs *regs, const char *msg);
12625  extern void oops_enter(void);
12626  extern void oops_exit(void);
12627  void print_oops_end_marker(void);
12628 @@ -446,6 +451,14 @@ extern int sysctl_panic_on_stackoverflow;
12629  extern bool crash_kexec_post_notifiers;
12630  
12631  /*
12632 + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
12633 + * holds a CPU number which is executing panic() currently. A value of
12634 + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
12635 + */
12636 +extern atomic_t panic_cpu;
12637 +#define PANIC_CPU_INVALID      -1
12638 +
12639 +/*
12640   * Only to be used by arch init code. If the user over-wrote the default
12641   * CONFIG_PANIC_TIMEOUT, honor it.
12642   */
12643 @@ -473,6 +486,7 @@ extern enum system_states {
12644         SYSTEM_HALT,
12645         SYSTEM_POWER_OFF,
12646         SYSTEM_RESTART,
12647 +       SYSTEM_SUSPEND,
12648  } system_state;
12649  
12650  #define TAINT_PROPRIETARY_MODULE       0
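
Hypothetical caller of the nmi_panic() declared above (the handler name is invented): from NMI context panic() must not be re-entered, so nmi_panic() consults panic_cpu and only the first CPU proceeds into panic().

	static void example_fatal_nmi(struct pt_regs *regs)
	{
		/* Safe from NMI context: the panic_cpu check prevents
		 * re-entering a panic() already running on another CPU. */
		nmi_panic(regs, "fatal NMI");
	}
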
12651 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
12652 index c923350ca20a..c690acc6900e 100644
12653 --- a/include/linux/kvm_host.h
12654 +++ b/include/linux/kvm_host.h
12655 @@ -25,6 +25,7 @@
12656  #include <linux/irqflags.h>
12657  #include <linux/context_tracking.h>
12658  #include <linux/irqbypass.h>
12659 +#include <linux/swait.h>
12660  #include <asm/signal.h>
12661  
12662  #include <linux/kvm.h>
12663 @@ -243,7 +244,7 @@ struct kvm_vcpu {
12664         int fpu_active;
12665         int guest_fpu_loaded, guest_xcr0_loaded;
12666         unsigned char fpu_counter;
12667 -       wait_queue_head_t wq;
12668 +       struct swait_queue_head wq;
12669         struct pid *pid;
12670         int sigset_active;
12671         sigset_t sigset;
12672 @@ -794,7 +795,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
12673  }
12674  #endif
12675  
12676 -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12677 +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12678  {
12679  #ifdef __KVM_HAVE_ARCH_WQP
12680         return vcpu->arch.wqp;
12681 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
12682 index c92ebd100d9b..6f035f635d0e 100644
12683 --- a/include/linux/lglock.h
12684 +++ b/include/linux/lglock.h
12685 @@ -34,13 +34,30 @@
12686  #endif
12687  
12688  struct lglock {
12689 +#ifdef CONFIG_PREEMPT_RT_FULL
12690 +       struct rt_mutex __percpu *lock;
12691 +#else
12692         arch_spinlock_t __percpu *lock;
12693 +#endif
12694  #ifdef CONFIG_DEBUG_LOCK_ALLOC
12695         struct lock_class_key lock_key;
12696         struct lockdep_map    lock_dep_map;
12697  #endif
12698  };
12699  
12700 +#ifdef CONFIG_PREEMPT_RT_FULL
12701 +# define DEFINE_LGLOCK(name)                                           \
12702 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12703 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12704 +       struct lglock name = { .lock = &name ## _lock }
12705 +
12706 +# define DEFINE_STATIC_LGLOCK(name)                                    \
12707 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12708 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12709 +       static struct lglock name = { .lock = &name ## _lock }
12710 +
12711 +#else
12712 +
12713  #define DEFINE_LGLOCK(name)                                            \
12714         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12715         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12716 @@ -50,6 +67,7 @@ struct lglock {
12717         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12718         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12719         static struct lglock name = { .lock = &name ## _lock }
12720 +#endif
12721  
12722  void lg_lock_init(struct lglock *lg, char *name);
12723  
12724 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
12725  void lg_global_lock(struct lglock *lg);
12726  void lg_global_unlock(struct lglock *lg);
12727  
12728 +#ifndef CONFIG_PREEMPT_RT_FULL
12729 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
12730 +#else
12731 +void lg_global_trylock_relax(struct lglock *lg);
12732 +#endif
12733 +
12734  #else
12735  /* When !CONFIG_SMP, map lglock to spinlock */
12736  #define lglock spinlock
12737 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12738 index 8132214e8efd..89ffaa7bd342 100644
12739 --- a/include/linux/list_bl.h
12740 +++ b/include/linux/list_bl.h
12741 @@ -2,6 +2,7 @@
12742  #define _LINUX_LIST_BL_H
12743  
12744  #include <linux/list.h>
12745 +#include <linux/spinlock.h>
12746  #include <linux/bit_spinlock.h>
12747  
12748  /*
12749 @@ -32,13 +33,24 @@
12750  
12751  struct hlist_bl_head {
12752         struct hlist_bl_node *first;
12753 +#ifdef CONFIG_PREEMPT_RT_BASE
12754 +       raw_spinlock_t lock;
12755 +#endif
12756  };
12757  
12758  struct hlist_bl_node {
12759         struct hlist_bl_node *next, **pprev;
12760  };
12761 -#define INIT_HLIST_BL_HEAD(ptr) \
12762 -       ((ptr)->first = NULL)
12763 +
12764 +#ifdef CONFIG_PREEMPT_RT_BASE
12765 +#define INIT_HLIST_BL_HEAD(h)          \
12766 +do {                                   \
12767 +       (h)->first = NULL;              \
12768 +       raw_spin_lock_init(&(h)->lock); \
12769 +} while (0)
12770 +#else
12771 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12772 +#endif
12773  
12774  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12775  {
12776 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12777  
12778  static inline void hlist_bl_lock(struct hlist_bl_head *b)
12779  {
12780 +#ifndef CONFIG_PREEMPT_RT_BASE
12781         bit_spin_lock(0, (unsigned long *)b);
12782 +#else
12783 +       raw_spin_lock(&b->lock);
12784 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12785 +       __set_bit(0, (unsigned long *)b);
12786 +#endif
12787 +#endif
12788  }
12789  
12790  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12791  {
12792 +#ifndef CONFIG_PREEMPT_RT_BASE
12793         __bit_spin_unlock(0, (unsigned long *)b);
12794 +#else
12795 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12796 +       __clear_bit(0, (unsigned long *)b);
12797 +#endif
12798 +       raw_spin_unlock(&b->lock);
12799 +#endif
12800  }
12801  
12802  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
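
Usage sketch for the hlist_bl locking above (the function, table and hash value are made-up): callers keep using hlist_bl_lock()/hlist_bl_unlock(), and on PREEMPT_RT_BASE these now take the raw spinlock embedded in the head rather than bit 0 of the first pointer.

	static void example_hash_add(struct hlist_bl_head *table, unsigned int hash,
				     struct hlist_bl_node *n)
	{
		struct hlist_bl_head *b = &table[hash];

		hlist_bl_lock(b);
		hlist_bl_add_head(n, b);	/* plain (non-RCU) add helper from list_bl.h */
		hlist_bl_unlock(b);
	}
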
12803 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
12804 new file mode 100644
12805 index 000000000000..e572a3971631
12806 --- /dev/null
12807 +++ b/include/linux/locallock.h
12808 @@ -0,0 +1,276 @@
12809 +#ifndef _LINUX_LOCALLOCK_H
12810 +#define _LINUX_LOCALLOCK_H
12811 +
12812 +#include <linux/percpu.h>
12813 +#include <linux/spinlock.h>
12814 +
12815 +#ifdef CONFIG_PREEMPT_RT_BASE
12816 +
12817 +#ifdef CONFIG_DEBUG_SPINLOCK
12818 +# define LL_WARN(cond) WARN_ON(cond)
12819 +#else
12820 +# define LL_WARN(cond) do { } while (0)
12821 +#endif
12822 +
12823 +/*
12824 + * per cpu lock based substitute for local_irq_*()
12825 + */
12826 +struct local_irq_lock {
12827 +       spinlock_t              lock;
12828 +       struct task_struct      *owner;
12829 +       int                     nestcnt;
12830 +       unsigned long           flags;
12831 +};
12832 +
12833 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
12834 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
12835 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12836 +
12837 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
12838 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
12839 +
12840 +#define local_irq_lock_init(lvar)                                      \
12841 +       do {                                                            \
12842 +               int __cpu;                                              \
12843 +               for_each_possible_cpu(__cpu)                            \
12844 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
12845 +       } while (0)
12846 +
12847 +/*
12848 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
12849 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
12850 + * already takes care of the migrate_disable/enable
12851 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
12852 + */
12853 +#ifdef CONFIG_PREEMPT_RT_FULL
12854 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
12855 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
12856 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
12857 +#else
12858 +# define spin_lock_local(lock)                 spin_lock(lock)
12859 +# define spin_trylock_local(lock)              spin_trylock(lock)
12860 +# define spin_unlock_local(lock)               spin_unlock(lock)
12861 +#endif
12862 +
12863 +static inline void __local_lock(struct local_irq_lock *lv)
12864 +{
12865 +       if (lv->owner != current) {
12866 +               spin_lock_local(&lv->lock);
12867 +               LL_WARN(lv->owner);
12868 +               LL_WARN(lv->nestcnt);
12869 +               lv->owner = current;
12870 +       }
12871 +       lv->nestcnt++;
12872 +}
12873 +
12874 +#define local_lock(lvar)                                       \
12875 +       do { __local_lock(&get_local_var(lvar)); } while (0)
12876 +
12877 +#define local_lock_on(lvar, cpu)                               \
12878 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12879 +
12880 +static inline int __local_trylock(struct local_irq_lock *lv)
12881 +{
12882 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
12883 +               LL_WARN(lv->owner);
12884 +               LL_WARN(lv->nestcnt);
12885 +               lv->owner = current;
12886 +               lv->nestcnt = 1;
12887 +               return 1;
12888 +       }
12889 +       return 0;
12890 +}
12891 +
12892 +#define local_trylock(lvar)                                            \
12893 +       ({                                                              \
12894 +               int __locked;                                           \
12895 +               __locked = __local_trylock(&get_local_var(lvar));       \
12896 +               if (!__locked)                                          \
12897 +                       put_local_var(lvar);                            \
12898 +               __locked;                                               \
12899 +       })
12900 +
12901 +static inline void __local_unlock(struct local_irq_lock *lv)
12902 +{
12903 +       LL_WARN(lv->nestcnt == 0);
12904 +       LL_WARN(lv->owner != current);
12905 +       if (--lv->nestcnt)
12906 +               return;
12907 +
12908 +       lv->owner = NULL;
12909 +       spin_unlock_local(&lv->lock);
12910 +}
12911 +
12912 +#define local_unlock(lvar)                                     \
12913 +       do {                                                    \
12914 +               __local_unlock(this_cpu_ptr(&lvar));            \
12915 +               put_local_var(lvar);                            \
12916 +       } while (0)
12917 +
12918 +#define local_unlock_on(lvar, cpu)                       \
12919 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12920 +
12921 +static inline void __local_lock_irq(struct local_irq_lock *lv)
12922 +{
12923 +       spin_lock_irqsave(&lv->lock, lv->flags);
12924 +       LL_WARN(lv->owner);
12925 +       LL_WARN(lv->nestcnt);
12926 +       lv->owner = current;
12927 +       lv->nestcnt = 1;
12928 +}
12929 +
12930 +#define local_lock_irq(lvar)                                           \
12931 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12932 +
12933 +#define local_lock_irq_on(lvar, cpu)                                   \
12934 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12935 +
12936 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
12937 +{
12938 +       LL_WARN(!lv->nestcnt);
12939 +       LL_WARN(lv->owner != current);
12940 +       lv->owner = NULL;
12941 +       lv->nestcnt = 0;
12942 +       spin_unlock_irq(&lv->lock);
12943 +}
12944 +
12945 +#define local_unlock_irq(lvar)                                         \
12946 +       do {                                                            \
12947 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
12948 +               put_local_var(lvar);                                    \
12949 +       } while (0)
12950 +
12951 +#define local_unlock_irq_on(lvar, cpu)                                 \
12952 +       do {                                                            \
12953 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
12954 +       } while (0)
12955 +
12956 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12957 +{
12958 +       if (lv->owner != current) {
12959 +               __local_lock_irq(lv);
12960 +               return 0;
12961 +       } else {
12962 +               lv->nestcnt++;
12963 +               return 1;
12964 +       }
12965 +}
12966 +
12967 +#define local_lock_irqsave(lvar, _flags)                               \
12968 +       do {                                                            \
12969 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
12970 +                       put_local_var(lvar);                            \
12971 +               _flags = __this_cpu_read(lvar.flags);                   \
12972 +       } while (0)
12973 +
12974 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
12975 +       do {                                                            \
12976 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
12977 +               _flags = per_cpu(lvar, cpu).flags;                      \
12978 +       } while (0)
12979 +
12980 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
12981 +                                           unsigned long flags)
12982 +{
12983 +       LL_WARN(!lv->nestcnt);
12984 +       LL_WARN(lv->owner != current);
12985 +       if (--lv->nestcnt)
12986 +               return 0;
12987 +
12988 +       lv->owner = NULL;
12989 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
12990 +       return 1;
12991 +}
12992 +
12993 +#define local_unlock_irqrestore(lvar, flags)                           \
12994 +       do {                                                            \
12995 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
12996 +                       put_local_var(lvar);                            \
12997 +       } while (0)
12998 +
12999 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
13000 +       do {                                                            \
13001 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
13002 +       } while (0)
13003 +
13004 +#define local_spin_trylock_irq(lvar, lock)                             \
13005 +       ({                                                              \
13006 +               int __locked;                                           \
13007 +               local_lock_irq(lvar);                                   \
13008 +               __locked = spin_trylock(lock);                          \
13009 +               if (!__locked)                                          \
13010 +                       local_unlock_irq(lvar);                         \
13011 +               __locked;                                               \
13012 +       })
13013 +
13014 +#define local_spin_lock_irq(lvar, lock)                                        \
13015 +       do {                                                            \
13016 +               local_lock_irq(lvar);                                   \
13017 +               spin_lock(lock);                                        \
13018 +       } while (0)
13019 +
13020 +#define local_spin_unlock_irq(lvar, lock)                              \
13021 +       do {                                                            \
13022 +               spin_unlock(lock);                                      \
13023 +               local_unlock_irq(lvar);                                 \
13024 +       } while (0)
13025 +
13026 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
13027 +       do {                                                            \
13028 +               local_lock_irqsave(lvar, flags);                        \
13029 +               spin_lock(lock);                                        \
13030 +       } while (0)
13031 +
13032 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
13033 +       do {                                                            \
13034 +               spin_unlock(lock);                                      \
13035 +               local_unlock_irqrestore(lvar, flags);                   \
13036 +       } while (0)
13037 +
13038 +#define get_locked_var(lvar, var)                                      \
13039 +       (*({                                                            \
13040 +               local_lock(lvar);                                       \
13041 +               this_cpu_ptr(&var);                                     \
13042 +       }))
13043 +
13044 +#define put_locked_var(lvar, var)      local_unlock(lvar);
13045 +
13046 +#define local_lock_cpu(lvar)                                           \
13047 +       ({                                                              \
13048 +               local_lock(lvar);                                       \
13049 +               smp_processor_id();                                     \
13050 +       })
13051 +
13052 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
13053 +
13054 +#else /* PREEMPT_RT_BASE */
13055 +
13056 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
13057 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
13058 +
13059 +static inline void local_irq_lock_init(int lvar) { }
13060 +
13061 +#define local_lock(lvar)                       preempt_disable()
13062 +#define local_unlock(lvar)                     preempt_enable()
13063 +#define local_lock_irq(lvar)                   local_irq_disable()
13064 +#define local_unlock_irq(lvar)                 local_irq_enable()
13065 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
13066 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
13067 +
13068 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
13069 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
13070 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
13071 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
13072 +       spin_lock_irqsave(lock, flags)
13073 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
13074 +       spin_unlock_irqrestore(lock, flags)
13075 +
13076 +#define get_locked_var(lvar, var)              get_cpu_var(var)
13077 +#define put_locked_var(lvar, var)              put_cpu_var(var)
13078 +
13079 +#define local_lock_cpu(lvar)                   get_cpu()
13080 +#define local_unlock_cpu(lvar)                 put_cpu()
13081 +
13082 +#endif
13083 +
13084 +#endif
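
A minimal usage sketch for the new locallock API (the lock and per-CPU list names are made-up; linux/locallock.h and linux/list.h are assumed): on a non-RT build local_lock_irqsave() collapses to local_irq_save(), while on PREEMPT_RT_BASE it takes the per-CPU sleeping spinlock with migration disabled, so the section stays preemptible.

	static DEFINE_PER_CPU(struct list_head, example_pcpu_list);
	static DEFINE_LOCAL_IRQ_LOCK(example_lock);

	static void example_add(struct list_head *entry)
	{
		unsigned long flags;

		/* assumes the per-CPU list heads were initialized elsewhere */
		local_lock_irqsave(example_lock, flags);
		list_add(entry, this_cpu_ptr(&example_pcpu_list));
		local_unlock_irqrestore(example_lock, flags);
	}

This is the same pattern the patch applies further down to netfilter's xt_write_recseq_begin()/end() via xt_write_lock.
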
13085 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13086 index f8d1492a114f..b238ebfbb4d6 100644
13087 --- a/include/linux/mm_types.h
13088 +++ b/include/linux/mm_types.h
13089 @@ -11,6 +11,7 @@
13090  #include <linux/completion.h>
13091  #include <linux/cpumask.h>
13092  #include <linux/uprobes.h>
13093 +#include <linux/rcupdate.h>
13094  #include <linux/page-flags-layout.h>
13095  #include <asm/page.h>
13096  #include <asm/mmu.h>
13097 @@ -504,6 +505,9 @@ struct mm_struct {
13098         bool tlb_flush_pending;
13099  #endif
13100         struct uprobes_state uprobes_state;
13101 +#ifdef CONFIG_PREEMPT_RT_BASE
13102 +       struct rcu_head delayed_drop;
13103 +#endif
13104  #ifdef CONFIG_X86_INTEL_MPX
13105         /* address of the bounds directory */
13106         void __user *bd_addr;
13107 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13108 index 2cb7531e7d7a..b3fdfc820216 100644
13109 --- a/include/linux/mutex.h
13110 +++ b/include/linux/mutex.h
13111 @@ -19,6 +19,17 @@
13112  #include <asm/processor.h>
13113  #include <linux/osq_lock.h>
13114  
13115 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13116 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13117 +       , .dep_map = { .name = #lockname }
13118 +#else
13119 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13120 +#endif
13121 +
13122 +#ifdef CONFIG_PREEMPT_RT_FULL
13123 +# include <linux/mutex_rt.h>
13124 +#else
13125 +
13126  /*
13127   * Simple, straightforward mutexes with strict semantics:
13128   *
13129 @@ -99,13 +110,6 @@ do {                                                        \
13130  static inline void mutex_destroy(struct mutex *lock) {}
13131  #endif
13132  
13133 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13134 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13135 -               , .dep_map = { .name = #lockname }
13136 -#else
13137 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13138 -#endif
13139 -
13140  #define __MUTEX_INITIALIZER(lockname) \
13141                 { .count = ATOMIC_INIT(1) \
13142                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13143 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
13144  extern int mutex_trylock(struct mutex *lock);
13145  extern void mutex_unlock(struct mutex *lock);
13146  
13147 +#endif /* !PREEMPT_RT_FULL */
13148 +
13149  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13150  
13151  #endif /* __LINUX_MUTEX_H */
13152 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13153 new file mode 100644
13154 index 000000000000..c38a44b14da5
13155 --- /dev/null
13156 +++ b/include/linux/mutex_rt.h
13157 @@ -0,0 +1,84 @@
13158 +#ifndef __LINUX_MUTEX_RT_H
13159 +#define __LINUX_MUTEX_RT_H
13160 +
13161 +#ifndef __LINUX_MUTEX_H
13162 +#error "Please include mutex.h"
13163 +#endif
13164 +
13165 +#include <linux/rtmutex.h>
13166 +
13167 +/* FIXME: Just for __lockfunc */
13168 +#include <linux/spinlock.h>
13169 +
13170 +struct mutex {
13171 +       struct rt_mutex         lock;
13172 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13173 +       struct lockdep_map      dep_map;
13174 +#endif
13175 +};
13176 +
13177 +#define __MUTEX_INITIALIZER(mutexname)                                 \
13178 +       {                                                               \
13179 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
13180 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
13181 +       }
13182 +
13183 +#define DEFINE_MUTEX(mutexname)                                                \
13184 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13185 +
13186 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13187 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13188 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13189 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13190 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13191 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13192 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13193 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13194 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13195 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13196 +
13197 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
13198 +#define mutex_lock(l)                  _mutex_lock(l)
13199 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
13200 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
13201 +#define mutex_trylock(l)               _mutex_trylock(l)
13202 +#define mutex_unlock(l)                        _mutex_unlock(l)
13203 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
13204 +
13205 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13206 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
13207 +# define mutex_lock_interruptible_nested(l, s) \
13208 +                                       _mutex_lock_interruptible_nested(l, s)
13209 +# define mutex_lock_killable_nested(l, s) \
13210 +                                       _mutex_lock_killable_nested(l, s)
13211 +
13212 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
13213 +do {                                                                   \
13214 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
13215 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
13216 +} while (0)
13217 +
13218 +#else
13219 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
13220 +# define mutex_lock_interruptible_nested(l, s) \
13221 +                                       _mutex_lock_interruptible(l)
13222 +# define mutex_lock_killable_nested(l, s) \
13223 +                                       _mutex_lock_killable(l)
13224 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13225 +#endif
13226 +
13227 +# define mutex_init(mutex)                             \
13228 +do {                                                   \
13229 +       static struct lock_class_key __key;             \
13230 +                                                       \
13231 +       rt_mutex_init(&(mutex)->lock);                  \
13232 +       __mutex_do_init((mutex), #mutex, &__key);       \
13233 +} while (0)
13234 +
13235 +# define __mutex_init(mutex, name, key)                        \
13236 +do {                                                   \
13237 +       rt_mutex_init(&(mutex)->lock);                  \
13238 +       __mutex_do_init((mutex), name, key);            \
13239 +} while (0)
13240 +
13241 +#endif
13242 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13243 index 4e9c75226f07..53dd7087d75e 100644
13244 --- a/include/linux/netdevice.h
13245 +++ b/include/linux/netdevice.h
13246 @@ -2248,11 +2248,20 @@ void netdev_freemem(struct net_device *dev);
13247  void synchronize_net(void);
13248  int init_dummy_netdev(struct net_device *dev);
13249  
13250 +#ifdef CONFIG_PREEMPT_RT_FULL
13251 +static inline int dev_recursion_level(void)
13252 +{
13253 +       return current->xmit_recursion;
13254 +}
13255 +
13256 +#else
13257 +
13258  DECLARE_PER_CPU(int, xmit_recursion);
13259  static inline int dev_recursion_level(void)
13260  {
13261         return this_cpu_read(xmit_recursion);
13262  }
13263 +#endif
13264  
13265  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13266  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13267 @@ -2563,6 +2572,7 @@ struct softnet_data {
13268         unsigned int            dropped;
13269         struct sk_buff_head     input_pkt_queue;
13270         struct napi_struct      backlog;
13271 +       struct sk_buff_head     tofree_queue;
13272  
13273  };
13274  
13275 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13276 index 04078e8a4803..a61c9609e32f 100644
13277 --- a/include/linux/netfilter/x_tables.h
13278 +++ b/include/linux/netfilter/x_tables.h
13279 @@ -4,6 +4,7 @@
13280  
13281  #include <linux/netdevice.h>
13282  #include <linux/static_key.h>
13283 +#include <linux/locallock.h>
13284  #include <uapi/linux/netfilter/x_tables.h>
13285  
13286  /**
13287 @@ -289,6 +290,8 @@ void xt_free_table_info(struct xt_table_info *info);
13288   */
13289  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13290  
13291 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13292 +
13293  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13294   *
13295   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13296 @@ -309,6 +312,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13297  {
13298         unsigned int addend;
13299  
13300 +       /* RT protection */
13301 +       local_lock(xt_write_lock);
13302 +
13303         /*
13304          * Low order bit of sequence is set if we already
13305          * called xt_write_recseq_begin().
13306 @@ -339,6 +345,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13307         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13308         smp_wmb();
13309         __this_cpu_add(xt_recseq.sequence, addend);
13310 +       local_unlock(xt_write_lock);
13311  }
13312  
13313  /*
13314 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13315 index d14a4c362465..2e4414a0c1c4 100644
13316 --- a/include/linux/notifier.h
13317 +++ b/include/linux/notifier.h
13318 @@ -6,7 +6,7 @@
13319   *
13320   *                             Alan Cox <Alan.Cox@linux.org>
13321   */
13322 - 
13323 +
13324  #ifndef _LINUX_NOTIFIER_H
13325  #define _LINUX_NOTIFIER_H
13326  #include <linux/errno.h>
13327 @@ -42,9 +42,7 @@
13328   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13329   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13330   * SRCU notifier chains should be used when the chain will be called very
13331 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13332 - * chains are slightly more difficult to use because they require special
13333 - * runtime initialization.
13334 + * often but notifier_blocks will seldom be removed.
13335   */
13336  
13337  typedef        int (*notifier_fn_t)(struct notifier_block *nb,
13338 @@ -88,7 +86,7 @@ struct srcu_notifier_head {
13339                 (name)->head = NULL;            \
13340         } while (0)
13341  
13342 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13343 +/* srcu_notifier_heads must be cleaned up dynamically */
13344  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13345  #define srcu_cleanup_notifier_head(name)       \
13346                 cleanup_srcu_struct(&(name)->srcu);
13347 @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13348                 .head = NULL }
13349  #define RAW_NOTIFIER_INIT(name)        {                               \
13350                 .head = NULL }
13351 -/* srcu_notifier_heads cannot be initialized statically */
13352 +
13353 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13354 +       {                                                       \
13355 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13356 +               .head = NULL,                                   \
13357 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13358 +       }
13359  
13360  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13361         struct atomic_notifier_head name =                      \
13362 @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13363         struct raw_notifier_head name =                         \
13364                 RAW_NOTIFIER_INIT(name)
13365  
13366 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13367 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
13368 +                       name##_head_srcu_array);                \
13369 +       mod struct srcu_notifier_head name =                    \
13370 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
13371 +
13372 +#define SRCU_NOTIFIER_HEAD(name)                               \
13373 +       _SRCU_NOTIFIER_HEAD(name, )
13374 +
13375 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13376 +       _SRCU_NOTIFIER_HEAD(name, static)
13377 +
13378  #ifdef __KERNEL__
13379  
13380  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13381 @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret)
13382  
13383  /*
13384   *     Declared notifiers so far. I can imagine quite a few more chains
13385 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13386 + *     over time (eg laptop power reset chains, reboot chain (to clean
13387   *     device units up), device [un]mount chain, module load/unload chain,
13388 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13389 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13390   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13391   */
13392 - 
13393 +
13394  /* CPU notfiers are defined in include/linux/cpu.h. */
13395  
13396  /* netdevice notifiers are defined in include/linux/netdevice.h */
13397 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13398 index caebf2a758dc..53a60a51c758 100644
13399 --- a/include/linux/percpu.h
13400 +++ b/include/linux/percpu.h
13401 @@ -24,6 +24,35 @@
13402          PERCPU_MODULE_RESERVE)
13403  #endif
13404  
13405 +#ifdef CONFIG_PREEMPT_RT_FULL
13406 +
13407 +#define get_local_var(var) (*({                \
13408 +              migrate_disable();       \
13409 +              this_cpu_ptr(&var);      }))
13410 +
13411 +#define put_local_var(var) do {        \
13412 +       (void)&(var);           \
13413 +       migrate_enable();       \
13414 +} while (0)
13415 +
13416 +# define get_local_ptr(var) ({         \
13417 +               migrate_disable();      \
13418 +               this_cpu_ptr(var);      })
13419 +
13420 +# define put_local_ptr(var) do {       \
13421 +       (void)(var);                    \
13422 +       migrate_enable();               \
13423 +} while (0)
13424 +
13425 +#else
13426 +
13427 +#define get_local_var(var)     get_cpu_var(var)
13428 +#define put_local_var(var)     put_cpu_var(var)
13429 +#define get_local_ptr(var)     get_cpu_ptr(var)
13430 +#define put_local_ptr(var)     put_cpu_ptr(var)
13431 +
13432 +#endif
13433 +
13434  /* minimum unit size, also is the maximum supported allocation size */
13435  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13436  
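
Sketch of get_local_var()/put_local_var() from the hunk above (the per-CPU counter is made-up): on PREEMPT_RT_FULL the pair maps to migrate_disable()/migrate_enable(), otherwise to get_cpu_var()/put_cpu_var(), so the access is CPU-local either way but remains preemptible on RT.

	static DEFINE_PER_CPU(int, example_count);

	static void example_count_event(void)
	{
		int *p = &get_local_var(example_count);

		(*p)++;
		put_local_var(example_count);
	}
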
13437 diff --git a/include/linux/pid.h b/include/linux/pid.h
13438 index 23705a53abba..2cc64b779f03 100644
13439 --- a/include/linux/pid.h
13440 +++ b/include/linux/pid.h
13441 @@ -2,6 +2,7 @@
13442  #define _LINUX_PID_H
13443  
13444  #include <linux/rcupdate.h>
13445 +#include <linux/atomic.h>
13446  
13447  enum pid_type
13448  {
13449 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13450 index 75e4e30677f1..1cfb1cb72354 100644
13451 --- a/include/linux/preempt.h
13452 +++ b/include/linux/preempt.h
13453 @@ -50,7 +50,11 @@
13454  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13455  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13456  
13457 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13458 +#ifndef CONFIG_PREEMPT_RT_FULL
13459 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13460 +#else
13461 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13462 +#endif
13463  
13464  /* We use the MSB mostly because its available */
13465  #define PREEMPT_NEED_RESCHED   0x80000000
13466 @@ -59,9 +63,15 @@
13467  #include <asm/preempt.h>
13468  
13469  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13470 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13471  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13472                                  | NMI_MASK))
13473 +#ifndef CONFIG_PREEMPT_RT_FULL
13474 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13475 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13476 +#else
13477 +# define softirq_count()       (0UL)
13478 +extern int in_serving_softirq(void);
13479 +#endif
13480  
13481  /*
13482   * Are we doing bottom half or hardware interrupt processing?
13483 @@ -72,7 +82,6 @@
13484  #define in_irq()               (hardirq_count())
13485  #define in_softirq()           (softirq_count())
13486  #define in_interrupt()         (irq_count())
13487 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13488  
13489  /*
13490   * Are we in NMI context?
13491 @@ -91,7 +100,11 @@
13492  /*
13493   * The preempt_count offset after spin_lock()
13494   */
13495 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13496  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13497 +#else
13498 +#define PREEMPT_LOCK_OFFSET    0
13499 +#endif
13500  
13501  /*
13502   * The preempt_count offset needed for things like:
13503 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
13504  #define preempt_count_inc() preempt_count_add(1)
13505  #define preempt_count_dec() preempt_count_sub(1)
13506  
13507 +#ifdef CONFIG_PREEMPT_LAZY
13508 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13509 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13510 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13511 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13512 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13513 +#else
13514 +#define add_preempt_lazy_count(val)    do { } while (0)
13515 +#define sub_preempt_lazy_count(val)    do { } while (0)
13516 +#define inc_preempt_lazy_count()       do { } while (0)
13517 +#define dec_preempt_lazy_count()       do { } while (0)
13518 +#define preempt_lazy_count()           (0)
13519 +#endif
13520 +
13521  #ifdef CONFIG_PREEMPT_COUNT
13522  
13523  #define preempt_disable() \
13524 @@ -148,13 +175,25 @@ do { \
13525         barrier(); \
13526  } while (0)
13527  
13528 +#define preempt_lazy_disable() \
13529 +do { \
13530 +       inc_preempt_lazy_count(); \
13531 +       barrier(); \
13532 +} while (0)
13533 +
13534  #define sched_preempt_enable_no_resched() \
13535  do { \
13536         barrier(); \
13537         preempt_count_dec(); \
13538  } while (0)
13539  
13540 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13541 +#ifdef CONFIG_PREEMPT_RT_BASE
13542 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13543 +# define preempt_check_resched_rt() preempt_check_resched()
13544 +#else
13545 +# define preempt_enable_no_resched() preempt_enable()
13546 +# define preempt_check_resched_rt() barrier();
13547 +#endif
13548  
13549  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13550  
13551 @@ -179,6 +218,13 @@ do { \
13552                 __preempt_schedule(); \
13553  } while (0)
13554  
13555 +#define preempt_lazy_enable() \
13556 +do { \
13557 +       dec_preempt_lazy_count(); \
13558 +       barrier(); \
13559 +       preempt_check_resched(); \
13560 +} while (0)
13561 +
13562  #else /* !CONFIG_PREEMPT */
13563  #define preempt_enable() \
13564  do { \
13565 @@ -224,6 +270,7 @@ do { \
13566  #define preempt_disable_notrace()              barrier()
13567  #define preempt_enable_no_resched_notrace()    barrier()
13568  #define preempt_enable_notrace()               barrier()
13569 +#define preempt_check_resched_rt()             barrier()
13570  #define preemptible()                          0
13571  
13572  #endif /* CONFIG_PREEMPT_COUNT */
13573 @@ -244,10 +291,31 @@ do { \
13574  } while (0)
13575  #define preempt_fold_need_resched() \
13576  do { \
13577 -       if (tif_need_resched()) \
13578 +       if (tif_need_resched_now()) \
13579                 set_preempt_need_resched(); \
13580  } while (0)
13581  
13582 +#ifdef CONFIG_PREEMPT_RT_FULL
13583 +# define preempt_disable_rt()          preempt_disable()
13584 +# define preempt_enable_rt()           preempt_enable()
13585 +# define preempt_disable_nort()                barrier()
13586 +# define preempt_enable_nort()         barrier()
13587 +# ifdef CONFIG_SMP
13588 +   extern void migrate_disable(void);
13589 +   extern void migrate_enable(void);
13590 +# else /* CONFIG_SMP */
13591 +#  define migrate_disable()            barrier()
13592 +#  define migrate_enable()             barrier()
13593 +# endif /* CONFIG_SMP */
13594 +#else
13595 +# define preempt_disable_rt()          barrier()
13596 +# define preempt_enable_rt()           barrier()
13597 +# define preempt_disable_nort()                preempt_disable()
13598 +# define preempt_enable_nort()         preempt_enable()
13599 +# define migrate_disable()             preempt_disable()
13600 +# define migrate_enable()              preempt_enable()
13601 +#endif
13602 +
13603  #ifdef CONFIG_PREEMPT_NOTIFIERS
13604  
13605  struct preempt_notifier;
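
Usage sketch for migrate_disable()/migrate_enable() and the *_nort variants declared above (the statistics variable is made-up): code that only needs to stay on its current CPU, rather than exclude preemption, uses the migrate pair, which falls back to preempt_disable()/preempt_enable() on non-RT kernels.

	static DEFINE_PER_CPU(u64, example_bytes);

	static void example_account(u64 delta)
	{
		migrate_disable();
		this_cpu_add(example_bytes, delta);
		migrate_enable();
	}
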
13606 diff --git a/include/linux/printk.h b/include/linux/printk.h
13607 index 9729565c25ff..9cdca696b718 100644
13608 --- a/include/linux/printk.h
13609 +++ b/include/linux/printk.h
13610 @@ -117,9 +117,11 @@ int no_printk(const char *fmt, ...)
13611  #ifdef CONFIG_EARLY_PRINTK
13612  extern asmlinkage __printf(1, 2)
13613  void early_printk(const char *fmt, ...);
13614 +extern void printk_kill(void);
13615  #else
13616  static inline __printf(1, 2) __cold
13617  void early_printk(const char *s, ...) { }
13618 +static inline void printk_kill(void) { }
13619  #endif
13620  
13621  typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
13622 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
13623 index 5d5174b59802..8ddbd6e15a3c 100644
13624 --- a/include/linux/radix-tree.h
13625 +++ b/include/linux/radix-tree.h
13626 @@ -277,8 +277,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
13627  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
13628                         void ***results, unsigned long *indices,
13629                         unsigned long first_index, unsigned int max_items);
13630 +#ifndef CONFIG_PREEMPT_RT_FULL
13631  int radix_tree_preload(gfp_t gfp_mask);
13632  int radix_tree_maybe_preload(gfp_t gfp_mask);
13633 +#else
13634 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
13635 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
13636 +#endif
13637  void radix_tree_init(void);
13638  void *radix_tree_tag_set(struct radix_tree_root *root,
13639                         unsigned long index, unsigned int tag);
13640 @@ -303,7 +308,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
13641  
13642  static inline void radix_tree_preload_end(void)
13643  {
13644 -       preempt_enable();
13645 +       preempt_enable_nort();
13646  }
13647  
13648  /**
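
The usual preload pattern affected by the hunk above, as a sketch (the tree, lock, index and item are caller-provided in real code): on PREEMPT_RT_FULL radix_tree_preload() becomes a no-op returning 0 and radix_tree_preload_end() only does preempt_enable_nort(), since per-CPU preloading relies on disabled preemption.

	static int example_insert(struct radix_tree_root *root, spinlock_t *lock,
				  unsigned long index, void *item)
	{
		int err = radix_tree_preload(GFP_KERNEL);

		if (err)
			return err;
		spin_lock(lock);
		err = radix_tree_insert(root, index, item);
		spin_unlock(lock);
		radix_tree_preload_end();
		return err;
	}
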
13649 diff --git a/include/linux/random.h b/include/linux/random.h
13650 index a75840c1aa71..1a804361670c 100644
13651 --- a/include/linux/random.h
13652 +++ b/include/linux/random.h
13653 @@ -20,7 +20,7 @@ struct random_ready_callback {
13654  extern void add_device_randomness(const void *, unsigned int);
13655  extern void add_input_randomness(unsigned int type, unsigned int code,
13656                                  unsigned int value);
13657 -extern void add_interrupt_randomness(int irq, int irq_flags);
13658 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
13659  
13660  extern void get_random_bytes(void *buf, int nbytes);
13661  extern int add_random_ready_callback(struct random_ready_callback *rdy);
13662 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
13663 index a5aa7ae671f4..24ddffd25492 100644
13664 --- a/include/linux/rbtree.h
13665 +++ b/include/linux/rbtree.h
13666 @@ -31,7 +31,6 @@
13667  
13668  #include <linux/kernel.h>
13669  #include <linux/stddef.h>
13670 -#include <linux/rcupdate.h>
13671  
13672  struct rb_node {
13673         unsigned long  __rb_parent_color;
13674 @@ -86,14 +85,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
13675         *rb_link = node;
13676  }
13677  
13678 -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13679 -                                   struct rb_node **rb_link)
13680 -{
13681 -       node->__rb_parent_color = (unsigned long)parent;
13682 -       node->rb_left = node->rb_right = NULL;
13683 -
13684 -       rcu_assign_pointer(*rb_link, node);
13685 -}
13686 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13687 +                     struct rb_node **rb_link);
13688  
13689  #define rb_entry_safe(ptr, type, member) \
13690         ({ typeof(ptr) ____ptr = (ptr); \
13691 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
13692 index a0189ba67fde..c2f5f955163d 100644
13693 --- a/include/linux/rcupdate.h
13694 +++ b/include/linux/rcupdate.h
13695 @@ -169,6 +169,9 @@ void call_rcu(struct rcu_head *head,
13696  
13697  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13698  
13699 +#ifdef CONFIG_PREEMPT_RT_FULL
13700 +#define call_rcu_bh    call_rcu
13701 +#else
13702  /**
13703   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
13704   * @head: structure to be used for queueing the RCU updates.
13705 @@ -192,6 +195,7 @@ void call_rcu(struct rcu_head *head,
13706   */
13707  void call_rcu_bh(struct rcu_head *head,
13708                  rcu_callback_t func);
13709 +#endif
13710  
13711  /**
13712   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
13713 @@ -292,6 +296,11 @@ void synchronize_rcu(void);
13714   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13715   */
13716  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13717 +#ifndef CONFIG_PREEMPT_RT_FULL
13718 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13719 +#else
13720 +static inline int sched_rcu_preempt_depth(void) { return 0; }
13721 +#endif
13722  
13723  #else /* #ifdef CONFIG_PREEMPT_RCU */
13724  
13725 @@ -317,6 +326,8 @@ static inline int rcu_preempt_depth(void)
13726         return 0;
13727  }
13728  
13729 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13730 +
13731  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13732  
13733  /* Internal to kernel */
13734 @@ -489,7 +500,14 @@ extern struct lockdep_map rcu_callback_map;
13735  int debug_lockdep_rcu_enabled(void);
13736  
13737  int rcu_read_lock_held(void);
13738 +#ifdef CONFIG_PREEMPT_RT_FULL
13739 +static inline int rcu_read_lock_bh_held(void)
13740 +{
13741 +       return rcu_read_lock_held();
13742 +}
13743 +#else
13744  int rcu_read_lock_bh_held(void);
13745 +#endif
13746  
13747  /**
13748   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
13749 @@ -937,10 +955,14 @@ static inline void rcu_read_unlock(void)
13750  static inline void rcu_read_lock_bh(void)
13751  {
13752         local_bh_disable();
13753 +#ifdef CONFIG_PREEMPT_RT_FULL
13754 +       rcu_read_lock();
13755 +#else
13756         __acquire(RCU_BH);
13757         rcu_lock_acquire(&rcu_bh_lock_map);
13758         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13759                          "rcu_read_lock_bh() used illegally while idle");
13760 +#endif
13761  }
13762  
13763  /*
13764 @@ -950,10 +972,14 @@ static inline void rcu_read_lock_bh(void)
13765   */
13766  static inline void rcu_read_unlock_bh(void)
13767  {
13768 +#ifdef CONFIG_PREEMPT_RT_FULL
13769 +       rcu_read_unlock();
13770 +#else
13771         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13772                          "rcu_read_unlock_bh() used illegally while idle");
13773         rcu_lock_release(&rcu_bh_lock_map);
13774         __release(RCU_BH);
13775 +#endif
13776         local_bh_enable();
13777  }
13778  
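
Reader-side sketch for the rcu_read_lock_bh() change above (struct example_obj, its field and the pointer are made-up): callers are unchanged; on PREEMPT_RT_FULL the _bh read side is local_bh_disable() plus an ordinary rcu_read_lock(), and call_rcu_bh() aliases call_rcu().

	struct example_obj {
		int value;
		struct rcu_head rcu;
	};

	static struct example_obj __rcu *example_ptr;

	static int example_read_value(void)
	{
		struct example_obj *p;
		int val = 0;

		rcu_read_lock_bh();
		p = rcu_dereference_bh(example_ptr);
		if (p)
			val = p->value;
		rcu_read_unlock_bh();
		return val;
	}
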
13779 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
13780 index 60d15a080d7c..436c9e62bfc6 100644
13781 --- a/include/linux/rcutree.h
13782 +++ b/include/linux/rcutree.h
13783 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
13784         rcu_note_context_switch();
13785  }
13786  
13787 +#ifdef CONFIG_PREEMPT_RT_FULL
13788 +# define synchronize_rcu_bh    synchronize_rcu
13789 +#else
13790  void synchronize_rcu_bh(void);
13791 +#endif
13792  void synchronize_sched_expedited(void);
13793  void synchronize_rcu_expedited(void);
13794  
13795 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
13796  }
13797  
13798  void rcu_barrier(void);
13799 +#ifdef CONFIG_PREEMPT_RT_FULL
13800 +# define rcu_barrier_bh                rcu_barrier
13801 +#else
13802  void rcu_barrier_bh(void);
13803 +#endif
13804  void rcu_barrier_sched(void);
13805  unsigned long get_state_synchronize_rcu(void);
13806  void cond_synchronize_rcu(unsigned long oldstate);
13807 @@ -85,12 +93,10 @@ unsigned long rcu_batches_started(void);
13808  unsigned long rcu_batches_started_bh(void);
13809  unsigned long rcu_batches_started_sched(void);
13810  unsigned long rcu_batches_completed(void);
13811 -unsigned long rcu_batches_completed_bh(void);
13812  unsigned long rcu_batches_completed_sched(void);
13813  void show_rcu_gp_kthreads(void);
13814  
13815  void rcu_force_quiescent_state(void);
13816 -void rcu_bh_force_quiescent_state(void);
13817  void rcu_sched_force_quiescent_state(void);
13818  
13819  void rcu_idle_enter(void);
13820 @@ -105,6 +111,14 @@ extern int rcu_scheduler_active __read_mostly;
13821  
13822  bool rcu_is_watching(void);
13823  
13824 +#ifndef CONFIG_PREEMPT_RT_FULL
13825 +void rcu_bh_force_quiescent_state(void);
13826 +unsigned long rcu_batches_completed_bh(void);
13827 +#else
13828 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
13829 +# define rcu_batches_completed_bh      rcu_batches_completed
13830 +#endif
13831 +
13832  void rcu_all_qs(void);
13833  
13834  #endif /* __LINUX_RCUTREE_H */
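
Editor's note: the rcupdate.h and rcutree.h hunks above share one pattern — when CONFIG_PREEMPT_RT_FULL is set, the "_bh" flavour of the RCU API is collapsed onto the plain flavour, either with a small static inline wrapper or a plain #define, so existing callers compile unchanged. The following standalone sketch is illustrative only (not kernel code; the demo_* names and the DEMO_CONFIG switch are invented) and shows the same compile-time aliasing technique:

/* Userspace sketch of the "alias the _bh flavour to the plain flavour
 * under a config switch" pattern used in the RCU hunks above.
 * All demo_* names are invented for this example.
 */
#include <stdio.h>

/* Flip this to 0 to build the dedicated _bh implementation instead. */
#define DEMO_CONFIG_PREEMPT_RT_FULL 1

static void demo_sync(void)
{
        puts("plain flavour: demo_sync()");
}

#if DEMO_CONFIG_PREEMPT_RT_FULL
/* RT: the _bh flavour is just another name for the plain one. */
# define demo_sync_bh   demo_sync
#else
/* !RT: a genuinely separate implementation exists. */
static void demo_sync_bh(void)
{
        puts("dedicated _bh flavour: demo_sync_bh()");
}
#endif

int main(void)
{
        /* Callers are written once and never see the #ifdef. */
        demo_sync();
        demo_sync_bh();
        return 0;
}
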
13835 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
13836 index 1abba5ce2a2f..30211c627511 100644
13837 --- a/include/linux/rtmutex.h
13838 +++ b/include/linux/rtmutex.h
13839 @@ -13,11 +13,15 @@
13840  #define __LINUX_RT_MUTEX_H
13841  
13842  #include <linux/linkage.h>
13843 +#include <linux/spinlock_types_raw.h>
13844  #include <linux/rbtree.h>
13845 -#include <linux/spinlock_types.h>
13846  
13847  extern int max_lock_depth; /* for sysctl */
13848  
13849 +#ifdef CONFIG_DEBUG_MUTEXES
13850 +#include <linux/debug_locks.h>
13851 +#endif
13852 +
13853  /**
13854   * The rt_mutex structure
13855   *
13856 @@ -31,8 +35,8 @@ struct rt_mutex {
13857         struct rb_root          waiters;
13858         struct rb_node          *waiters_leftmost;
13859         struct task_struct      *owner;
13860 -#ifdef CONFIG_DEBUG_RT_MUTEXES
13861         int                     save_state;
13862 +#ifdef CONFIG_DEBUG_RT_MUTEXES
13863         const char              *name, *file;
13864         int                     line;
13865         void                    *magic;
13866 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
13867  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
13868  #endif
13869  
13870 +# define rt_mutex_init(mutex)                                  \
13871 +       do {                                                    \
13872 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
13873 +               __rt_mutex_init(mutex, #mutex);                 \
13874 +       } while (0)
13875 +
13876  #ifdef CONFIG_DEBUG_RT_MUTEXES
13877  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
13878         , .name = #mutexname, .file = __FILE__, .line = __LINE__
13879 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
13880   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
13881  #else
13882  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13883 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
13884  # define rt_mutex_debug_task_free(t)                   do { } while (0)
13885  #endif
13886  
13887 -#define __RT_MUTEX_INITIALIZER(mutexname) \
13888 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13889 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
13890 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13891         , .waiters = RB_ROOT \
13892         , .owner = NULL \
13893 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
13894 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13895 +
13896 +#define __RT_MUTEX_INITIALIZER(mutexname) \
13897 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
13898 +
13899 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
13900 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
13901 +       , .save_state = 1 }
13902  
13903  #define DEFINE_RT_MUTEX(mutexname) \
13904         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
13905 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
13906  
13907  extern void rt_mutex_lock(struct rt_mutex *lock);
13908  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
13909 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
13910  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
13911                                struct hrtimer_sleeper *timeout);
13912  
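
Editor's note: the rtmutex.h hunk above splits the initializer into a reusable __RT_MUTEX_INITIALIZER_PLAIN body so that a second variant can append ".save_state = 1" without duplicating the common fields. A minimal standalone sketch of that macro-composition technique (the demo_lock type and field names are invented for illustration):

/* Composing designated-initializer macros, as the
 * __RT_MUTEX_INITIALIZER_PLAIN / _SAVE_STATE hunk above does.
 */
#include <stdio.h>

struct demo_lock {
        int owner;
        int save_state;
        const char *name;
};

/* Common field list, shared by every variant. */
#define DEMO_LOCK_INITIALIZER_PLAIN(lockname) \
        .owner = -1, \
        .name = #lockname

/* Default variant: just wrap the common body in braces. */
#define DEMO_LOCK_INITIALIZER(lockname) \
        { DEMO_LOCK_INITIALIZER_PLAIN(lockname) }

/* Variant that additionally sets save_state, without repeating fields. */
#define DEMO_LOCK_INITIALIZER_SAVE_STATE(lockname) \
        { DEMO_LOCK_INITIALIZER_PLAIN(lockname), .save_state = 1 }

static struct demo_lock a = DEMO_LOCK_INITIALIZER(a);
static struct demo_lock b = DEMO_LOCK_INITIALIZER_SAVE_STATE(b);

int main(void)
{
        printf("%s: save_state=%d\n", a.name, a.save_state);
        printf("%s: save_state=%d\n", b.name, b.save_state);
        return 0;
}
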
13913 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
13914 new file mode 100644
13915 index 000000000000..49ed2d45d3be
13916 --- /dev/null
13917 +++ b/include/linux/rwlock_rt.h
13918 @@ -0,0 +1,99 @@
13919 +#ifndef __LINUX_RWLOCK_RT_H
13920 +#define __LINUX_RWLOCK_RT_H
13921 +
13922 +#ifndef __LINUX_SPINLOCK_H
13923 +#error Do not include directly. Use spinlock.h
13924 +#endif
13925 +
13926 +#define rwlock_init(rwl)                               \
13927 +do {                                                   \
13928 +       static struct lock_class_key __key;             \
13929 +                                                       \
13930 +       rt_mutex_init(&(rwl)->lock);                    \
13931 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
13932 +} while (0)
13933 +
13934 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
13935 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
13936 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
13937 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
13938 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
13939 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
13940 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
13941 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
13942 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
13943 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
13944 +
13945 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
13946 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
13947 +
13948 +#define write_trylock_irqsave(lock, flags)     \
13949 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
13950 +
13951 +#define read_lock_irqsave(lock, flags)                 \
13952 +       do {                                            \
13953 +               typecheck(unsigned long, flags);        \
13954 +               flags = rt_read_lock_irqsave(lock);     \
13955 +       } while (0)
13956 +
13957 +#define write_lock_irqsave(lock, flags)                        \
13958 +       do {                                            \
13959 +               typecheck(unsigned long, flags);        \
13960 +               flags = rt_write_lock_irqsave(lock);    \
13961 +       } while (0)
13962 +
13963 +#define read_lock(lock)                rt_read_lock(lock)
13964 +
13965 +#define read_lock_bh(lock)                             \
13966 +       do {                                            \
13967 +               local_bh_disable();                     \
13968 +               rt_read_lock(lock);                     \
13969 +       } while (0)
13970 +
13971 +#define read_lock_irq(lock)    read_lock(lock)
13972 +
13973 +#define write_lock(lock)       rt_write_lock(lock)
13974 +
13975 +#define write_lock_bh(lock)                            \
13976 +       do {                                            \
13977 +               local_bh_disable();                     \
13978 +               rt_write_lock(lock);                    \
13979 +       } while (0)
13980 +
13981 +#define write_lock_irq(lock)   write_lock(lock)
13982 +
13983 +#define read_unlock(lock)      rt_read_unlock(lock)
13984 +
13985 +#define read_unlock_bh(lock)                           \
13986 +       do {                                            \
13987 +               rt_read_unlock(lock);                   \
13988 +               local_bh_enable();                      \
13989 +       } while (0)
13990 +
13991 +#define read_unlock_irq(lock)  read_unlock(lock)
13992 +
13993 +#define write_unlock(lock)     rt_write_unlock(lock)
13994 +
13995 +#define write_unlock_bh(lock)                          \
13996 +       do {                                            \
13997 +               rt_write_unlock(lock);                  \
13998 +               local_bh_enable();                      \
13999 +       } while (0)
14000 +
14001 +#define write_unlock_irq(lock) write_unlock(lock)
14002 +
14003 +#define read_unlock_irqrestore(lock, flags)            \
14004 +       do {                                            \
14005 +               typecheck(unsigned long, flags);        \
14006 +               (void) flags;                           \
14007 +               rt_read_unlock(lock);                   \
14008 +       } while (0)
14009 +
14010 +#define write_unlock_irqrestore(lock, flags) \
14011 +       do {                                            \
14012 +               typecheck(unsigned long, flags);        \
14013 +               (void) flags;                           \
14014 +               rt_write_unlock(lock);                  \
14015 +       } while (0)
14016 +
14017 +#endif
14018 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14019 index cc0072e93e36..d0da966ad7a0 100644
14020 --- a/include/linux/rwlock_types.h
14021 +++ b/include/linux/rwlock_types.h
14022 @@ -1,6 +1,10 @@
14023  #ifndef __LINUX_RWLOCK_TYPES_H
14024  #define __LINUX_RWLOCK_TYPES_H
14025  
14026 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14027 +# error "Do not include directly, include spinlock_types.h"
14028 +#endif
14029 +
14030  /*
14031   * include/linux/rwlock_types.h - generic rwlock type definitions
14032   *                               and initializers
14033 @@ -43,6 +47,7 @@ typedef struct {
14034                                 RW_DEP_MAP_INIT(lockname) }
14035  #endif
14036  
14037 -#define DEFINE_RWLOCK(x)       rwlock_t x = __RW_LOCK_UNLOCKED(x)
14038 +#define DEFINE_RWLOCK(name) \
14039 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14040  
14041  #endif /* __LINUX_RWLOCK_TYPES_H */
14042 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14043 new file mode 100644
14044 index 000000000000..b13832119591
14045 --- /dev/null
14046 +++ b/include/linux/rwlock_types_rt.h
14047 @@ -0,0 +1,33 @@
14048 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14049 +#define __LINUX_RWLOCK_TYPES_RT_H
14050 +
14051 +#ifndef __LINUX_SPINLOCK_TYPES_H
14052 +#error "Do not include directly. Include spinlock_types.h instead"
14053 +#endif
14054 +
14055 +/*
14056 + * rwlocks - rtmutex which allows single reader recursion
14057 + */
14058 +typedef struct {
14059 +       struct rt_mutex         lock;
14060 +       int                     read_depth;
14061 +       unsigned int            break_lock;
14062 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14063 +       struct lockdep_map      dep_map;
14064 +#endif
14065 +} rwlock_t;
14066 +
14067 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14068 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14069 +#else
14070 +# define RW_DEP_MAP_INIT(lockname)
14071 +#endif
14072 +
14073 +#define __RW_LOCK_UNLOCKED(name) \
14074 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
14075 +         RW_DEP_MAP_INIT(name) }
14076 +
14077 +#define DEFINE_RWLOCK(name) \
14078 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14079 +
14080 +#endif
14081 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14082 index 8f498cdde280..2b2148431f14 100644
14083 --- a/include/linux/rwsem.h
14084 +++ b/include/linux/rwsem.h
14085 @@ -18,6 +18,10 @@
14086  #include <linux/osq_lock.h>
14087  #endif
14088  
14089 +#ifdef CONFIG_PREEMPT_RT_FULL
14090 +#include <linux/rwsem_rt.h>
14091 +#else /* PREEMPT_RT_FULL */
14092 +
14093  struct rw_semaphore;
14094  
14095  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14096 @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
14097  # define up_read_non_owner(sem)                        up_read(sem)
14098  #endif
14099  
14100 +#endif /* !PREEMPT_RT_FULL */
14101 +
14102  #endif /* _LINUX_RWSEM_H */
14103 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14104 new file mode 100644
14105 index 000000000000..f97860b2e2a4
14106 --- /dev/null
14107 +++ b/include/linux/rwsem_rt.h
14108 @@ -0,0 +1,152 @@
14109 +#ifndef _LINUX_RWSEM_RT_H
14110 +#define _LINUX_RWSEM_RT_H
14111 +
14112 +#ifndef _LINUX_RWSEM_H
14113 +#error "Include rwsem.h"
14114 +#endif
14115 +
14116 +/*
14117 + * RW-semaphores are a spinlock plus a reader-depth count.
14118 + *
14119 + * Note that the semantics differ from the usual
14120 + * Linux rw-sems: in PREEMPT_RT mode we do not allow
14121 + * multiple readers to hold the lock at once; we only allow
14122 + * a read-lock owner to read-lock recursively. This is
14123 + * better for latency, makes the implementation inherently
14124 + * fair, and keeps it simpler as well.
14125 + */
14126 +
14127 +#include <linux/rtmutex.h>
14128 +
14129 +struct rw_semaphore {
14130 +       struct rt_mutex         lock;
14131 +       int                     read_depth;
14132 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14133 +       struct lockdep_map      dep_map;
14134 +#endif
14135 +};
14136 +
14137 +#define __RWSEM_INITIALIZER(name) \
14138 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
14139 +         RW_DEP_MAP_INIT(name) }
14140 +
14141 +#define DECLARE_RWSEM(lockname) \
14142 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14143 +
14144 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14145 +                                    struct lock_class_key *key);
14146 +
14147 +#define __rt_init_rwsem(sem, name, key)                        \
14148 +       do {                                            \
14149 +               rt_mutex_init(&(sem)->lock);            \
14150 +               __rt_rwsem_init((sem), (name), (key));\
14151 +       } while (0)
14152 +
14153 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
14154 +
14155 +# define rt_init_rwsem(sem)                            \
14156 +do {                                                   \
14157 +       static struct lock_class_key __key;             \
14158 +                                                       \
14159 +       __rt_init_rwsem((sem), #sem, &__key);           \
14160 +} while (0)
14161 +
14162 +extern void rt_down_write(struct rw_semaphore *rwsem);
14163 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
14164 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
14165 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
14166 +                                     struct lockdep_map *nest);
14167 +extern void rt__down_read(struct rw_semaphore *rwsem);
14168 +extern void rt_down_read(struct rw_semaphore *rwsem);
14169 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
14170 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
14171 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
14172 +extern void __rt_up_read(struct rw_semaphore *rwsem);
14173 +extern void rt_up_read(struct rw_semaphore *rwsem);
14174 +extern void rt_up_write(struct rw_semaphore *rwsem);
14175 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
14176 +
14177 +#define init_rwsem(sem)                rt_init_rwsem(sem)
14178 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
14179 +
14180 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14181 +{
14182 +       /* rt_mutex_has_waiters() */
14183 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
14184 +}
14185 +
14186 +static inline void __down_read(struct rw_semaphore *sem)
14187 +{
14188 +       rt__down_read(sem);
14189 +}
14190 +
14191 +static inline void down_read(struct rw_semaphore *sem)
14192 +{
14193 +       rt_down_read(sem);
14194 +}
14195 +
14196 +static inline int __down_read_trylock(struct rw_semaphore *sem)
14197 +{
14198 +       return rt__down_read_trylock(sem);
14199 +}
14200 +
14201 +static inline int down_read_trylock(struct rw_semaphore *sem)
14202 +{
14203 +       return rt_down_read_trylock(sem);
14204 +}
14205 +
14206 +static inline void down_write(struct rw_semaphore *sem)
14207 +{
14208 +       rt_down_write(sem);
14209 +}
14210 +
14211 +static inline int down_write_trylock(struct rw_semaphore *sem)
14212 +{
14213 +       return rt_down_write_trylock(sem);
14214 +}
14215 +
14216 +static inline void __up_read(struct rw_semaphore *sem)
14217 +{
14218 +       __rt_up_read(sem);
14219 +}
14220 +
14221 +static inline void up_read(struct rw_semaphore *sem)
14222 +{
14223 +       rt_up_read(sem);
14224 +}
14225 +
14226 +static inline void up_write(struct rw_semaphore *sem)
14227 +{
14228 +       rt_up_write(sem);
14229 +}
14230 +
14231 +static inline void downgrade_write(struct rw_semaphore *sem)
14232 +{
14233 +       rt_downgrade_write(sem);
14234 +}
14235 +
14236 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
14237 +{
14238 +       return rt_down_read_nested(sem, subclass);
14239 +}
14240 +
14241 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
14242 +{
14243 +       rt_down_write_nested(sem, subclass);
14244 +}
14245 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14246 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14247 +               struct rw_semaphore *nest_lock)
14248 +{
14249 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
14250 +}
14251 +
14252 +#else
14253 +
14254 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14255 +               struct rw_semaphore *nest_lock)
14256 +{
14257 +       rt_down_write_nested_lock(sem, NULL);
14258 +}
14259 +#endif
14260 +#endif
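
Editor's note: as the header comment in rwsem_rt.h above explains, the RT rw-semaphore allows only one reader at a time, but that reader may re-acquire the lock recursively. Below is a rough userspace analogue of those semantics, built on a pthread mutex plus a read_depth counter. It is a simplified sketch only: no priority inheritance, no kernel API, and all demo_* names are invented.

/* Single-reader-recursive "rwsem": one underlying mutex plus a recursion
 * counter, so the current reader may read-lock again without deadlocking.
 * As in the kernel code, the owner check is only meaningful for the
 * thread that set it.
 */
#include <pthread.h>
#include <stdio.h>

struct demo_rwsem {
        pthread_mutex_t lock;
        pthread_t       owner;
        int             read_depth;
};

static void demo_down_read(struct demo_rwsem *sem)
{
        /* Recursive read-lock by the current owner: just bump the depth. */
        if (sem->read_depth && pthread_equal(sem->owner, pthread_self())) {
                sem->read_depth++;
                return;
        }
        pthread_mutex_lock(&sem->lock);
        sem->owner = pthread_self();
        sem->read_depth = 1;
}

static void demo_up_read(struct demo_rwsem *sem)
{
        if (--sem->read_depth == 0)
                pthread_mutex_unlock(&sem->lock);
}

static void demo_down_write(struct demo_rwsem *sem)
{
        pthread_mutex_lock(&sem->lock);         /* writers always take the mutex */
}

static void demo_up_write(struct demo_rwsem *sem)
{
        pthread_mutex_unlock(&sem->lock);
}

int main(void)
{
        struct demo_rwsem sem = { .lock = PTHREAD_MUTEX_INITIALIZER, .read_depth = 0 };

        demo_down_read(&sem);
        demo_down_read(&sem);           /* recursive read-lock, does not block */
        printf("read_depth=%d\n", sem.read_depth);
        demo_up_read(&sem);
        demo_up_read(&sem);

        demo_down_write(&sem);
        puts("write-locked");
        demo_up_write(&sem);
        return 0;
}
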
14261 diff --git a/include/linux/sched.h b/include/linux/sched.h
14262 index 1c0193baea2a..0570d8e022ec 100644
14263 --- a/include/linux/sched.h
14264 +++ b/include/linux/sched.h
14265 @@ -26,6 +26,7 @@ struct sched_param {
14266  #include <linux/nodemask.h>
14267  #include <linux/mm_types.h>
14268  #include <linux/preempt.h>
14269 +#include <asm/kmap_types.h>
14270  
14271  #include <asm/page.h>
14272  #include <asm/ptrace.h>
14273 @@ -182,8 +183,6 @@ extern void update_cpu_load_nohz(void);
14274  static inline void update_cpu_load_nohz(void) { }
14275  #endif
14276  
14277 -extern unsigned long get_parent_ip(unsigned long addr);
14278 -
14279  extern void dump_cpu_task(int cpu);
14280  
14281  struct seq_file;
14282 @@ -242,10 +241,7 @@ extern char ___assert_task_state[1 - 2*!!(
14283                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
14284                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
14285  
14286 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
14287  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
14288 -#define task_is_stopped_or_traced(task)        \
14289 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14290  #define task_contributes_to_load(task) \
14291                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14292                                  (task->flags & PF_FROZEN) == 0 && \
14293 @@ -311,6 +307,11 @@ extern char ___assert_task_state[1 - 2*!!(
14294  
14295  #endif
14296  
14297 +#define __set_current_state_no_track(state_value)      \
14298 +       do { current->state = (state_value); } while (0)
14299 +#define set_current_state_no_track(state_value)                \
14300 +       set_mb(current->state, (state_value))
14301 +
14302  /* Task command name length */
14303  #define TASK_COMM_LEN 16
14304  
14305 @@ -970,8 +971,18 @@ struct wake_q_head {
14306         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
14307  
14308  extern void wake_q_add(struct wake_q_head *head,
14309 -                      struct task_struct *task);
14310 -extern void wake_up_q(struct wake_q_head *head);
14311 +                             struct task_struct *task);
14312 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14313 +
14314 +static inline void wake_up_q(struct wake_q_head *head)
14315 +{
14316 +       __wake_up_q(head, false);
14317 +}
14318 +
14319 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14320 +{
14321 +       __wake_up_q(head, true);
14322 +}
14323  
14324  /*
14325   * sched-domains (multiprocessor balancing) declarations:
14326 @@ -1379,6 +1390,7 @@ struct tlbflush_unmap_batch {
14327  
14328  struct task_struct {
14329         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
14330 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
14331         void *stack;
14332         atomic_t usage;
14333         unsigned int flags;     /* per process flags, defined below */
14334 @@ -1415,6 +1427,12 @@ struct task_struct {
14335  #endif
14336  
14337         unsigned int policy;
14338 +#ifdef CONFIG_PREEMPT_RT_FULL
14339 +       int migrate_disable;
14340 +# ifdef CONFIG_SCHED_DEBUG
14341 +       int migrate_disable_atomic;
14342 +# endif
14343 +#endif
14344         int nr_cpus_allowed;
14345         cpumask_t cpus_allowed;
14346  
14347 @@ -1522,11 +1540,14 @@ struct task_struct {
14348         cputime_t gtime;
14349         struct prev_cputime prev_cputime;
14350  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14351 -       seqlock_t vtime_seqlock;
14352 +       seqcount_t vtime_seqcount;
14353         unsigned long long vtime_snap;
14354         enum {
14355 -               VTIME_SLEEPING = 0,
14356 +               /* Task is sleeping or running in a CPU with VTIME inactive */
14357 +               VTIME_INACTIVE = 0,
14358 +               /* Task runs in userspace in a CPU with VTIME active */
14359                 VTIME_USER,
14360 +               /* Task runs in kernelspace in a CPU with VTIME active */
14361                 VTIME_SYS,
14362         } vtime_snap_whence;
14363  #endif
14364 @@ -1538,6 +1559,9 @@ struct task_struct {
14365  
14366         struct task_cputime cputime_expires;
14367         struct list_head cpu_timers[3];
14368 +#ifdef CONFIG_PREEMPT_RT_BASE
14369 +       struct task_struct *posix_timer_list;
14370 +#endif
14371  
14372  /* process credentials */
14373         const struct cred __rcu *real_cred; /* objective and real subjective task
14374 @@ -1568,10 +1592,15 @@ struct task_struct {
14375  /* signal handlers */
14376         struct signal_struct *signal;
14377         struct sighand_struct *sighand;
14378 +       struct sigqueue *sigqueue_cache;
14379  
14380         sigset_t blocked, real_blocked;
14381         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
14382         struct sigpending pending;
14383 +#ifdef CONFIG_PREEMPT_RT_FULL
14384 +       /* TODO: move me into ->restart_block ? */
14385 +       struct siginfo forced_info;
14386 +#endif
14387  
14388         unsigned long sas_ss_sp;
14389         size_t sas_ss_size;
14390 @@ -1795,6 +1824,12 @@ struct task_struct {
14391         unsigned long trace;
14392         /* bitmask and counter of trace recursion */
14393         unsigned long trace_recursion;
14394 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
14395 +       u64 preempt_timestamp_hist;
14396 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
14397 +       long timer_offset;
14398 +#endif
14399 +#endif
14400  #endif /* CONFIG_TRACING */
14401  #ifdef CONFIG_MEMCG
14402         struct mem_cgroup *memcg_in_oom;
14403 @@ -1811,9 +1846,23 @@ struct task_struct {
14404         unsigned int    sequential_io;
14405         unsigned int    sequential_io_avg;
14406  #endif
14407 +#ifdef CONFIG_PREEMPT_RT_BASE
14408 +       struct rcu_head put_rcu;
14409 +       int softirq_nestcnt;
14410 +       unsigned int softirqs_raised;
14411 +#endif
14412 +#ifdef CONFIG_PREEMPT_RT_FULL
14413 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14414 +       int kmap_idx;
14415 +       pte_t kmap_pte[KM_TYPE_NR];
14416 +# endif
14417 +#endif
14418  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14419         unsigned long   task_state_change;
14420  #endif
14421 +#ifdef CONFIG_PREEMPT_RT_FULL
14422 +       int xmit_recursion;
14423 +#endif
14424         int pagefault_disabled;
14425  /* CPU-specific state of this task */
14426         struct thread_struct thread;
14427 @@ -1831,9 +1880,6 @@ extern int arch_task_struct_size __read_mostly;
14428  # define arch_task_struct_size (sizeof(struct task_struct))
14429  #endif
14430  
14431 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
14432 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
14433 -
14434  #define TNF_MIGRATED   0x01
14435  #define TNF_NO_GROUP   0x02
14436  #define TNF_SHARED     0x04
14437 @@ -2023,6 +2069,15 @@ extern struct pid *cad_pid;
14438  extern void free_task(struct task_struct *tsk);
14439  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14440  
14441 +#ifdef CONFIG_PREEMPT_RT_BASE
14442 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14443 +
14444 +static inline void put_task_struct(struct task_struct *t)
14445 +{
14446 +       if (atomic_dec_and_test(&t->usage))
14447 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14448 +}
14449 +#else
14450  extern void __put_task_struct(struct task_struct *t);
14451  
14452  static inline void put_task_struct(struct task_struct *t)
14453 @@ -2030,6 +2085,7 @@ static inline void put_task_struct(struct task_struct *t)
14454         if (atomic_dec_and_test(&t->usage))
14455                 __put_task_struct(t);
14456  }
14457 +#endif
14458  
14459  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14460  extern void task_cputime(struct task_struct *t,
14461 @@ -2068,6 +2124,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
14462  /*
14463   * Per process flags
14464   */
14465 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
14466  #define PF_EXITING     0x00000004      /* getting shut down */
14467  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
14468  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
14469 @@ -2232,6 +2289,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
14470  
14471  extern int set_cpus_allowed_ptr(struct task_struct *p,
14472                                 const struct cpumask *new_mask);
14473 +int migrate_me(void);
14474 +void tell_sched_cpu_down_begin(int cpu);
14475 +void tell_sched_cpu_down_done(int cpu);
14476 +
14477  #else
14478  static inline void do_set_cpus_allowed(struct task_struct *p,
14479                                       const struct cpumask *new_mask)
14480 @@ -2244,6 +2305,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
14481                 return -EINVAL;
14482         return 0;
14483  }
14484 +static inline int migrate_me(void) { return 0; }
14485 +static inline void tell_sched_cpu_down_begin(int cpu) { }
14486 +static inline void tell_sched_cpu_down_done(int cpu) { }
14487  #endif
14488  
14489  #ifdef CONFIG_NO_HZ_COMMON
14490 @@ -2453,6 +2517,7 @@ extern void xtime_update(unsigned long ticks);
14491  
14492  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14493  extern int wake_up_process(struct task_struct *tsk);
14494 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
14495  extern void wake_up_new_task(struct task_struct *tsk);
14496  #ifdef CONFIG_SMP
14497   extern void kick_process(struct task_struct *tsk);
14498 @@ -2576,12 +2641,24 @@ extern struct mm_struct * mm_alloc(void);
14499  
14500  /* mmdrop drops the mm and the page tables */
14501  extern void __mmdrop(struct mm_struct *);
14502 +
14503  static inline void mmdrop(struct mm_struct * mm)
14504  {
14505         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
14506                 __mmdrop(mm);
14507  }
14508  
14509 +#ifdef CONFIG_PREEMPT_RT_BASE
14510 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14511 +static inline void mmdrop_delayed(struct mm_struct *mm)
14512 +{
14513 +       if (atomic_dec_and_test(&mm->mm_count))
14514 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14515 +}
14516 +#else
14517 +# define mmdrop_delayed(mm)    mmdrop(mm)
14518 +#endif
14519 +
14520  /* mmput gets rid of the mappings and all user-space */
14521  extern void mmput(struct mm_struct *);
14522  /* Grab a reference to a task's mm, if it is not already going away */
14523 @@ -2891,6 +2968,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
14524         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14525  }
14526  
14527 +#ifdef CONFIG_PREEMPT_LAZY
14528 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14529 +{
14530 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14531 +}
14532 +
14533 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14534 +{
14535 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14536 +}
14537 +
14538 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14539 +{
14540 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14541 +}
14542 +
14543 +static inline int need_resched_lazy(void)
14544 +{
14545 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14546 +}
14547 +
14548 +static inline int need_resched_now(void)
14549 +{
14550 +       return test_thread_flag(TIF_NEED_RESCHED);
14551 +}
14552 +
14553 +#else
14554 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14555 +static inline int need_resched_lazy(void) { return 0; }
14556 +
14557 +static inline int need_resched_now(void)
14558 +{
14559 +       return test_thread_flag(TIF_NEED_RESCHED);
14560 +}
14561 +
14562 +#endif
14563 +
14564  static inline int restart_syscall(void)
14565  {
14566         set_tsk_thread_flag(current, TIF_SIGPENDING);
14567 @@ -2922,6 +3036,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
14568         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
14569  }
14570  
14571 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14572 +{
14573 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14574 +               return true;
14575 +#ifdef CONFIG_PREEMPT_RT_FULL
14576 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14577 +               return true;
14578 +#endif
14579 +       return false;
14580 +}
14581 +
14582 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14583 +{
14584 +       bool traced_stopped;
14585 +
14586 +#ifdef CONFIG_PREEMPT_RT_FULL
14587 +       unsigned long flags;
14588 +
14589 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
14590 +       traced_stopped = __task_is_stopped_or_traced(task);
14591 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14592 +#else
14593 +       traced_stopped = __task_is_stopped_or_traced(task);
14594 +#endif
14595 +       return traced_stopped;
14596 +}
14597 +
14598 +static inline bool task_is_traced(struct task_struct *task)
14599 +{
14600 +       bool traced = false;
14601 +
14602 +       if (task->state & __TASK_TRACED)
14603 +               return true;
14604 +#ifdef CONFIG_PREEMPT_RT_FULL
14605 +       /* in case the task is sleeping on tasklist_lock */
14606 +       raw_spin_lock_irq(&task->pi_lock);
14607 +       if (task->state & __TASK_TRACED)
14608 +               traced = true;
14609 +       else if (task->saved_state & __TASK_TRACED)
14610 +               traced = true;
14611 +       raw_spin_unlock_irq(&task->pi_lock);
14612 +#endif
14613 +       return traced;
14614 +}
14615 +
14616  /*
14617   * cond_resched() and cond_resched_lock(): latency reduction via
14618   * explicit rescheduling in places that are safe. The return
14619 @@ -2943,12 +3102,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
14620         __cond_resched_lock(lock);                              \
14621  })
14622  
14623 +#ifndef CONFIG_PREEMPT_RT_FULL
14624  extern int __cond_resched_softirq(void);
14625  
14626  #define cond_resched_softirq() ({                                      \
14627         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
14628         __cond_resched_softirq();                                       \
14629  })
14630 +#else
14631 +# define cond_resched_softirq()                cond_resched()
14632 +#endif
14633  
14634  static inline void cond_resched_rcu(void)
14635  {
14636 @@ -3110,6 +3273,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
14637  
14638  #endif /* CONFIG_SMP */
14639  
14640 +static inline int __migrate_disabled(struct task_struct *p)
14641 +{
14642 +#ifdef CONFIG_PREEMPT_RT_FULL
14643 +       return p->migrate_disable;
14644 +#else
14645 +       return 0;
14646 +#endif
14647 +}
14648 +
14649 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
14650 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
14651 +{
14652 +       if (__migrate_disabled(p))
14653 +               return cpumask_of(task_cpu(p));
14654 +
14655 +       return &p->cpus_allowed;
14656 +}
14657 +
14658 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
14659 +{
14660 +       if (__migrate_disabled(p))
14661 +               return 1;
14662 +       return p->nr_cpus_allowed;
14663 +}
14664 +
14665  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
14666  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
14667  
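
Editor's note: the sched.h hunk above turns the tsk_cpus_allowed() macro into an inline helper so that a migrate-disabled task reports exactly one allowed CPU. A toy model of that accessor logic (plain ints instead of cpumasks; all demo_* names are invented):

/* While a task has migration disabled it must be treated as runnable on
 * its current CPU only, regardless of its full affinity mask.
 */
#include <stdio.h>

struct demo_task {
        int cpu;                /* CPU the task currently runs on */
        int nr_cpus_allowed;    /* size of its affinity mask */
        int migrate_disable;    /* migrate_disable() depth */
};

static int demo_tsk_nr_cpus_allowed(const struct demo_task *p)
{
        /* Migrate-disabled tasks are pinned to their current CPU. */
        if (p->migrate_disable)
                return 1;
        return p->nr_cpus_allowed;
}

static int demo_tsk_allowed_cpu(const struct demo_task *p)
{
        /* Analogue of returning cpumask_of(task_cpu(p)). */
        return p->cpu;
}

int main(void)
{
        struct demo_task t = { .cpu = 2, .nr_cpus_allowed = 8 };

        printf("free to migrate: %d CPUs\n", demo_tsk_nr_cpus_allowed(&t));
        t.migrate_disable = 1;
        printf("migrate-disabled: %d CPU (cpu %d)\n",
               demo_tsk_nr_cpus_allowed(&t), demo_tsk_allowed_cpu(&t));
        return 0;
}
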
14668 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
14669 index e0582106ef4f..b14f4d2368aa 100644
14670 --- a/include/linux/seqlock.h
14671 +++ b/include/linux/seqlock.h
14672 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
14673         return __read_seqcount_retry(s, start);
14674  }
14675  
14676 -
14677 -
14678 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14679 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14680  {
14681         s->sequence++;
14682         smp_wmb();
14683  }
14684  
14685 -static inline void raw_write_seqcount_end(seqcount_t *s)
14686 +static inline void raw_write_seqcount_begin(seqcount_t *s)
14687 +{
14688 +       preempt_disable_rt();
14689 +       __raw_write_seqcount_begin(s);
14690 +}
14691 +
14692 +static inline void __raw_write_seqcount_end(seqcount_t *s)
14693  {
14694         smp_wmb();
14695         s->sequence++;
14696  }
14697  
14698 +static inline void raw_write_seqcount_end(seqcount_t *s)
14699 +{
14700 +       __raw_write_seqcount_end(s);
14701 +       preempt_enable_rt();
14702 +}
14703 +
14704  /**
14705   * raw_write_seqcount_barrier - do a seq write barrier
14706   * @s: pointer to seqcount_t
14707 @@ -425,10 +435,32 @@ typedef struct {
14708  /*
14709   * Read side functions for starting and finalizing a read side section.
14710   */
14711 +#ifndef CONFIG_PREEMPT_RT_FULL
14712  static inline unsigned read_seqbegin(const seqlock_t *sl)
14713  {
14714         return read_seqcount_begin(&sl->seqcount);
14715  }
14716 +#else
14717 +/*
14718 + * Starvation safe read side for RT
14719 + */
14720 +static inline unsigned read_seqbegin(seqlock_t *sl)
14721 +{
14722 +       unsigned ret;
14723 +
14724 +repeat:
14725 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
14726 +       if (unlikely(ret & 1)) {
14727 +               /*
14728 +                * Take the lock and let the writer proceed (i.e. possibly
14729 +                * boost it), otherwise we could loop here forever.
14730 +                */
14731 +               spin_unlock_wait(&sl->lock);
14732 +               goto repeat;
14733 +       }
14734 +       return ret;
14735 +}
14736 +#endif
14737  
14738  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14739  {
14740 @@ -443,36 +475,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14741  static inline void write_seqlock(seqlock_t *sl)
14742  {
14743         spin_lock(&sl->lock);
14744 -       write_seqcount_begin(&sl->seqcount);
14745 +       __raw_write_seqcount_begin(&sl->seqcount);
14746  }
14747  
14748  static inline void write_sequnlock(seqlock_t *sl)
14749  {
14750 -       write_seqcount_end(&sl->seqcount);
14751 +       __raw_write_seqcount_end(&sl->seqcount);
14752         spin_unlock(&sl->lock);
14753  }
14754  
14755  static inline void write_seqlock_bh(seqlock_t *sl)
14756  {
14757         spin_lock_bh(&sl->lock);
14758 -       write_seqcount_begin(&sl->seqcount);
14759 +       __raw_write_seqcount_begin(&sl->seqcount);
14760  }
14761  
14762  static inline void write_sequnlock_bh(seqlock_t *sl)
14763  {
14764 -       write_seqcount_end(&sl->seqcount);
14765 +       __raw_write_seqcount_end(&sl->seqcount);
14766         spin_unlock_bh(&sl->lock);
14767  }
14768  
14769  static inline void write_seqlock_irq(seqlock_t *sl)
14770  {
14771         spin_lock_irq(&sl->lock);
14772 -       write_seqcount_begin(&sl->seqcount);
14773 +       __raw_write_seqcount_begin(&sl->seqcount);
14774  }
14775  
14776  static inline void write_sequnlock_irq(seqlock_t *sl)
14777  {
14778 -       write_seqcount_end(&sl->seqcount);
14779 +       __raw_write_seqcount_end(&sl->seqcount);
14780         spin_unlock_irq(&sl->lock);
14781  }
14782  
14783 @@ -481,7 +513,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14784         unsigned long flags;
14785  
14786         spin_lock_irqsave(&sl->lock, flags);
14787 -       write_seqcount_begin(&sl->seqcount);
14788 +       __raw_write_seqcount_begin(&sl->seqcount);
14789         return flags;
14790  }
14791  
14792 @@ -491,7 +523,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14793  static inline void
14794  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
14795  {
14796 -       write_seqcount_end(&sl->seqcount);
14797 +       __raw_write_seqcount_end(&sl->seqcount);
14798         spin_unlock_irqrestore(&sl->lock, flags);
14799  }
14800  
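
Editor's note: the RT read_seqbegin() above replaces the unbounded spin on an odd (write-in-progress) sequence with a wait on the writer's lock, so the writer can run (and, in the kernel, be priority-boosted) to completion. A userspace sketch of that starvation-safe reader, using C11 atomics and a pthread mutex in place of the seqlock's spinlock; demo_* names are invented and this is not the kernel implementation:

/* Starvation-safe seqlock read side: an odd sequence means a writer is
 * active, so instead of spinning the reader takes and releases the
 * writer's lock, blocking until the writer is done.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct demo_seqlock {
        atomic_uint     sequence;
        pthread_mutex_t lock;
};

static unsigned demo_read_seqbegin(struct demo_seqlock *sl)
{
        unsigned ret;

        for (;;) {
                ret = atomic_load_explicit(&sl->sequence, memory_order_acquire);
                if (!(ret & 1))
                        return ret;             /* no writer in progress */
                /* Writer active: wait on its lock instead of spinning. */
                pthread_mutex_lock(&sl->lock);
                pthread_mutex_unlock(&sl->lock);
        }
}

static int demo_read_seqretry(struct demo_seqlock *sl, unsigned start)
{
        return atomic_load_explicit(&sl->sequence, memory_order_acquire) != start;
}

static void demo_write_seqlock(struct demo_seqlock *sl)
{
        pthread_mutex_lock(&sl->lock);
        atomic_fetch_add_explicit(&sl->sequence, 1, memory_order_release);
}

static void demo_write_sequnlock(struct demo_seqlock *sl)
{
        atomic_fetch_add_explicit(&sl->sequence, 1, memory_order_release);
        pthread_mutex_unlock(&sl->lock);
}

int main(void)
{
        struct demo_seqlock sl = { .sequence = 0, .lock = PTHREAD_MUTEX_INITIALIZER };
        unsigned seq;
        int data = 0, snapshot;

        demo_write_seqlock(&sl);
        data = 42;                              /* protected update */
        demo_write_sequnlock(&sl);

        do {
                seq = demo_read_seqbegin(&sl);
                snapshot = data;                /* read the protected data */
        } while (demo_read_seqretry(&sl, seq));

        printf("snapshot=%d\n", snapshot);
        return 0;
}
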
14801 diff --git a/include/linux/signal.h b/include/linux/signal.h
14802 index d80259afb9e5..ddd1e6866a54 100644
14803 --- a/include/linux/signal.h
14804 +++ b/include/linux/signal.h
14805 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
14806  }
14807  
14808  extern void flush_sigqueue(struct sigpending *queue);
14809 +extern void flush_task_sigqueue(struct task_struct *tsk);
14810  
14811  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14812  static inline int valid_signal(unsigned long sig)
14813 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
14814 index d443d9ab0236..2d1c7f9b7fd0 100644
14815 --- a/include/linux/skbuff.h
14816 +++ b/include/linux/skbuff.h
14817 @@ -203,6 +203,7 @@ struct sk_buff_head {
14818  
14819         __u32           qlen;
14820         spinlock_t      lock;
14821 +       raw_spinlock_t  raw_lock;
14822  };
14823  
14824  struct sk_buff;
14825 @@ -1465,6 +1466,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
14826         __skb_queue_head_init(list);
14827  }
14828  
14829 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14830 +{
14831 +       raw_spin_lock_init(&list->raw_lock);
14832 +       __skb_queue_head_init(list);
14833 +}
14834 +
14835  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14836                 struct lock_class_key *class)
14837  {
14838 diff --git a/include/linux/smp.h b/include/linux/smp.h
14839 index c4414074bd88..e6ab36aeaaab 100644
14840 --- a/include/linux/smp.h
14841 +++ b/include/linux/smp.h
14842 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
14843  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
14844  #define put_cpu()              preempt_enable()
14845  
14846 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
14847 +#define put_cpu_light()                migrate_enable()
14848 +
14849  /*
14850   * Callback to arch code if there's nosmp or maxcpus=0 on the
14851   * boot command line:
14852 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
14853 index 47dd0cebd204..b241cc044bd3 100644
14854 --- a/include/linux/spinlock.h
14855 +++ b/include/linux/spinlock.h
14856 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14857  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
14858  
14859  /* Include rwlock functions */
14860 -#include <linux/rwlock.h>
14861 +#ifdef CONFIG_PREEMPT_RT_FULL
14862 +# include <linux/rwlock_rt.h>
14863 +#else
14864 +# include <linux/rwlock.h>
14865 +#endif
14866  
14867  /*
14868   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
14869 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14870  # include <linux/spinlock_api_up.h>
14871  #endif
14872  
14873 +#ifdef CONFIG_PREEMPT_RT_FULL
14874 +# include <linux/spinlock_rt.h>
14875 +#else /* PREEMPT_RT_FULL */
14876 +
14877  /*
14878   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
14879   */
14880 @@ -416,4 +424,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
14881  #define atomic_dec_and_lock(atomic, lock) \
14882                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
14883  
14884 +#endif /* !PREEMPT_RT_FULL */
14885 +
14886  #endif /* __LINUX_SPINLOCK_H */
14887 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
14888 index 5344268e6e62..043263f30e81 100644
14889 --- a/include/linux/spinlock_api_smp.h
14890 +++ b/include/linux/spinlock_api_smp.h
14891 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
14892         return 0;
14893  }
14894  
14895 -#include <linux/rwlock_api_smp.h>
14896 +#ifndef CONFIG_PREEMPT_RT_FULL
14897 +# include <linux/rwlock_api_smp.h>
14898 +#endif
14899  
14900  #endif /* __LINUX_SPINLOCK_API_SMP_H */
14901 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
14902 new file mode 100644
14903 index 000000000000..3b2825537531
14904 --- /dev/null
14905 +++ b/include/linux/spinlock_rt.h
14906 @@ -0,0 +1,163 @@
14907 +#ifndef __LINUX_SPINLOCK_RT_H
14908 +#define __LINUX_SPINLOCK_RT_H
14909 +
14910 +#ifndef __LINUX_SPINLOCK_H
14911 +#error Do not include directly. Use spinlock.h
14912 +#endif
14913 +
14914 +#include <linux/bug.h>
14915 +
14916 +extern void
14917 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
14918 +
14919 +#define spin_lock_init(slock)                          \
14920 +do {                                                   \
14921 +       static struct lock_class_key __key;             \
14922 +                                                       \
14923 +       rt_mutex_init(&(slock)->lock);                  \
14924 +       __rt_spin_lock_init(slock, #slock, &__key);     \
14925 +} while (0)
14926 +
14927 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
14928 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
14929 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
14930 +
14931 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
14932 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
14933 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
14934 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
14935 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
14936 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
14937 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
14938 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
14939 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
14940 +
14941 +/*
14942 + * lockdep-less calls, for derived types like rwlock
14943 + * (for trylock they can use rt_mutex_trylock() directly).
14944 + */
14945 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
14946 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
14947 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
14948 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
14949 +
14950 +#define spin_lock(lock)                        rt_spin_lock(lock)
14951 +
14952 +#define spin_lock_bh(lock)                     \
14953 +       do {                                    \
14954 +               local_bh_disable();             \
14955 +               rt_spin_lock(lock);             \
14956 +       } while (0)
14957 +
14958 +#define spin_lock_irq(lock)            spin_lock(lock)
14959 +
14960 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
14961 +
14962 +#define spin_trylock(lock)                     \
14963 +({                                             \
14964 +       int __locked;                           \
14965 +       __locked = spin_do_trylock(lock);       \
14966 +       __locked;                               \
14967 +})
14968 +
14969 +#ifdef CONFIG_LOCKDEP
14970 +# define spin_lock_nested(lock, subclass)              \
14971 +       do {                                            \
14972 +               rt_spin_lock_nested(lock, subclass);    \
14973 +       } while (0)
14974 +
14975 +#define spin_lock_bh_nested(lock, subclass)            \
14976 +       do {                                            \
14977 +               local_bh_disable();                     \
14978 +               rt_spin_lock_nested(lock, subclass);    \
14979 +       } while (0)
14980 +
14981 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
14982 +       do {                                             \
14983 +               typecheck(unsigned long, flags);         \
14984 +               flags = 0;                               \
14985 +               rt_spin_lock_nested(lock, subclass);     \
14986 +       } while (0)
14987 +#else
14988 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
14989 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
14990 +
14991 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
14992 +       do {                                             \
14993 +               typecheck(unsigned long, flags);         \
14994 +               flags = 0;                               \
14995 +               spin_lock(lock);                         \
14996 +       } while (0)
14997 +#endif
14998 +
14999 +#define spin_lock_irqsave(lock, flags)                  \
15000 +       do {                                             \
15001 +               typecheck(unsigned long, flags);         \
15002 +               flags = 0;                               \
15003 +               spin_lock(lock);                         \
15004 +       } while (0)
15005 +
15006 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15007 +{
15008 +       unsigned long flags = 0;
15009 +#ifdef CONFIG_TRACE_IRQFLAGS
15010 +       flags = rt_spin_lock_trace_flags(lock);
15011 +#else
15012 +       spin_lock(lock); /* lock_local */
15013 +#endif
15014 +       return flags;
15015 +}
15016 +
15017 +/* FIXME: we need rt_spin_lock_nest_lock */
15018 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15019 +
15020 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15021 +
15022 +#define spin_unlock_bh(lock)                           \
15023 +       do {                                            \
15024 +               rt_spin_unlock(lock);                   \
15025 +               local_bh_enable();                      \
15026 +       } while (0)
15027 +
15028 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15029 +
15030 +#define spin_unlock_irqrestore(lock, flags)            \
15031 +       do {                                            \
15032 +               typecheck(unsigned long, flags);        \
15033 +               (void) flags;                           \
15034 +               spin_unlock(lock);                      \
15035 +       } while (0)
15036 +
15037 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15038 +#define spin_trylock_irq(lock) spin_trylock(lock)
15039 +
15040 +#define spin_trylock_irqsave(lock, flags)      \
15041 +       rt_spin_trylock_irqsave(lock, &(flags))
15042 +
15043 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15044 +
15045 +#ifdef CONFIG_GENERIC_LOCKBREAK
15046 +# define spin_is_contended(lock)       ((lock)->break_lock)
15047 +#else
15048 +# define spin_is_contended(lock)       (((void)(lock), 0))
15049 +#endif
15050 +
15051 +static inline int spin_can_lock(spinlock_t *lock)
15052 +{
15053 +       return !rt_mutex_is_locked(&lock->lock);
15054 +}
15055 +
15056 +static inline int spin_is_locked(spinlock_t *lock)
15057 +{
15058 +       return rt_mutex_is_locked(&lock->lock);
15059 +}
15060 +
15061 +static inline void assert_spin_locked(spinlock_t *lock)
15062 +{
15063 +       BUG_ON(!spin_is_locked(lock));
15064 +}
15065 +
15066 +#define atomic_dec_and_lock(atomic, lock) \
15067 +       atomic_dec_and_spin_lock(atomic, lock)
15068 +
15069 +#endif
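
Editor's note: in spinlock_rt.h above, the *_irqsave variants no longer disable interrupts — a sleeping lock cannot be taken with IRQs off — so the flags argument is type-checked, set to 0, and otherwise ignored. A minimal sketch of that wrapper-macro technique; it mirrors the kernel's typecheck() trick and so needs GCC/Clang extensions (typeof, statement expressions). The demo_* names and the pthread backing lock are invented for illustration:

/* Keep call sites and the flags variable type-checked, but do not touch
 * interrupt state -- the lock itself may sleep.
 */
#include <pthread.h>
#include <stdio.h>

/* Same trick as the kernel's typecheck(): warns on a wrong flags type. */
#define demo_typecheck(type, x)                 \
({      type __dummy;                           \
        typeof(x) __dummy2;                     \
        (void)(&__dummy == &__dummy2);          \
        1;                                      \
})

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

#define demo_spin_lock(l)       pthread_mutex_lock(l)
#define demo_spin_unlock(l)     pthread_mutex_unlock(l)

/* IRQ-flavoured wrappers: flags is validated but carries no state. */
#define demo_spin_lock_irqsave(l, flags)                \
        do {                                            \
                demo_typecheck(unsigned long, flags);   \
                flags = 0;                              \
                demo_spin_lock(l);                      \
        } while (0)

#define demo_spin_unlock_irqrestore(l, flags)           \
        do {                                            \
                demo_typecheck(unsigned long, flags);   \
                (void)flags;                            \
                demo_spin_unlock(l);                    \
        } while (0)

int main(void)
{
        unsigned long flags;

        demo_spin_lock_irqsave(&demo_lock, flags);
        puts("critical section, flags unused");
        demo_spin_unlock_irqrestore(&demo_lock, flags);
        return 0;
}
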
15070 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15071 index 73548eb13a5d..10bac715ea96 100644
15072 --- a/include/linux/spinlock_types.h
15073 +++ b/include/linux/spinlock_types.h
15074 @@ -9,80 +9,15 @@
15075   * Released under the General Public License (GPL).
15076   */
15077  
15078 -#if defined(CONFIG_SMP)
15079 -# include <asm/spinlock_types.h>
15080 -#else
15081 -# include <linux/spinlock_types_up.h>
15082 -#endif
15083 -
15084 -#include <linux/lockdep.h>
15085 -
15086 -typedef struct raw_spinlock {
15087 -       arch_spinlock_t raw_lock;
15088 -#ifdef CONFIG_GENERIC_LOCKBREAK
15089 -       unsigned int break_lock;
15090 -#endif
15091 -#ifdef CONFIG_DEBUG_SPINLOCK
15092 -       unsigned int magic, owner_cpu;
15093 -       void *owner;
15094 -#endif
15095 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15096 -       struct lockdep_map dep_map;
15097 -#endif
15098 -} raw_spinlock_t;
15099 -
15100 -#define SPINLOCK_MAGIC         0xdead4ead
15101 -
15102 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15103 -
15104 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15105 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15106 -#else
15107 -# define SPIN_DEP_MAP_INIT(lockname)
15108 -#endif
15109 +#include <linux/spinlock_types_raw.h>
15110  
15111 -#ifdef CONFIG_DEBUG_SPINLOCK
15112 -# define SPIN_DEBUG_INIT(lockname)             \
15113 -       .magic = SPINLOCK_MAGIC,                \
15114 -       .owner_cpu = -1,                        \
15115 -       .owner = SPINLOCK_OWNER_INIT,
15116 +#ifndef CONFIG_PREEMPT_RT_FULL
15117 +# include <linux/spinlock_types_nort.h>
15118 +# include <linux/rwlock_types.h>
15119  #else
15120 -# define SPIN_DEBUG_INIT(lockname)
15121 +# include <linux/rtmutex.h>
15122 +# include <linux/spinlock_types_rt.h>
15123 +# include <linux/rwlock_types_rt.h>
15124  #endif
15125  
15126 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15127 -       {                                       \
15128 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15129 -       SPIN_DEBUG_INIT(lockname)               \
15130 -       SPIN_DEP_MAP_INIT(lockname) }
15131 -
15132 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15133 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15134 -
15135 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15136 -
15137 -typedef struct spinlock {
15138 -       union {
15139 -               struct raw_spinlock rlock;
15140 -
15141 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15142 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15143 -               struct {
15144 -                       u8 __padding[LOCK_PADSIZE];
15145 -                       struct lockdep_map dep_map;
15146 -               };
15147 -#endif
15148 -       };
15149 -} spinlock_t;
15150 -
15151 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15152 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15153 -
15154 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15155 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15156 -
15157 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15158 -
15159 -#include <linux/rwlock_types.h>
15160 -
15161  #endif /* __LINUX_SPINLOCK_TYPES_H */
15162 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15163 new file mode 100644
15164 index 000000000000..f1dac1fb1d6a
15165 --- /dev/null
15166 +++ b/include/linux/spinlock_types_nort.h
15167 @@ -0,0 +1,33 @@
15168 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15169 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15170 +
15171 +#ifndef __LINUX_SPINLOCK_TYPES_H
15172 +#error "Do not include directly. Include spinlock_types.h instead"
15173 +#endif
15174 +
15175 +/*
15176 + * The non RT version maps spinlocks to raw_spinlocks
15177 + */
15178 +typedef struct spinlock {
15179 +       union {
15180 +               struct raw_spinlock rlock;
15181 +
15182 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15183 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15184 +               struct {
15185 +                       u8 __padding[LOCK_PADSIZE];
15186 +                       struct lockdep_map dep_map;
15187 +               };
15188 +#endif
15189 +       };
15190 +} spinlock_t;
15191 +
15192 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15193 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15194 +
15195 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15196 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15197 +
15198 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15199 +
15200 +#endif
15201 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15202 new file mode 100644
15203 index 000000000000..edffc4d53fc9
15204 --- /dev/null
15205 +++ b/include/linux/spinlock_types_raw.h
15206 @@ -0,0 +1,56 @@
15207 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15208 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15209 +
15210 +#if defined(CONFIG_SMP)
15211 +# include <asm/spinlock_types.h>
15212 +#else
15213 +# include <linux/spinlock_types_up.h>
15214 +#endif
15215 +
15216 +#include <linux/lockdep.h>
15217 +
15218 +typedef struct raw_spinlock {
15219 +       arch_spinlock_t raw_lock;
15220 +#ifdef CONFIG_GENERIC_LOCKBREAK
15221 +       unsigned int break_lock;
15222 +#endif
15223 +#ifdef CONFIG_DEBUG_SPINLOCK
15224 +       unsigned int magic, owner_cpu;
15225 +       void *owner;
15226 +#endif
15227 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15228 +       struct lockdep_map dep_map;
15229 +#endif
15230 +} raw_spinlock_t;
15231 +
15232 +#define SPINLOCK_MAGIC         0xdead4ead
15233 +
15234 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15235 +
15236 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15237 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15238 +#else
15239 +# define SPIN_DEP_MAP_INIT(lockname)
15240 +#endif
15241 +
15242 +#ifdef CONFIG_DEBUG_SPINLOCK
15243 +# define SPIN_DEBUG_INIT(lockname)             \
15244 +       .magic = SPINLOCK_MAGIC,                \
15245 +       .owner_cpu = -1,                        \
15246 +       .owner = SPINLOCK_OWNER_INIT,
15247 +#else
15248 +# define SPIN_DEBUG_INIT(lockname)
15249 +#endif
15250 +
15251 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15252 +       {                                       \
15253 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15254 +       SPIN_DEBUG_INIT(lockname)               \
15255 +       SPIN_DEP_MAP_INIT(lockname) }
15256 +
15257 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15258 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15259 +
15260 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15261 +
15262 +#endif
15263 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15264 new file mode 100644
15265 index 000000000000..9fd431967abc
15266 --- /dev/null
15267 +++ b/include/linux/spinlock_types_rt.h
15268 @@ -0,0 +1,51 @@
15269 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15270 +#define __LINUX_SPINLOCK_TYPES_RT_H
15271 +
15272 +#ifndef __LINUX_SPINLOCK_TYPES_H
15273 +#error "Do not include directly. Include spinlock_types.h instead"
15274 +#endif
15275 +
15276 +#include <linux/cache.h>
15277 +
15278 +/*
15279 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15280 + */
15281 +typedef struct spinlock {
15282 +       struct rt_mutex         lock;
15283 +       unsigned int            break_lock;
15284 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15285 +       struct lockdep_map      dep_map;
15286 +#endif
15287 +} spinlock_t;
15288 +
15289 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15290 +# define __RT_SPIN_INITIALIZER(name) \
15291 +       { \
15292 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15293 +       .save_state = 1, \
15294 +       .file = __FILE__, \
15295 +       .line = __LINE__ , \
15296 +       }
15297 +#else
15298 +# define __RT_SPIN_INITIALIZER(name) \
15299 +       {                                                               \
15300 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15301 +       .save_state = 1, \
15302 +       }
15303 +#endif
15304 +
15305 +/*
15306 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15307 +*/
15308 +
15309 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15310 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15311 +         SPIN_DEP_MAP_INIT(name) }
15312 +
15313 +#define __DEFINE_SPINLOCK(name) \
15314 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15315 +
15316 +#define DEFINE_SPINLOCK(name) \
15317 +       spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
15318 +
15319 +#endif
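For context, a minimal usage sketch (not part of the patch; the lock and counter names are illustrative): with PREEMPT_RT_FULL the spinlock_t defined above wraps an rt_mutex, so callers keep the ordinary spin_lock()/spin_unlock() API while the critical section becomes preemptible and may block under contention instead of spinning.

/* Hypothetical example, assuming a PREEMPT_RT_FULL kernel with this patch applied. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);   /* rt_mutex-backed on RT, arch spinlock otherwise */
static unsigned long example_count;

static void example_update(void)
{
        spin_lock(&example_lock);       /* may sleep under contention on RT */
        example_count++;
        spin_unlock(&example_lock);
}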
15320 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
15321 index f5f80c5643ac..ec1a8f01563c 100644
15322 --- a/include/linux/srcu.h
15323 +++ b/include/linux/srcu.h
15324 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
15325  
15326  void process_srcu(struct work_struct *work);
15327  
15328 -#define __SRCU_STRUCT_INIT(name)                                       \
15329 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15330         {                                                               \
15331                 .completed = -300,                                      \
15332 -               .per_cpu_ref = &name##_srcu_array,                      \
15333 +               .per_cpu_ref = &pcpu_name,                              \
15334                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
15335                 .running = false,                                       \
15336                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
15337 @@ -104,7 +104,7 @@ void process_srcu(struct work_struct *work);
15338   */
15339  #define __DEFINE_SRCU(name, is_static)                                 \
15340         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
15341 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15342 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
15343  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15344  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
15345  
15346 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
15347 index 8b6ec7ef0854..9b77d4cc929f 100644
15348 --- a/include/linux/suspend.h
15349 +++ b/include/linux/suspend.h
15350 @@ -194,6 +194,12 @@ struct platform_freeze_ops {
15351         void (*end)(void);
15352  };
15353  
15354 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15355 +extern bool pm_in_action;
15356 +#else
15357 +# define pm_in_action false
15358 +#endif
15359 +
15360  #ifdef CONFIG_SUSPEND
15361  /**
15362   * suspend_set_ops - set platform dependent suspend operations
15363 diff --git a/include/linux/swait.h b/include/linux/swait.h
15364 new file mode 100644
15365 index 000000000000..83f004a72320
15366 --- /dev/null
15367 +++ b/include/linux/swait.h
15368 @@ -0,0 +1,173 @@
15369 +#ifndef _LINUX_SWAIT_H
15370 +#define _LINUX_SWAIT_H
15371 +
15372 +#include <linux/list.h>
15373 +#include <linux/stddef.h>
15374 +#include <linux/spinlock.h>
15375 +#include <asm/current.h>
15376 +
15377 +/*
15378 + * Simple wait queues
15379 + *
15380 + * While these are very similar to the other/complex wait queues (wait.h) the
15381 + * most important difference is that the simple waitqueue allows for
15382 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15383 + * times.
15384 + *
15385 + * In order to make this so, we had to drop a fair number of features of the
15386 + * other waitqueue code; notably:
15387 + *
15388 + *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
15389 + *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
15390 + *    sleeper state.
15391 + *
15392 + *  - the exclusive mode; because this requires preserving the list order
15393 + *    and this is hard.
15394 + *
15395 + *  - custom wake functions; because you cannot give any guarantees about
15396 + *    random code.
15397 + *
15398 + * As a side effect of this; the data structures are slimmer.
15399 + *
15400 + * One would recommend using this wait queue where possible.
15401 + */
15402 +
15403 +struct task_struct;
15404 +
15405 +struct swait_queue_head {
15406 +       raw_spinlock_t          lock;
15407 +       struct list_head        task_list;
15408 +};
15409 +
15410 +struct swait_queue {
15411 +       struct task_struct      *task;
15412 +       struct list_head        task_list;
15413 +};
15414 +
15415 +#define __SWAITQUEUE_INITIALIZER(name) {                               \
15416 +       .task           = current,                                      \
15417 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15418 +}
15419 +
15420 +#define DECLARE_SWAITQUEUE(name)                                       \
15421 +       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
15422 +
15423 +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
15424 +       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
15425 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15426 +}
15427 +
15428 +#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
15429 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
15430 +
15431 +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15432 +                                   struct lock_class_key *key);
15433 +
15434 +#define init_swait_queue_head(q)                               \
15435 +       do {                                                    \
15436 +               static struct lock_class_key __key;             \
15437 +               __init_swait_queue_head((q), #q, &__key);       \
15438 +       } while (0)
15439 +
15440 +#ifdef CONFIG_LOCKDEP
15441 +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
15442 +       ({ init_swait_queue_head(&name); name; })
15443 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15444 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
15445 +#else
15446 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15447 +       DECLARE_SWAIT_QUEUE_HEAD(name)
15448 +#endif
15449 +
15450 +static inline int swait_active(struct swait_queue_head *q)
15451 +{
15452 +       return !list_empty(&q->task_list);
15453 +}
15454 +
15455 +extern void swake_up(struct swait_queue_head *q);
15456 +extern void swake_up_all(struct swait_queue_head *q);
15457 +extern void swake_up_locked(struct swait_queue_head *q);
15458 +extern void swake_up_all_locked(struct swait_queue_head *q);
15459 +
15460 +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15461 +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15462 +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
15463 +
15464 +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15465 +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15466 +
15467 +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
15468 +#define ___swait_event(wq, condition, state, ret, cmd)                 \
15469 +({                                                                     \
15470 +       struct swait_queue __wait;                                      \
15471 +       long __ret = ret;                                               \
15472 +                                                                       \
15473 +       INIT_LIST_HEAD(&__wait.task_list);                              \
15474 +       for (;;) {                                                      \
15475 +               long __int = prepare_to_swait_event(&wq, &__wait, state);\
15476 +                                                                       \
15477 +               if (condition)                                          \
15478 +                       break;                                          \
15479 +                                                                       \
15480 +               if (___wait_is_interruptible(state) && __int) {         \
15481 +                       __ret = __int;                                  \
15482 +                       break;                                          \
15483 +               }                                                       \
15484 +                                                                       \
15485 +               cmd;                                                    \
15486 +       }                                                               \
15487 +       finish_swait(&wq, &__wait);                                     \
15488 +       __ret;                                                          \
15489 +})
15490 +
15491 +#define __swait_event(wq, condition)                                   \
15492 +       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
15493 +                           schedule())
15494 +
15495 +#define swait_event(wq, condition)                                     \
15496 +do {                                                                   \
15497 +       if (condition)                                                  \
15498 +               break;                                                  \
15499 +       __swait_event(wq, condition);                                   \
15500 +} while (0)
15501 +
15502 +#define __swait_event_timeout(wq, condition, timeout)                  \
15503 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15504 +                     TASK_UNINTERRUPTIBLE, timeout,                    \
15505 +                     __ret = schedule_timeout(__ret))
15506 +
15507 +#define swait_event_timeout(wq, condition, timeout)                    \
15508 +({                                                                     \
15509 +       long __ret = timeout;                                           \
15510 +       if (!___wait_cond_timeout(condition))                           \
15511 +               __ret = __swait_event_timeout(wq, condition, timeout);  \
15512 +       __ret;                                                          \
15513 +})
15514 +
15515 +#define __swait_event_interruptible(wq, condition)                     \
15516 +       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
15517 +                     schedule())
15518 +
15519 +#define swait_event_interruptible(wq, condition)                       \
15520 +({                                                                     \
15521 +       int __ret = 0;                                                  \
15522 +       if (!(condition))                                               \
15523 +               __ret = __swait_event_interruptible(wq, condition);     \
15524 +       __ret;                                                          \
15525 +})
15526 +
15527 +#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
15528 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15529 +                     TASK_INTERRUPTIBLE, timeout,                      \
15530 +                     __ret = schedule_timeout(__ret))
15531 +
15532 +#define swait_event_interruptible_timeout(wq, condition, timeout)      \
15533 +({                                                                     \
15534 +       long __ret = timeout;                                           \
15535 +       if (!___wait_cond_timeout(condition))                           \
15536 +               __ret = __swait_event_interruptible_timeout(wq,         \
15537 +                                               condition, timeout);    \
15538 +       __ret;                                                          \
15539 +})
15540 +
15541 +#endif /* _LINUX_SWAIT_H */
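A brief usage sketch of the simple waitqueue API declared above (hypothetical module code, not part of the patch; done_wq and done are illustrative names). A waiter sleeps on a condition with swait_event_interruptible(); the waker sets the condition and calls swake_up(), which wakes one sleeper (swake_up_all() wakes them all). Because the head uses a raw spinlock and all wakeups are TASK_NORMAL, the waker side keeps the bounded lock hold times described in the comment above.

#include <linux/compiler.h>
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(done_wq);
static bool done;

/* waiter side: sleeps until the condition becomes true or a signal arrives */
static int wait_for_done(void)
{
        return swait_event_interruptible(done_wq, READ_ONCE(done));
}

/* waker side: publish the condition, then wake a sleeper */
static void mark_done(void)
{
        WRITE_ONCE(done, true);
        swake_up(&done_wq);
}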
15542 diff --git a/include/linux/swap.h b/include/linux/swap.h
15543 index 7ba7dccaf0e7..da646f2eb3c6 100644
15544 --- a/include/linux/swap.h
15545 +++ b/include/linux/swap.h
15546 @@ -11,6 +11,7 @@
15547  #include <linux/fs.h>
15548  #include <linux/atomic.h>
15549  #include <linux/page-flags.h>
15550 +#include <linux/locallock.h>
15551  #include <asm/page.h>
15552  
15553  struct notifier_block;
15554 @@ -252,7 +253,8 @@ struct swap_info_struct {
15555  void *workingset_eviction(struct address_space *mapping, struct page *page);
15556  bool workingset_refault(void *shadow);
15557  void workingset_activation(struct page *page);
15558 -extern struct list_lru workingset_shadow_nodes;
15559 +extern struct list_lru __workingset_shadow_nodes;
15560 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
15561  
15562  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
15563  {
15564 @@ -296,6 +298,7 @@ extern unsigned long nr_free_pagecache_pages(void);
15565  
15566  
15567  /* linux/mm/swap.c */
15568 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15569  extern void lru_cache_add(struct page *);
15570  extern void lru_cache_add_anon(struct page *page);
15571  extern void lru_cache_add_file(struct page *page);
15572 diff --git a/include/linux/swork.h b/include/linux/swork.h
15573 new file mode 100644
15574 index 000000000000..f175fa9a6016
15575 --- /dev/null
15576 +++ b/include/linux/swork.h
15577 @@ -0,0 +1,24 @@
15578 +#ifndef _LINUX_SWORK_H
15579 +#define _LINUX_SWORK_H
15580 +
15581 +#include <linux/list.h>
15582 +
15583 +struct swork_event {
15584 +       struct list_head item;
15585 +       unsigned long flags;
15586 +       void (*func)(struct swork_event *);
15587 +};
15588 +
15589 +static inline void INIT_SWORK(struct swork_event *event,
15590 +                             void (*func)(struct swork_event *))
15591 +{
15592 +       event->flags = 0;
15593 +       event->func = func;
15594 +}
15595 +
15596 +bool swork_queue(struct swork_event *sev);
15597 +
15598 +int swork_get(void);
15599 +void swork_put(void);
15600 +
15601 +#endif /* _LINUX_SWORK_H */
15602 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
15603 index ff307b548ed3..be9f9dc6a4e1 100644
15604 --- a/include/linux/thread_info.h
15605 +++ b/include/linux/thread_info.h
15606 @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
15607  #define test_thread_flag(flag) \
15608         test_ti_thread_flag(current_thread_info(), flag)
15609  
15610 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15611 +#ifdef CONFIG_PREEMPT_LAZY
15612 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
15613 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
15614 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15615 +#define tif_need_resched_lazy()        test_thread_flag(TIF_NEED_RESCHED_LAZY)
15616 +
15617 +#else
15618 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
15619 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15620 +#define tif_need_resched_lazy()        0
15621 +#endif
15622  
15623  #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
15624  /*
15625 diff --git a/include/linux/timer.h b/include/linux/timer.h
15626 index 61aa61dc410c..299d2b78591f 100644
15627 --- a/include/linux/timer.h
15628 +++ b/include/linux/timer.h
15629 @@ -225,7 +225,7 @@ extern void add_timer(struct timer_list *timer);
15630  
15631  extern int try_to_del_timer_sync(struct timer_list *timer);
15632  
15633 -#ifdef CONFIG_SMP
15634 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15635    extern int del_timer_sync(struct timer_list *timer);
15636  #else
15637  # define del_timer_sync(t)             del_timer(t)
15638 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
15639 index 925730bc9fc1..a591f414da6f 100644
15640 --- a/include/linux/trace_events.h
15641 +++ b/include/linux/trace_events.h
15642 @@ -66,6 +66,9 @@ struct trace_entry {
15643         unsigned char           flags;
15644         unsigned char           preempt_count;
15645         int                     pid;
15646 +       unsigned short          migrate_disable;
15647 +       unsigned short          padding;
15648 +       unsigned char           preempt_lazy_count;
15649  };
15650  
15651  #define TRACE_EVENT_TYPE_MAX                                           \
15652 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
15653 index 558129af828a..cf5c472bbc79 100644
15654 --- a/include/linux/uaccess.h
15655 +++ b/include/linux/uaccess.h
15656 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
15657   */
15658  static inline void pagefault_disable(void)
15659  {
15660 +       migrate_disable();
15661         pagefault_disabled_inc();
15662         /*
15663          * make sure to have issued the store before a pagefault
15664 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
15665          */
15666         barrier();
15667         pagefault_disabled_dec();
15668 +       migrate_enable();
15669  }
15670  
15671  /*
15672 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
15673 index 4a29c75b146e..0a294e950df8 100644
15674 --- a/include/linux/uprobes.h
15675 +++ b/include/linux/uprobes.h
15676 @@ -27,6 +27,7 @@
15677  #include <linux/errno.h>
15678  #include <linux/rbtree.h>
15679  #include <linux/types.h>
15680 +#include <linux/wait.h>
15681  
15682  struct vm_area_struct;
15683  struct mm_struct;
15684 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
15685 index 3e5d9075960f..7eaa847cd5a5 100644
15686 --- a/include/linux/vmstat.h
15687 +++ b/include/linux/vmstat.h
15688 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
15689   */
15690  static inline void __count_vm_event(enum vm_event_item item)
15691  {
15692 +       preempt_disable_rt();
15693         raw_cpu_inc(vm_event_states.event[item]);
15694 +       preempt_enable_rt();
15695  }
15696  
15697  static inline void count_vm_event(enum vm_event_item item)
15698 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
15699  
15700  static inline void __count_vm_events(enum vm_event_item item, long delta)
15701  {
15702 +       preempt_disable_rt();
15703         raw_cpu_add(vm_event_states.event[item], delta);
15704 +       preempt_enable_rt();
15705  }
15706  
15707  static inline void count_vm_events(enum vm_event_item item, long delta)
15708 diff --git a/include/linux/wait.h b/include/linux/wait.h
15709 index 513b36f04dfd..981c8a840f96 100644
15710 --- a/include/linux/wait.h
15711 +++ b/include/linux/wait.h
15712 @@ -8,6 +8,7 @@
15713  #include <linux/spinlock.h>
15714  #include <asm/current.h>
15715  #include <uapi/linux/wait.h>
15716 +#include <linux/atomic.h>
15717  
15718  typedef struct __wait_queue wait_queue_t;
15719  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
15720 diff --git a/include/net/dst.h b/include/net/dst.h
15721 index c7329dcd90cc..35c3dba16728 100644
15722 --- a/include/net/dst.h
15723 +++ b/include/net/dst.h
15724 @@ -437,7 +437,7 @@ static inline void dst_confirm(struct dst_entry *dst)
15725  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
15726                                    struct sk_buff *skb)
15727  {
15728 -       const struct hh_cache *hh;
15729 +       struct hh_cache *hh;
15730  
15731         if (dst->pending_confirm) {
15732                 unsigned long now = jiffies;
15733 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
15734 index 8b683841e574..bf656008f6e7 100644
15735 --- a/include/net/neighbour.h
15736 +++ b/include/net/neighbour.h
15737 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
15738  }
15739  #endif
15740  
15741 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15742 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15743  {
15744         unsigned int seq;
15745         int hh_len;
15746 @@ -501,7 +501,7 @@ struct neighbour_cb {
15747  
15748  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
15749  
15750 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15751 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15752                                      const struct net_device *dev)
15753  {
15754         unsigned int seq;
15755 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
15756 index c68926b4899c..dd0751e76065 100644
15757 --- a/include/net/netns/ipv4.h
15758 +++ b/include/net/netns/ipv4.h
15759 @@ -70,6 +70,7 @@ struct netns_ipv4 {
15760  
15761         int sysctl_icmp_echo_ignore_all;
15762         int sysctl_icmp_echo_ignore_broadcasts;
15763 +       int sysctl_icmp_echo_sysrq;
15764         int sysctl_icmp_ignore_bogus_error_responses;
15765         int sysctl_icmp_ratelimit;
15766         int sysctl_icmp_ratemask;
15767 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
15768 new file mode 100644
15769 index 000000000000..f7710de1b1f3
15770 --- /dev/null
15771 +++ b/include/trace/events/hist.h
15772 @@ -0,0 +1,73 @@
15773 +#undef TRACE_SYSTEM
15774 +#define TRACE_SYSTEM hist
15775 +
15776 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
15777 +#define _TRACE_HIST_H
15778 +
15779 +#include "latency_hist.h"
15780 +#include <linux/tracepoint.h>
15781 +
15782 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
15783 +#define trace_preemptirqsoff_hist(a, b)
15784 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
15785 +#else
15786 +TRACE_EVENT(preemptirqsoff_hist,
15787 +
15788 +       TP_PROTO(int reason, int starthist),
15789 +
15790 +       TP_ARGS(reason, starthist),
15791 +
15792 +       TP_STRUCT__entry(
15793 +               __field(int,    reason)
15794 +               __field(int,    starthist)
15795 +       ),
15796 +
15797 +       TP_fast_assign(
15798 +               __entry->reason         = reason;
15799 +               __entry->starthist      = starthist;
15800 +       ),
15801 +
15802 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
15803 +                 __entry->starthist ? "start" : "stop")
15804 +);
15805 +#endif
15806 +
15807 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
15808 +#define trace_hrtimer_interrupt(a, b, c, d)
15809 +#else
15810 +TRACE_EVENT(hrtimer_interrupt,
15811 +
15812 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
15813 +               struct task_struct *task),
15814 +
15815 +       TP_ARGS(cpu, offset, curr, task),
15816 +
15817 +       TP_STRUCT__entry(
15818 +               __field(int,            cpu)
15819 +               __field(long long,      offset)
15820 +               __array(char,           ccomm,  TASK_COMM_LEN)
15821 +               __field(int,            cprio)
15822 +               __array(char,           tcomm,  TASK_COMM_LEN)
15823 +               __field(int,            tprio)
15824 +       ),
15825 +
15826 +       TP_fast_assign(
15827 +               __entry->cpu    = cpu;
15828 +               __entry->offset = offset;
15829 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
15830 +               __entry->cprio  = curr->prio;
15831 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
15832 +                       task != NULL ? TASK_COMM_LEN : 7);
15833 +               __entry->tprio  = task != NULL ? task->prio : -1;
15834 +       ),
15835 +
15836 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
15837 +               __entry->cpu, __entry->offset, __entry->ccomm,
15838 +               __entry->cprio, __entry->tcomm, __entry->tprio)
15839 +);
15840 +#endif
15841 +
15842 +#endif /* _TRACE_HIST_H */
15843 +
15844 +/* This part must be outside protection */
15845 +#include <trace/define_trace.h>
15846 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
15847 new file mode 100644
15848 index 000000000000..d3f2fbd560b1
15849 --- /dev/null
15850 +++ b/include/trace/events/latency_hist.h
15851 @@ -0,0 +1,29 @@
15852 +#ifndef _LATENCY_HIST_H
15853 +#define _LATENCY_HIST_H
15854 +
15855 +enum hist_action {
15856 +       IRQS_ON,
15857 +       PREEMPT_ON,
15858 +       TRACE_STOP,
15859 +       IRQS_OFF,
15860 +       PREEMPT_OFF,
15861 +       TRACE_START,
15862 +};
15863 +
15864 +static char *actions[] = {
15865 +       "IRQS_ON",
15866 +       "PREEMPT_ON",
15867 +       "TRACE_STOP",
15868 +       "IRQS_OFF",
15869 +       "PREEMPT_OFF",
15870 +       "TRACE_START",
15871 +};
15872 +
15873 +static inline char *getaction(int action)
15874 +{
15875 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
15876 +               return actions[action];
15877 +       return "unknown";
15878 +}
15879 +
15880 +#endif /* _LATENCY_HIST_H */
15881 diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
15882 index fff846b512e6..73614ce1d204 100644
15883 --- a/include/trace/events/writeback.h
15884 +++ b/include/trace/events/writeback.h
15885 @@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
15886  #ifdef CREATE_TRACE_POINTS
15887  #ifdef CONFIG_CGROUP_WRITEBACK
15888  
15889 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15890 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15891  {
15892 -       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
15893 +       return wb->memcg_css->cgroup->kn->ino;
15894  }
15895  
15896 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15897 -{
15898 -       struct cgroup *cgrp = wb->memcg_css->cgroup;
15899 -       char *path;
15900 -
15901 -       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
15902 -       WARN_ON_ONCE(path != buf);
15903 -}
15904 -
15905 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15906 -{
15907 -       if (wbc->wb)
15908 -               return __trace_wb_cgroup_size(wbc->wb);
15909 -       else
15910 -               return 2;
15911 -}
15912 -
15913 -static inline void __trace_wbc_assign_cgroup(char *buf,
15914 -                                            struct writeback_control *wbc)
15915 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15916  {
15917         if (wbc->wb)
15918 -               __trace_wb_assign_cgroup(buf, wbc->wb);
15919 +               return __trace_wb_assign_cgroup(wbc->wb);
15920         else
15921 -               strcpy(buf, "/");
15922 +               return -1U;
15923  }
15924 -
15925  #else  /* CONFIG_CGROUP_WRITEBACK */
15926  
15927 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15928 -{
15929 -       return 2;
15930 -}
15931 -
15932 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15933 -{
15934 -       strcpy(buf, "/");
15935 -}
15936 -
15937 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15938 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15939  {
15940 -       return 2;
15941 +       return -1U;
15942  }
15943  
15944 -static inline void __trace_wbc_assign_cgroup(char *buf,
15945 -                                            struct writeback_control *wbc)
15946 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15947  {
15948 -       strcpy(buf, "/");
15949 +       return -1U;
15950  }
15951  
15952  #endif /* CONFIG_CGROUP_WRITEBACK */
15953 @@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
15954                 __array(char, name, 32)
15955                 __field(unsigned long, ino)
15956                 __field(int, sync_mode)
15957 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
15958 +               __field(unsigned int, cgroup_ino)
15959         ),
15960  
15961         TP_fast_assign(
15962 @@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
15963                         dev_name(inode_to_bdi(inode)->dev), 32);
15964                 __entry->ino            = inode->i_ino;
15965                 __entry->sync_mode      = wbc->sync_mode;
15966 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
15967 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
15968         ),
15969  
15970 -       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
15971 +       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
15972                 __entry->name,
15973                 __entry->ino,
15974                 __entry->sync_mode,
15975 -               __get_str(cgroup)
15976 +               __entry->cgroup_ino
15977         )
15978  );
15979  
15980 @@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
15981                 __field(int, range_cyclic)
15982                 __field(int, for_background)
15983                 __field(int, reason)
15984 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
15985 +               __field(unsigned int, cgroup_ino)
15986         ),
15987         TP_fast_assign(
15988                 strncpy(__entry->name,
15989 @@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
15990                 __entry->range_cyclic = work->range_cyclic;
15991                 __entry->for_background = work->for_background;
15992                 __entry->reason = work->reason;
15993 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
15994 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
15995         ),
15996         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
15997 -                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
15998 +                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
15999                   __entry->name,
16000                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
16001                   __entry->nr_pages,
16002 @@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16003                   __entry->range_cyclic,
16004                   __entry->for_background,
16005                   __print_symbolic(__entry->reason, WB_WORK_REASON),
16006 -                 __get_str(cgroup)
16007 +                 __entry->cgroup_ino
16008         )
16009  );
16010  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
16011 @@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
16012         TP_ARGS(wb),
16013         TP_STRUCT__entry(
16014                 __array(char, name, 32)
16015 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16016 +               __field(unsigned int, cgroup_ino)
16017         ),
16018         TP_fast_assign(
16019                 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
16020 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16021 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16022         ),
16023 -       TP_printk("bdi %s: cgroup=%s",
16024 +       TP_printk("bdi %s: cgroup_ino=%u",
16025                   __entry->name,
16026 -                 __get_str(cgroup)
16027 +                 __entry->cgroup_ino
16028         )
16029  );
16030  #define DEFINE_WRITEBACK_EVENT(name) \
16031 @@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16032                 __field(int, range_cyclic)
16033                 __field(long, range_start)
16034                 __field(long, range_end)
16035 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16036 +               __field(unsigned int, cgroup_ino)
16037         ),
16038  
16039         TP_fast_assign(
16040 @@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
16041                 __entry->range_cyclic   = wbc->range_cyclic;
16042                 __entry->range_start    = (long)wbc->range_start;
16043                 __entry->range_end      = (long)wbc->range_end;
16044 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16045 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16046         ),
16047  
16048         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
16049                 "bgrd=%d reclm=%d cyclic=%d "
16050 -               "start=0x%lx end=0x%lx cgroup=%s",
16051 +               "start=0x%lx end=0x%lx cgroup_ino=%u",
16052                 __entry->name,
16053                 __entry->nr_to_write,
16054                 __entry->pages_skipped,
16055 @@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16056                 __entry->range_cyclic,
16057                 __entry->range_start,
16058                 __entry->range_end,
16059 -               __get_str(cgroup)
16060 +               __entry->cgroup_ino
16061         )
16062  )
16063  
16064 @@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
16065                 __field(long,           age)
16066                 __field(int,            moved)
16067                 __field(int,            reason)
16068 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16069 +               __field(unsigned int,   cgroup_ino)
16070         ),
16071         TP_fast_assign(
16072                 unsigned long *older_than_this = work->older_than_this;
16073 @@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
16074                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
16075                 __entry->moved  = moved;
16076                 __entry->reason = work->reason;
16077 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16078 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16079         ),
16080 -       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
16081 +       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
16082                 __entry->name,
16083                 __entry->older, /* older_than_this in jiffies */
16084                 __entry->age,   /* older_than_this in relative milliseconds */
16085                 __entry->moved,
16086                 __print_symbolic(__entry->reason, WB_WORK_REASON),
16087 -               __get_str(cgroup)
16088 +               __entry->cgroup_ino
16089         )
16090  );
16091  
16092 @@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16093                 __field(unsigned long,  dirty_ratelimit)
16094                 __field(unsigned long,  task_ratelimit)
16095                 __field(unsigned long,  balanced_dirty_ratelimit)
16096 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16097 +               __field(unsigned int,   cgroup_ino)
16098         ),
16099  
16100         TP_fast_assign(
16101 @@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16102                 __entry->task_ratelimit = KBps(task_ratelimit);
16103                 __entry->balanced_dirty_ratelimit =
16104                                         KBps(wb->balanced_dirty_ratelimit);
16105 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16106 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16107         ),
16108  
16109         TP_printk("bdi %s: "
16110                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
16111                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16112 -                 "balanced_dirty_ratelimit=%lu cgroup=%s",
16113 +                 "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
16114                   __entry->bdi,
16115                   __entry->write_bw,            /* write bandwidth */
16116                   __entry->avg_write_bw,        /* avg write bandwidth */
16117 @@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16118                   __entry->dirty_ratelimit,     /* base ratelimit */
16119                   __entry->task_ratelimit, /* ratelimit with position control */
16120                   __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
16121 -                 __get_str(cgroup)
16122 +                 __entry->cgroup_ino
16123         )
16124  );
16125  
16126 @@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
16127                 __field(         long,  pause)
16128                 __field(unsigned long,  period)
16129                 __field(         long,  think)
16130 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16131 +               __field(unsigned int,   cgroup_ino)
16132         ),
16133  
16134         TP_fast_assign(
16135 @@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
16136                 __entry->period         = period * 1000 / HZ;
16137                 __entry->pause          = pause * 1000 / HZ;
16138                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
16139 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16140 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16141         ),
16142  
16143  
16144 @@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
16145                   "bdi_setpoint=%lu bdi_dirty=%lu "
16146                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16147                   "dirtied=%u dirtied_pause=%u "
16148 -                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
16149 +                 "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
16150                   __entry->bdi,
16151                   __entry->limit,
16152                   __entry->setpoint,
16153 @@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
16154                   __entry->pause,       /* ms */
16155                   __entry->period,      /* ms */
16156                   __entry->think,       /* ms */
16157 -                 __get_str(cgroup)
16158 +                 __entry->cgroup_ino
16159           )
16160  );
16161  
16162 @@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16163                 __field(unsigned long, ino)
16164                 __field(unsigned long, state)
16165                 __field(unsigned long, dirtied_when)
16166 -               __dynamic_array(char, cgroup,
16167 -                               __trace_wb_cgroup_size(inode_to_wb(inode)))
16168 +               __field(unsigned int, cgroup_ino)
16169         ),
16170  
16171         TP_fast_assign(
16172 @@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16173                 __entry->ino            = inode->i_ino;
16174                 __entry->state          = inode->i_state;
16175                 __entry->dirtied_when   = inode->dirtied_when;
16176 -               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
16177 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(inode_to_wb(inode));
16178         ),
16179  
16180 -       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
16181 +       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
16182                   __entry->name,
16183                   __entry->ino,
16184                   show_inode_state(__entry->state),
16185                   __entry->dirtied_when,
16186                   (jiffies - __entry->dirtied_when) / HZ,
16187 -                 __get_str(cgroup)
16188 +                 __entry->cgroup_ino
16189         )
16190  );
16191  
16192 @@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16193                 __field(unsigned long, writeback_index)
16194                 __field(long, nr_to_write)
16195                 __field(unsigned long, wrote)
16196 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16197 +               __field(unsigned int, cgroup_ino)
16198         ),
16199  
16200         TP_fast_assign(
16201 @@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16202                 __entry->writeback_index = inode->i_mapping->writeback_index;
16203                 __entry->nr_to_write    = nr_to_write;
16204                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
16205 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16206 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16207         ),
16208  
16209         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
16210 -                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
16211 +                 "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
16212                   __entry->name,
16213                   __entry->ino,
16214                   show_inode_state(__entry->state),
16215 @@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16216                   __entry->writeback_index,
16217                   __entry->nr_to_write,
16218                   __entry->wrote,
16219 -                 __get_str(cgroup)
16220 +                 __entry->cgroup_ino
16221         )
16222  );
16223  
16224 diff --git a/init/Kconfig b/init/Kconfig
16225 index 235c7a2c0d20..a7c81c0911da 100644
16226 --- a/init/Kconfig
16227 +++ b/init/Kconfig
16228 @@ -498,7 +498,7 @@ config TINY_RCU
16229  
16230  config RCU_EXPERT
16231         bool "Make expert-level adjustments to RCU configuration"
16232 -       default n
16233 +       default y if PREEMPT_RT_FULL
16234         help
16235           This option needs to be enabled if you wish to make
16236           expert-level adjustments to RCU configuration.  By default,
16237 @@ -614,7 +614,7 @@ config RCU_FANOUT_LEAF
16238  
16239  config RCU_FAST_NO_HZ
16240         bool "Accelerate last non-dyntick-idle CPU's grace periods"
16241 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
16242 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
16243         default n
16244         help
16245           This option permits CPUs to enter dynticks-idle state even if
16246 @@ -641,7 +641,7 @@ config TREE_RCU_TRACE
16247  config RCU_BOOST
16248         bool "Enable RCU priority boosting"
16249         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
16250 -       default n
16251 +       default y if PREEMPT_RT_FULL
16252         help
16253           This option boosts the priority of preempted RCU readers that
16254           block the current preemptible RCU grace period for too long.
16255 @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH
16256  config RT_GROUP_SCHED
16257         bool "Group scheduling for SCHED_RR/FIFO"
16258         depends on CGROUP_SCHED
16259 +       depends on !PREEMPT_RT_FULL
16260         default n
16261         help
16262           This feature lets you explicitly allocate real CPU bandwidth
16263 @@ -1719,6 +1720,7 @@ choice
16264  
16265  config SLAB
16266         bool "SLAB"
16267 +       depends on !PREEMPT_RT_FULL
16268         help
16269           The regular slab allocator that is established and known to work
16270           well in all environments. It organizes cache hot objects in
16271 @@ -1737,6 +1739,7 @@ config SLUB
16272  config SLOB
16273         depends on EXPERT
16274         bool "SLOB (Simple Allocator)"
16275 +       depends on !PREEMPT_RT_FULL
16276         help
16277            SLOB replaces the stock allocator with a drastically simpler
16278            allocator. SLOB is generally more space efficient but
16279 @@ -1746,7 +1749,7 @@ endchoice
16280  
16281  config SLUB_CPU_PARTIAL
16282         default y
16283 -       depends on SLUB && SMP
16284 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16285         bool "SLUB per cpu partial cache"
16286         help
16287           Per cpu partial caches accelerate object allocation and freeing
16288 diff --git a/init/Makefile b/init/Makefile
16289 index 7bc47ee31c36..88cf473554e0 100644
16290 --- a/init/Makefile
16291 +++ b/init/Makefile
16292 @@ -33,4 +33,4 @@ silent_chk_compile.h = :
16293  include/generated/compile.h: FORCE
16294         @$($(quiet)chk_compile.h)
16295         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16296 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16297 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16298 diff --git a/init/main.c b/init/main.c
16299 index 9e64d7097f1a..4a76e629c137 100644
16300 --- a/init/main.c
16301 +++ b/init/main.c
16302 @@ -530,6 +530,7 @@ asmlinkage __visible void __init start_kernel(void)
16303         setup_command_line(command_line);
16304         setup_nr_cpu_ids();
16305         setup_per_cpu_areas();
16306 +       softirq_early_init();
16307         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16308  
16309         build_all_zonelists(NULL, NULL);
16310 diff --git a/ipc/msg.c b/ipc/msg.c
16311 index c6521c205cb4..996d89023552 100644
16312 --- a/ipc/msg.c
16313 +++ b/ipc/msg.c
16314 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
16315         }
16316  }
16317  
16318 -static void expunge_all(struct msg_queue *msq, int res)
16319 +static void expunge_all(struct msg_queue *msq, int res,
16320 +                       struct wake_q_head *wake_q)
16321  {
16322         struct msg_receiver *msr, *t;
16323  
16324         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
16325 -               msr->r_msg = NULL; /* initialize expunge ordering */
16326 -               wake_up_process(msr->r_tsk);
16327 -               /*
16328 -                * Ensure that the wakeup is visible before setting r_msg as
16329 -                * the receiving end depends on it: either spinning on a nil,
16330 -                * or dealing with -EAGAIN cases. See lockless receive part 1
16331 -                * and 2 in do_msgrcv().
16332 -                */
16333 -               smp_wmb(); /* barrier (B) */
16334 +
16335 +               wake_q_add(wake_q, msr->r_tsk);
16336                 msr->r_msg = ERR_PTR(res);
16337         }
16338  }
16339 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
16340  {
16341         struct msg_msg *msg, *t;
16342         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
16343 +       WAKE_Q(wake_q);
16344  
16345 -       expunge_all(msq, -EIDRM);
16346 +       expunge_all(msq, -EIDRM, &wake_q);
16347         ss_wakeup(&msq->q_senders, 1);
16348         msg_rmid(ns, msq);
16349         ipc_unlock_object(&msq->q_perm);
16350 +       wake_up_q(&wake_q);
16351         rcu_read_unlock();
16352  
16353         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
16354 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16355         struct kern_ipc_perm *ipcp;
16356         struct msqid64_ds uninitialized_var(msqid64);
16357         struct msg_queue *msq;
16358 +       WAKE_Q(wake_q);
16359         int err;
16360  
16361         if (cmd == IPC_SET) {
16362 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16363                 /* sleeping receivers might be excluded by
16364                  * stricter permissions.
16365                  */
16366 -               expunge_all(msq, -EAGAIN);
16367 +               expunge_all(msq, -EAGAIN, &wake_q);
16368                 /* sleeping senders might be able to send
16369                  * due to a larger queue size.
16370                  */
16371 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16372  
16373  out_unlock0:
16374         ipc_unlock_object(&msq->q_perm);
16375 +       wake_up_q(&wake_q);
16376  out_unlock1:
16377         rcu_read_unlock();
16378  out_up:
16379 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
16380         return 0;
16381  }
16382  
16383 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16384 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
16385 +                                struct wake_q_head *wake_q)
16386  {
16387         struct msg_receiver *msr, *t;
16388  
16389 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16390  
16391                         list_del(&msr->r_list);
16392                         if (msr->r_maxsize < msg->m_ts) {
16393 -                               /* initialize pipelined send ordering */
16394 -                               msr->r_msg = NULL;
16395 -                               wake_up_process(msr->r_tsk);
16396 -                               /* barrier (B) see barrier comment below */
16397 -                               smp_wmb();
16398 +                               wake_q_add(wake_q, msr->r_tsk);
16399                                 msr->r_msg = ERR_PTR(-E2BIG);
16400                         } else {
16401 -                               msr->r_msg = NULL;
16402                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
16403                                 msq->q_rtime = get_seconds();
16404 -                               wake_up_process(msr->r_tsk);
16405 -                               /*
16406 -                                * Ensure that the wakeup is visible before
16407 -                                * setting r_msg, as the receiving can otherwise
16408 -                                * exit - once r_msg is set, the receiver can
16409 -                                * continue. See lockless receive part 1 and 2
16410 -                                * in do_msgrcv(). Barrier (B).
16411 -                                */
16412 -                               smp_wmb();
16413 +                               wake_q_add(wake_q, msr->r_tsk);
16414                                 msr->r_msg = msg;
16415 -
16416                                 return 1;
16417                         }
16418                 }
16419 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16420         struct msg_msg *msg;
16421         int err;
16422         struct ipc_namespace *ns;
16423 +       WAKE_Q(wake_q);
16424  
16425         ns = current->nsproxy->ipc_ns;
16426  
16427 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16428         msq->q_lspid = task_tgid_vnr(current);
16429         msq->q_stime = get_seconds();
16430  
16431 -       if (!pipelined_send(msq, msg)) {
16432 +       if (!pipelined_send(msq, msg, &wake_q)) {
16433                 /* no one is waiting for this message, enqueue it */
16434                 list_add_tail(&msg->m_list, &msq->q_messages);
16435                 msq->q_cbytes += msgsz;
16436 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16437  
16438  out_unlock0:
16439         ipc_unlock_object(&msq->q_perm);
16440 +       wake_up_q(&wake_q);
16441  out_unlock1:
16442         rcu_read_unlock();
16443         if (msg != NULL)
16444 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
16445                 rcu_read_lock();
16446  
16447                 /* Lockless receive, part 2:
16448 -                * Wait until pipelined_send or expunge_all are outside of
16449 -                * wake_up_process(). There is a race with exit(), see
16450 -                * ipc/mqueue.c for the details. The correct serialization
16451 -                * ensures that a receiver cannot continue without the wakeup
16452 -                * being visibible _before_ setting r_msg:
16453 +                * The work in pipelined_send() and expunge_all():
16454 +                * - Set pointer to message
16455 +                * - Queue the receiver task for later wakeup
16456 +                * - Wake up the process after the lock is dropped.
16457                  *
16458 -                * CPU 0                             CPU 1
16459 -                * <loop receiver>
16460 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
16461 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
16462 -                *                            |        wake_up_process();
16463 -                * <continue>                 `------> smp_wmb(); (B)
16464 -                *                                     msr->r_msg = msg;
16465 -                *
16466 -                * Where (A) orders the message value read and where (B) orders
16467 -                * the write to the r_msg -- done in both pipelined_send and
16468 -                * expunge_all.
16469 +                * Should the process wake up before this wakeup (due to a
16470 +                * signal) it will either see the message and continue â€¦
16471                  */
16472 -               for (;;) {
16473 -                       /*
16474 -                        * Pairs with writer barrier in pipelined_send
16475 -                        * or expunge_all.
16476 -                        */
16477 -                       smp_rmb(); /* barrier (A) */
16478 -                       msg = (struct msg_msg *)msr_d.r_msg;
16479 -                       if (msg)
16480 -                               break;
16481  
16482 -                       /*
16483 -                        * The cpu_relax() call is a compiler barrier
16484 -                        * which forces everything in this loop to be
16485 -                        * re-loaded.
16486 -                        */
16487 -                       cpu_relax();
16488 -               }
16489 -
16490 -               /* Lockless receive, part 3:
16491 -                * If there is a message or an error then accept it without
16492 -                * locking.
16493 -                */
16494 +               msg = (struct msg_msg *)msr_d.r_msg;
16495                 if (msg != ERR_PTR(-EAGAIN))
16496                         goto out_unlock1;
16497  
16498 -               /* Lockless receive, part 3:
16499 -                * Acquire the queue spinlock.
16500 -                */
16501 +                /*
16502 +                 * … or see -EAGAIN, acquire the lock to check the message
16503 +                 * again.
16504 +                 */
16505                 ipc_lock_object(&msq->q_perm);
16506  
16507 -               /* Lockless receive, part 4:
16508 -                * Repeat test after acquiring the spinlock.
16509 -                */
16510                 msg = (struct msg_msg *)msr_d.r_msg;
16511                 if (msg != ERR_PTR(-EAGAIN))
16512                         goto out_unlock0;
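The hunk above replaces the open-coded wake_up_process()-plus-barrier sequences with the wake_q pattern: receivers are collected with wake_q_add() while the queue lock is held, and the actual wakeups are issued by wake_up_q() only after the lock has been dropped. A condensed sketch of that pattern (hypothetical names, assuming the existing wake_q API from <linux/sched.h>; not part of the patch):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct pending_waiter {
        struct list_head        list;
        struct task_struct      *task;
};

static void wake_all_pending(spinlock_t *lock, struct list_head *waiters)
{
        WAKE_Q(wake_q);                         /* on-stack wake queue */
        struct pending_waiter *w, *tmp;

        spin_lock(lock);
        list_for_each_entry_safe(w, tmp, waiters, list) {
                list_del(&w->list);
                wake_q_add(&wake_q, w->task);   /* record the task, no wakeup yet */
        }
        spin_unlock(lock);

        wake_up_q(&wake_q);                     /* wake everyone outside the lock */
}

Deferring the wakeups keeps the lock hold time short and, on PREEMPT_RT, avoids waking a task that would immediately block on the very lock the waker still holds.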
16513 diff --git a/ipc/sem.c b/ipc/sem.c
16514 index 20d07008ad5e..40b5cc070720 100644
16515 --- a/ipc/sem.c
16516 +++ b/ipc/sem.c
16517 @@ -690,6 +690,13 @@ undo:
16518  static void wake_up_sem_queue_prepare(struct list_head *pt,
16519                                 struct sem_queue *q, int error)
16520  {
16521 +#ifdef CONFIG_PREEMPT_RT_BASE
16522 +       struct task_struct *p = q->sleeper;
16523 +       get_task_struct(p);
16524 +       q->status = error;
16525 +       wake_up_process(p);
16526 +       put_task_struct(p);
16527 +#else
16528         if (list_empty(pt)) {
16529                 /*
16530                  * Hold preempt off so that we don't get preempted and have the
16531 @@ -701,6 +708,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16532         q->pid = error;
16533  
16534         list_add_tail(&q->list, pt);
16535 +#endif
16536  }
16537  
16538  /**
16539 @@ -714,6 +722,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16540   */
16541  static void wake_up_sem_queue_do(struct list_head *pt)
16542  {
16543 +#ifndef CONFIG_PREEMPT_RT_BASE
16544         struct sem_queue *q, *t;
16545         int did_something;
16546  
16547 @@ -726,6 +735,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
16548         }
16549         if (did_something)
16550                 preempt_enable();
16551 +#endif
16552  }
16553  
16554  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
16555 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16556 index ebdb0043203a..b9e6aa7e5aa6 100644
16557 --- a/kernel/Kconfig.locks
16558 +++ b/kernel/Kconfig.locks
16559 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16560  
16561  config MUTEX_SPIN_ON_OWNER
16562         def_bool y
16563 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
16564 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16565  
16566  config RWSEM_SPIN_ON_OWNER
16567         def_bool y
16568 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16569 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16570  
16571  config LOCK_SPIN_ON_OWNER
16572         def_bool y
16573 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16574 index 3f9c97419f02..11dbe26a8279 100644
16575 --- a/kernel/Kconfig.preempt
16576 +++ b/kernel/Kconfig.preempt
16577 @@ -1,3 +1,16 @@
16578 +config PREEMPT
16579 +       bool
16580 +       select PREEMPT_COUNT
16581 +
16582 +config PREEMPT_RT_BASE
16583 +       bool
16584 +       select PREEMPT
16585 +
16586 +config HAVE_PREEMPT_LAZY
16587 +       bool
16588 +
16589 +config PREEMPT_LAZY
16590 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16591  
16592  choice
16593         prompt "Preemption Model"
16594 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16595  
16596           Select this if you are building a kernel for a desktop system.
16597  
16598 -config PREEMPT
16599 +config PREEMPT__LL
16600         bool "Preemptible Kernel (Low-Latency Desktop)"
16601 -       select PREEMPT_COUNT
16602 +       select PREEMPT
16603         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16604         help
16605           This option reduces the latency of the kernel by making
16606 @@ -52,6 +65,22 @@ config PREEMPT
16607           embedded system with latency requirements in the milliseconds
16608           range.
16609  
16610 +config PREEMPT_RTB
16611 +       bool "Preemptible Kernel (Basic RT)"
16612 +       select PREEMPT_RT_BASE
16613 +       help
16614 +         This option is basically the same as (Low-Latency Desktop) but
16615 +         enables changes which are preliminary for the full preemptible
16616 +         RT kernel.
16617 +
16618 +config PREEMPT_RT_FULL
16619 +       bool "Fully Preemptible Kernel (RT)"
16620 +       depends on IRQ_FORCED_THREADING
16621 +       select PREEMPT_RT_BASE
16622 +       select PREEMPT_RCU
16623 +       help
16624 +         All and everything
16625 +
16626  endchoice
16627  
16628  config PREEMPT_COUNT
16629 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
16630 index a3424f28aaf4..69434d231e21 100644
16631 --- a/kernel/cgroup.c
16632 +++ b/kernel/cgroup.c
16633 @@ -4737,10 +4737,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16634         queue_work(cgroup_destroy_wq, &css->destroy_work);
16635  }
16636  
16637 -static void css_release_work_fn(struct work_struct *work)
16638 +static void css_release_work_fn(struct swork_event *sev)
16639  {
16640         struct cgroup_subsys_state *css =
16641 -               container_of(work, struct cgroup_subsys_state, destroy_work);
16642 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
16643         struct cgroup_subsys *ss = css->ss;
16644         struct cgroup *cgrp = css->cgroup;
16645  
16646 @@ -4779,8 +4779,8 @@ static void css_release(struct percpu_ref *ref)
16647         struct cgroup_subsys_state *css =
16648                 container_of(ref, struct cgroup_subsys_state, refcnt);
16649  
16650 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
16651 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
16652 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16653 +       swork_queue(&css->destroy_swork);
16654  }
16655  
16656  static void init_and_link_css(struct cgroup_subsys_state *css,
16657 @@ -5397,6 +5397,7 @@ static int __init cgroup_wq_init(void)
16658          */
16659         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16660         BUG_ON(!cgroup_destroy_wq);
16661 +       BUG_ON(swork_get());
16662  
16663         /*
16664          * Used to destroy pidlists and separate to serve as flush domain.
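The cgroup.c hunk reroutes the css release from the cgroup_destroy workqueue to the swork ("simple work") infrastructure this patch introduces elsewhere, so on RT the final release runs from a dedicated kernel thread. A plain pthread analogue of that deferral is sketched below; defer_to_worker() and worker_thread() are illustrative names, not the kernel swork API.

    #include <pthread.h>
    #include <stddef.h>

    struct deferred_work {
        void (*func)(struct deferred_work *);
        struct deferred_work *next;
    };

    static struct deferred_work *pending;
    static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t pending_wake = PTHREAD_COND_INITIALIZER;

    static void defer_to_worker(struct deferred_work *work)
    {
        pthread_mutex_lock(&pending_lock);
        work->next = pending;
        pending = work;
        pthread_cond_signal(&pending_wake);
        pthread_mutex_unlock(&pending_lock);
    }

    static void *worker_thread(void *unused)
    {
        for (;;) {
            struct deferred_work *work;

            pthread_mutex_lock(&pending_lock);
            while (pending == NULL)
                pthread_cond_wait(&pending_wake, &pending_lock);
            work = pending;
            pending = work->next;
            pthread_mutex_unlock(&pending_lock);

            work->func(work);       /* runs in sleepable thread context */
        }
        return NULL;
    }

A caller starts the worker once with pthread_create() and embeds a struct deferred_work in the object to be released, much as the patch embeds destroy_swork in cgroup_subsys_state.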
16665 diff --git a/kernel/cpu.c b/kernel/cpu.c
16666 index 85ff5e26e23b..8edd3c716092 100644
16667 --- a/kernel/cpu.c
16668 +++ b/kernel/cpu.c
16669 @@ -75,8 +75,8 @@ static struct {
16670  #endif
16671  } cpu_hotplug = {
16672         .active_writer = NULL,
16673 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16674         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
16675 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16676  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16677         .dep_map = {.name = "cpu_hotplug.lock" },
16678  #endif
16679 @@ -89,6 +89,289 @@ static struct {
16680  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
16681  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
16682  
16683 +/**
16684 + * hotplug_pcp - per cpu hotplug descriptor
16685 + * @unplug:    set when pin_current_cpu() needs to sync tasks
16686 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
16687 + * @refcount:  counter of tasks in pinned sections
16688 + * @grab_lock: set when the tasks entering pinned sections should wait
16689 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
16690 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
16691 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
16692 + *
16693 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
16694 + * is used as a flag and still exists after @sync_tsk has exited and
16695 + * @sync_tsk set to NULL.
16696 + */
16697 +struct hotplug_pcp {
16698 +       struct task_struct *unplug;
16699 +       struct task_struct *sync_tsk;
16700 +       int refcount;
16701 +       int grab_lock;
16702 +       struct completion synced;
16703 +       struct completion unplug_wait;
16704 +#ifdef CONFIG_PREEMPT_RT_FULL
16705 +       /*
16706 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
16707 +        * the task, otherwise the mutex will cause the task to fail
16708 +        * to sleep when required. (Because it's called from migrate_disable())
16709 +        *
16710 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
16711 +        * state.
16712 +        */
16713 +       spinlock_t lock;
16714 +#else
16715 +       struct mutex mutex;
16716 +#endif
16717 +       int mutex_init;
16718 +};
16719 +
16720 +#ifdef CONFIG_PREEMPT_RT_FULL
16721 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
16722 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
16723 +#else
16724 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
16725 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
16726 +#endif
16727 +
16728 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
16729 +
16730 +/**
16731 + * pin_current_cpu - Prevent the current cpu from being unplugged
16732 + *
16733 + * Lightweight version of get_online_cpus() to prevent cpu from being
16734 + * unplugged when code runs in a migration disabled region.
16735 + *
16736 + * Must be called with preemption disabled (preempt_count = 1)!
16737 + */
16738 +void pin_current_cpu(void)
16739 +{
16740 +       struct hotplug_pcp *hp;
16741 +       int force = 0;
16742 +
16743 +retry:
16744 +       hp = this_cpu_ptr(&hotplug_pcp);
16745 +
16746 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
16747 +           hp->unplug == current) {
16748 +               hp->refcount++;
16749 +               return;
16750 +       }
16751 +       if (hp->grab_lock) {
16752 +               preempt_enable();
16753 +               hotplug_lock(hp);
16754 +               hotplug_unlock(hp);
16755 +       } else {
16756 +               preempt_enable();
16757 +               /*
16758 +                * Try to push this task off of this CPU.
16759 +                */
16760 +               if (!migrate_me()) {
16761 +                       preempt_disable();
16762 +                       hp = this_cpu_ptr(&hotplug_pcp);
16763 +                       if (!hp->grab_lock) {
16764 +                               /*
16765 +                                * Just let it continue, it's already pinned
16766 +                                * or about to sleep.
16767 +                                */
16768 +                               force = 1;
16769 +                               goto retry;
16770 +                       }
16771 +                       preempt_enable();
16772 +               }
16773 +       }
16774 +       preempt_disable();
16775 +       goto retry;
16776 +}
16777 +
16778 +/**
16779 + * unpin_current_cpu - Allow unplug of current cpu
16780 + *
16781 + * Must be called with preemption or interrupts disabled!
16782 + */
16783 +void unpin_current_cpu(void)
16784 +{
16785 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
16786 +
16787 +       WARN_ON(hp->refcount <= 0);
16788 +
16789 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
16790 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
16791 +               wake_up_process(hp->unplug);
16792 +}
16793 +
16794 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
16795 +{
16796 +       set_current_state(TASK_UNINTERRUPTIBLE);
16797 +       while (hp->refcount) {
16798 +               schedule_preempt_disabled();
16799 +               set_current_state(TASK_UNINTERRUPTIBLE);
16800 +       }
16801 +}
16802 +
16803 +static int sync_unplug_thread(void *data)
16804 +{
16805 +       struct hotplug_pcp *hp = data;
16806 +
16807 +       wait_for_completion(&hp->unplug_wait);
16808 +       preempt_disable();
16809 +       hp->unplug = current;
16810 +       wait_for_pinned_cpus(hp);
16811 +
16812 +       /*
16813 +        * This thread will synchronize the cpu_down() with threads
16814 +        * that have pinned the CPU. When the pinned CPU count reaches
16815 +        * zero, we inform the cpu_down code to continue to the next step.
16816 +        */
16817 +       set_current_state(TASK_UNINTERRUPTIBLE);
16818 +       preempt_enable();
16819 +       complete(&hp->synced);
16820 +
16821 +       /*
16822 +        * If all succeeds, the next step will need tasks to wait till
16823 +        * the CPU is offline before continuing. To do this, the grab_lock
16824 +        * is set and tasks going into pin_current_cpu() will block on the
16825 +        * mutex. But we still need to wait for those that are already in
16826 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
16827 +        * will kick this thread out.
16828 +        */
16829 +       while (!hp->grab_lock && !kthread_should_stop()) {
16830 +               schedule();
16831 +               set_current_state(TASK_UNINTERRUPTIBLE);
16832 +       }
16833 +
16834 +       /* Make sure grab_lock is seen before we see a stale completion */
16835 +       smp_mb();
16836 +
16837 +       /*
16838 +        * Now just before cpu_down() enters stop machine, we need to make
16839 +        * sure all tasks that are in pinned CPU sections are out, and new
16840 +        * tasks will now grab the lock, keeping them from entering pinned
16841 +        * CPU sections.
16842 +        */
16843 +       if (!kthread_should_stop()) {
16844 +               preempt_disable();
16845 +               wait_for_pinned_cpus(hp);
16846 +               preempt_enable();
16847 +               complete(&hp->synced);
16848 +       }
16849 +
16850 +       set_current_state(TASK_UNINTERRUPTIBLE);
16851 +       while (!kthread_should_stop()) {
16852 +               schedule();
16853 +               set_current_state(TASK_UNINTERRUPTIBLE);
16854 +       }
16855 +       set_current_state(TASK_RUNNING);
16856 +
16857 +       /*
16858 +        * Force this thread off this CPU as it's going down and
16859 +        * we don't want any more work on this CPU.
16860 +        */
16861 +       current->flags &= ~PF_NO_SETAFFINITY;
16862 +       set_cpus_allowed_ptr(current, cpu_present_mask);
16863 +       migrate_me();
16864 +       return 0;
16865 +}
16866 +
16867 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
16868 +{
16869 +       wake_up_process(hp->sync_tsk);
16870 +       wait_for_completion(&hp->synced);
16871 +}
16872 +
16873 +static void __cpu_unplug_wait(unsigned int cpu)
16874 +{
16875 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16876 +
16877 +       complete(&hp->unplug_wait);
16878 +       wait_for_completion(&hp->synced);
16879 +}
16880 +
16881 +/*
16882 + * Start the sync_unplug_thread on the target cpu and wait for it to
16883 + * complete.
16884 + */
16885 +static int cpu_unplug_begin(unsigned int cpu)
16886 +{
16887 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16888 +       int err;
16889 +
16890 +       /* Protected by cpu_hotplug.lock */
16891 +       if (!hp->mutex_init) {
16892 +#ifdef CONFIG_PREEMPT_RT_FULL
16893 +               spin_lock_init(&hp->lock);
16894 +#else
16895 +               mutex_init(&hp->mutex);
16896 +#endif
16897 +               hp->mutex_init = 1;
16898 +       }
16899 +
16900 +       /* Inform the scheduler to migrate tasks off this CPU */
16901 +       tell_sched_cpu_down_begin(cpu);
16902 +
16903 +       init_completion(&hp->synced);
16904 +       init_completion(&hp->unplug_wait);
16905 +
16906 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
16907 +       if (IS_ERR(hp->sync_tsk)) {
16908 +               err = PTR_ERR(hp->sync_tsk);
16909 +               hp->sync_tsk = NULL;
16910 +               return err;
16911 +       }
16912 +       kthread_bind(hp->sync_tsk, cpu);
16913 +
16914 +       /*
16915 +        * Wait for tasks to get out of the pinned sections,
16916 +        * it's still OK if new tasks enter. Some CPU notifiers will
16917 +        * wait for tasks that are going to enter these sections and
16918 +        * we must not have them block.
16919 +        */
16920 +       wake_up_process(hp->sync_tsk);
16921 +       return 0;
16922 +}
16923 +
16924 +static void cpu_unplug_sync(unsigned int cpu)
16925 +{
16926 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16927 +
16928 +       init_completion(&hp->synced);
16929 +       /* The completion needs to be initialized before setting grab_lock */
16930 +       smp_wmb();
16931 +
16932 +       /* Grab the mutex before setting grab_lock */
16933 +       hotplug_lock(hp);
16934 +       hp->grab_lock = 1;
16935 +
16936 +       /*
16937 +        * The CPU notifiers have been completed.
16938 +        * Wait for tasks to get out of pinned CPU sections and have new
16939 +        * tasks block until the CPU is completely down.
16940 +        */
16941 +       __cpu_unplug_sync(hp);
16942 +
16943 +       /* All done with the sync thread */
16944 +       kthread_stop(hp->sync_tsk);
16945 +       hp->sync_tsk = NULL;
16946 +}
16947 +
16948 +static void cpu_unplug_done(unsigned int cpu)
16949 +{
16950 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16951 +
16952 +       hp->unplug = NULL;
16953 +       /* Let all tasks know cpu unplug is finished before cleaning up */
16954 +       smp_wmb();
16955 +
16956 +       if (hp->sync_tsk)
16957 +               kthread_stop(hp->sync_tsk);
16958 +
16959 +       if (hp->grab_lock) {
16960 +               hotplug_unlock(hp);
16961 +               /* protected by cpu_hotplug.lock */
16962 +               hp->grab_lock = 0;
16963 +       }
16964 +       tell_sched_cpu_down_done(cpu);
16965 +}
16966  
16967  void get_online_cpus(void)
16968  {
16969 @@ -338,13 +621,15 @@ static int take_cpu_down(void *_param)
16970  /* Requires cpu_add_remove_lock to be held */
16971  static int _cpu_down(unsigned int cpu, int tasks_frozen)
16972  {
16973 -       int err, nr_calls = 0;
16974 +       int mycpu, err, nr_calls = 0;
16975         void *hcpu = (void *)(long)cpu;
16976         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
16977         struct take_cpu_down_param tcd_param = {
16978                 .mod = mod,
16979                 .hcpu = hcpu,
16980         };
16981 +       cpumask_var_t cpumask;
16982 +       cpumask_var_t cpumask_org;
16983  
16984         if (num_online_cpus() == 1)
16985                 return -EBUSY;
16986 @@ -352,7 +637,34 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
16987         if (!cpu_online(cpu))
16988                 return -EINVAL;
16989  
16990 +       /* Move the downtaker off the unplug cpu */
16991 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
16992 +               return -ENOMEM;
16993 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
16994 +               free_cpumask_var(cpumask);
16995 +               return -ENOMEM;
16996 +       }
16997 +
16998 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
16999 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
17000 +       set_cpus_allowed_ptr(current, cpumask);
17001 +       free_cpumask_var(cpumask);
17002 +       migrate_disable();
17003 +       mycpu = smp_processor_id();
17004 +       if (mycpu == cpu) {
17005 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
17006 +               migrate_enable();
17007 +               err = -EBUSY;
17008 +               goto restore_cpus;
17009 +       }
17010 +       migrate_enable();
17011 +
17012         cpu_hotplug_begin();
17013 +       err = cpu_unplug_begin(cpu);
17014 +       if (err) {
17015 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
17016 +               goto out_cancel;
17017 +       }
17018  
17019         err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
17020         if (err) {
17021 @@ -378,8 +690,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17022         else
17023                 synchronize_rcu();
17024  
17025 +       __cpu_unplug_wait(cpu);
17026         smpboot_park_threads(cpu);
17027  
17028 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
17029 +       cpu_unplug_sync(cpu);
17030 +
17031         /*
17032          * Prevent irq alloc/free while the dying cpu reorganizes the
17033          * interrupt affinities.
17034 @@ -424,9 +740,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17035         check_for_tasks(cpu);
17036  
17037  out_release:
17038 +       cpu_unplug_done(cpu);
17039 +out_cancel:
17040         cpu_hotplug_done();
17041         if (!err)
17042                 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
17043 +restore_cpus:
17044 +       set_cpus_allowed_ptr(current, cpumask_org);
17045 +       free_cpumask_var(cpumask_org);
17046         return err;
17047  }
17048  
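The kernel/cpu.c additions give migrate-disabled tasks a way to pin their CPU against unplug (pin_current_cpu()/unpin_current_cpu()), while cpu_down() synchronizes through sync_unplug_thread: pinned tasks hold a per-CPU refcount, and the teardown path sets grab_lock and drains the count before stop_machine runs. The fragment below is a much-simplified, single-CPU user-space analogue of that refcount-and-drain handshake; all names are illustrative, and the kernel version has additional cases (forced pinning, migrate_me(), preempt_count checks) omitted here.

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    static atomic_int refcount;                 /* like hotplug_pcp.refcount */
    static atomic_int unplugging;               /* like hotplug_pcp.grab_lock */
    static pthread_mutex_t grab_lock = PTHREAD_MUTEX_INITIALIZER;

    static void pin_section(void)               /* ~ pin_current_cpu() */
    {
        for (;;) {
            if (!atomic_load(&unplugging)) {
                atomic_fetch_add(&refcount, 1);
                if (!atomic_load(&unplugging))  /* re-check: unplug may have started */
                    return;
                atomic_fetch_sub(&refcount, 1);
            }
            /* like hotplug_lock()/hotplug_unlock(): wait until teardown finishes */
            pthread_mutex_lock(&grab_lock);
            pthread_mutex_unlock(&grab_lock);
        }
    }

    static void unpin_section(void)             /* ~ unpin_current_cpu() */
    {
        atomic_fetch_sub(&refcount, 1);
    }

    static void begin_unplug(void)              /* ~ cpu_unplug_sync() */
    {
        pthread_mutex_lock(&grab_lock);         /* new pinners now block above */
        atomic_store(&unplugging, 1);
        while (atomic_load(&refcount))          /* ~ wait_for_pinned_cpus() */
            sched_yield();
    }

    static void end_unplug(void)                /* ~ cpu_unplug_done() */
    {
        atomic_store(&unplugging, 0);
        pthread_mutex_unlock(&grab_lock);
    }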
17049 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17050 index fc1ef736253c..83c666537a7a 100644
17051 --- a/kernel/debug/kdb/kdb_io.c
17052 +++ b/kernel/debug/kdb/kdb_io.c
17053 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17054         int linecount;
17055         int colcount;
17056         int logging, saved_loglevel = 0;
17057 -       int saved_trap_printk;
17058         int got_printf_lock = 0;
17059         int retlen = 0;
17060         int fnd, len;
17061 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17062         unsigned long uninitialized_var(flags);
17063  
17064         preempt_disable();
17065 -       saved_trap_printk = kdb_trap_printk;
17066 -       kdb_trap_printk = 0;
17067  
17068         /* Serialize kdb_printf if multiple cpus try to write at once.
17069          * But if any cpu goes recursive in kdb, just print the output,
17070 @@ -855,7 +852,6 @@ kdb_print_out:
17071         } else {
17072                 __release(kdb_printf_lock);
17073         }
17074 -       kdb_trap_printk = saved_trap_printk;
17075         preempt_enable();
17076         return retlen;
17077  }
17078 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
17079         va_list ap;
17080         int r;
17081  
17082 +       kdb_trap_printk++;
17083         va_start(ap, fmt);
17084         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17085         va_end(ap);
17086 +       kdb_trap_printk--;
17087  
17088         return r;
17089  }
17090 diff --git a/kernel/events/core.c b/kernel/events/core.c
17091 index 12ecd4f0329f..560ea99875d3 100644
17092 --- a/kernel/events/core.c
17093 +++ b/kernel/events/core.c
17094 @@ -802,6 +802,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17095         raw_spin_lock_init(&cpuctx->hrtimer_lock);
17096         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17097         timer->function = perf_mux_hrtimer_handler;
17098 +       timer->irqsafe = 1;
17099  }
17100  
17101  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
17102 @@ -7219,6 +7220,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17103  
17104         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17105         hwc->hrtimer.function = perf_swevent_hrtimer;
17106 +       hwc->hrtimer.irqsafe = 1;
17107  
17108         /*
17109          * Since hrtimers have a fixed rate, we can do a static freq->period
17110 diff --git a/kernel/exit.c b/kernel/exit.c
17111 index ffba5df4abd5..e199407f8831 100644
17112 --- a/kernel/exit.c
17113 +++ b/kernel/exit.c
17114 @@ -144,7 +144,7 @@ static void __exit_signal(struct task_struct *tsk)
17115          * Do this under ->siglock, we can race with another thread
17116          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17117          */
17118 -       flush_sigqueue(&tsk->pending);
17119 +       flush_task_sigqueue(tsk);
17120         tsk->sighand = NULL;
17121         spin_unlock(&sighand->siglock);
17122  
17123 diff --git a/kernel/fork.c b/kernel/fork.c
17124 index 8860d1f50d24..1e2599a403a5 100644
17125 --- a/kernel/fork.c
17126 +++ b/kernel/fork.c
17127 @@ -108,7 +108,7 @@ int max_threads;            /* tunable limit on nr_threads */
17128  
17129  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
17130  
17131 -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
17132 +DEFINE_RWLOCK(tasklist_lock);  /* outer */
17133  
17134  #ifdef CONFIG_PROVE_RCU
17135  int lockdep_tasklist_lock_is_held(void)
17136 @@ -244,7 +244,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
17137         if (atomic_dec_and_test(&sig->sigcnt))
17138                 free_signal_struct(sig);
17139  }
17140 -
17141 +#ifdef CONFIG_PREEMPT_RT_BASE
17142 +static
17143 +#endif
17144  void __put_task_struct(struct task_struct *tsk)
17145  {
17146         WARN_ON(!tsk->exit_state);
17147 @@ -261,7 +263,18 @@ void __put_task_struct(struct task_struct *tsk)
17148         if (!profile_handoff_task(tsk))
17149                 free_task(tsk);
17150  }
17151 +#ifndef CONFIG_PREEMPT_RT_BASE
17152  EXPORT_SYMBOL_GPL(__put_task_struct);
17153 +#else
17154 +void __put_task_struct_cb(struct rcu_head *rhp)
17155 +{
17156 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17157 +
17158 +       __put_task_struct(tsk);
17159 +
17160 +}
17161 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17162 +#endif
17163  
17164  void __init __weak arch_task_cache_init(void) { }
17165  
17166 @@ -689,6 +702,19 @@ void __mmdrop(struct mm_struct *mm)
17167  }
17168  EXPORT_SYMBOL_GPL(__mmdrop);
17169  
17170 +#ifdef CONFIG_PREEMPT_RT_BASE
17171 +/*
17172 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17173 + * want another facility to make this work.
17174 + */
17175 +void __mmdrop_delayed(struct rcu_head *rhp)
17176 +{
17177 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17178 +
17179 +       __mmdrop(mm);
17180 +}
17181 +#endif
17182 +
17183  /*
17184   * Decrement the use count and release all resources for an mm.
17185   */
17186 @@ -1241,6 +1267,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17187   */
17188  static void posix_cpu_timers_init(struct task_struct *tsk)
17189  {
17190 +#ifdef CONFIG_PREEMPT_RT_BASE
17191 +       tsk->posix_timer_list = NULL;
17192 +#endif
17193         tsk->cputime_expires.prof_exp = 0;
17194         tsk->cputime_expires.virt_exp = 0;
17195         tsk->cputime_expires.sched_exp = 0;
17196 @@ -1366,15 +1395,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
17197         spin_lock_init(&p->alloc_lock);
17198  
17199         init_sigpending(&p->pending);
17200 +       p->sigqueue_cache = NULL;
17201  
17202         p->utime = p->stime = p->gtime = 0;
17203         p->utimescaled = p->stimescaled = 0;
17204         prev_cputime_init(&p->prev_cputime);
17205  
17206  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
17207 -       seqlock_init(&p->vtime_seqlock);
17208 +       seqcount_init(&p->vtime_seqcount);
17209         p->vtime_snap = 0;
17210 -       p->vtime_snap_whence = VTIME_SLEEPING;
17211 +       p->vtime_snap_whence = VTIME_INACTIVE;
17212  #endif
17213  
17214  #if defined(SPLIT_RSS_COUNTING)
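The fork.c hunk makes __put_task_struct() static on PREEMPT_RT_BASE and instead exports __put_task_struct_cb(), an RCU-style callback that recovers the task_struct from the embedded put_rcu head; __mmdrop_delayed() does the same for mm_struct. The code that actually queues these callbacks lives elsewhere in the patch. Below is a small, runnable user-space sketch of the container_of()-plus-deferred-callback pattern those helpers rely on; the cb_head type and defer() helper are illustrative, not the kernel RCU API.

    #include <stddef.h>
    #include <stdlib.h>

    struct cb_head {
        struct cb_head *next;
        void (*func)(struct cb_head *);
    };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct task {
        int pid;
        struct cb_head put_rcu;         /* embedded, like task_struct::put_rcu */
    };

    static void put_task_cb(struct cb_head *head)
    {
        struct task *t = container_of(head, struct task, put_rcu);

        free(t);                        /* the real __put_task_struct() work */
    }

    /* queue the head now; some later, safe context walks the list and runs func */
    static void defer(struct cb_head **list, struct cb_head *head,
                      void (*func)(struct cb_head *))
    {
        head->func = func;
        head->next = *list;
        *list = head;
    }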
17215 diff --git a/kernel/futex.c b/kernel/futex.c
17216 index 9d8163afd87c..ad38af0bcff3 100644
17217 --- a/kernel/futex.c
17218 +++ b/kernel/futex.c
17219 @@ -815,7 +815,9 @@ void exit_pi_state_list(struct task_struct *curr)
17220                  * task still owns the PI-state:
17221                  */
17222                 if (head->next != next) {
17223 +                       raw_spin_unlock_irq(&curr->pi_lock);
17224                         spin_unlock(&hb->lock);
17225 +                       raw_spin_lock_irq(&curr->pi_lock);
17226                         continue;
17227                 }
17228  
17229 @@ -1210,6 +1212,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17230         struct futex_pi_state *pi_state = this->pi_state;
17231         u32 uninitialized_var(curval), newval;
17232         WAKE_Q(wake_q);
17233 +       WAKE_Q(wake_sleeper_q);
17234         bool deboost;
17235         int ret = 0;
17236  
17237 @@ -1223,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17238         if (pi_state->owner != current)
17239                 return -EINVAL;
17240  
17241 -       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
17242 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17243         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17244  
17245         /*
17246 @@ -1259,24 +1262,25 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17247                         ret = -EINVAL;
17248         }
17249         if (ret) {
17250 -               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17251 +               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17252                 return ret;
17253         }
17254  
17255 -       raw_spin_lock_irq(&pi_state->owner->pi_lock);
17256 +       raw_spin_lock(&pi_state->owner->pi_lock);
17257         WARN_ON(list_empty(&pi_state->list));
17258         list_del_init(&pi_state->list);
17259 -       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
17260 +       raw_spin_unlock(&pi_state->owner->pi_lock);
17261  
17262 -       raw_spin_lock_irq(&new_owner->pi_lock);
17263 +       raw_spin_lock(&new_owner->pi_lock);
17264         WARN_ON(!list_empty(&pi_state->list));
17265         list_add(&pi_state->list, &new_owner->pi_state_list);
17266         pi_state->owner = new_owner;
17267 -       raw_spin_unlock_irq(&new_owner->pi_lock);
17268 +       raw_spin_unlock(&new_owner->pi_lock);
17269  
17270 -       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17271 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17272  
17273 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17274 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17275 +                                       &wake_sleeper_q);
17276  
17277         /*
17278          * First unlock HB so the waiter does not spin on it once he got woken
17279 @@ -1286,6 +1290,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17280          */
17281         spin_unlock(&hb->lock);
17282         wake_up_q(&wake_q);
17283 +       wake_up_q_sleeper(&wake_sleeper_q);
17284         if (deboost)
17285                 rt_mutex_adjust_prio(current);
17286  
17287 @@ -1822,6 +1827,16 @@ retry_private:
17288                                 requeue_pi_wake_futex(this, &key2, hb2);
17289                                 drop_count++;
17290                                 continue;
17291 +                       } else if (ret == -EAGAIN) {
17292 +                               /*
17293 +                                * Waiter was woken by timeout or
17294 +                                * signal and has set pi_blocked_on to
17295 +                                * PI_WAKEUP_INPROGRESS before we
17296 +                                * tried to enqueue it on the rtmutex.
17297 +                                */
17298 +                               this->pi_state = NULL;
17299 +                               free_pi_state(pi_state);
17300 +                               continue;
17301                         } else if (ret) {
17302                                 /* -EDEADLK */
17303                                 this->pi_state = NULL;
17304 @@ -2139,11 +2154,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
17305                  * we returned due to timeout or signal without taking the
17306                  * rt_mutex. Too late.
17307                  */
17308 -               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
17309 +               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
17310                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
17311                 if (!owner)
17312                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
17313 -               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
17314 +               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
17315                 ret = fixup_pi_state_owner(uaddr, q, owner);
17316                 goto out;
17317         }
17318 @@ -2691,7 +2706,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17319         struct hrtimer_sleeper timeout, *to = NULL;
17320         struct rt_mutex_waiter rt_waiter;
17321         struct rt_mutex *pi_mutex = NULL;
17322 -       struct futex_hash_bucket *hb;
17323 +       struct futex_hash_bucket *hb, *hb2;
17324         union futex_key key2 = FUTEX_KEY_INIT;
17325         struct futex_q q = futex_q_init;
17326         int res, ret;
17327 @@ -2716,10 +2731,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17328          * The waiter is allocated on our stack, manipulated by the requeue
17329          * code while we sleep on uaddr.
17330          */
17331 -       debug_rt_mutex_init_waiter(&rt_waiter);
17332 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
17333 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
17334 -       rt_waiter.task = NULL;
17335 +       rt_mutex_init_waiter(&rt_waiter, false);
17336  
17337         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17338         if (unlikely(ret != 0))
17339 @@ -2750,20 +2762,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17340         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17341         futex_wait_queue_me(hb, &q, to);
17342  
17343 -       spin_lock(&hb->lock);
17344 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17345 -       spin_unlock(&hb->lock);
17346 -       if (ret)
17347 -               goto out_put_keys;
17348 +       /*
17349 +        * On RT we must avoid races with requeue and trying to block
17350 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
17351 +        * serializing access to pi_blocked_on with pi_lock.
17352 +        */
17353 +       raw_spin_lock_irq(&current->pi_lock);
17354 +       if (current->pi_blocked_on) {
17355 +               /*
17356 +                * We have been requeued or are in the process of
17357 +                * being requeued.
17358 +                */
17359 +               raw_spin_unlock_irq(&current->pi_lock);
17360 +       } else {
17361 +               /*
17362 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17363 +                * prevents a concurrent requeue from moving us to the
17364 +                * uaddr2 rtmutex. After that we can safely acquire
17365 +                * (and possibly block on) hb->lock.
17366 +                */
17367 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17368 +               raw_spin_unlock_irq(&current->pi_lock);
17369 +
17370 +               spin_lock(&hb->lock);
17371 +
17372 +               /*
17373 +                * Clean up pi_blocked_on. We might leak it otherwise
17374 +                * when we succeeded with the hb->lock in the fast
17375 +                * path.
17376 +                */
17377 +               raw_spin_lock_irq(&current->pi_lock);
17378 +               current->pi_blocked_on = NULL;
17379 +               raw_spin_unlock_irq(&current->pi_lock);
17380 +
17381 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17382 +               spin_unlock(&hb->lock);
17383 +               if (ret)
17384 +                       goto out_put_keys;
17385 +       }
17386  
17387         /*
17388 -        * In order for us to be here, we know our q.key == key2, and since
17389 -        * we took the hb->lock above, we also know that futex_requeue() has
17390 -        * completed and we no longer have to concern ourselves with a wakeup
17391 -        * race with the atomic proxy lock acquisition by the requeue code. The
17392 -        * futex_requeue dropped our key1 reference and incremented our key2
17393 -        * reference count.
17394 +        * In order to be here, we have either been requeued, are in
17395 +        * the process of being requeued, or requeue successfully
17396 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
17397 +        * non-null above, we may be racing with a requeue.  Do not
17398 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
17399 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
17400 +        * reference and incremented our key2 reference count.
17401          */
17402 +       hb2 = hash_futex(&key2);
17403  
17404         /* Check if the requeue code acquired the second futex for us. */
17405         if (!q.rt_waiter) {
17406 @@ -2772,14 +2819,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17407                  * did a lock-steal - fix up the PI-state in that case.
17408                  */
17409                 if (q.pi_state && (q.pi_state->owner != current)) {
17410 -                       spin_lock(q.lock_ptr);
17411 +                       spin_lock(&hb2->lock);
17412 +                       BUG_ON(&hb2->lock != q.lock_ptr);
17413                         ret = fixup_pi_state_owner(uaddr2, &q, current);
17414                         /*
17415                          * Drop the reference to the pi state which
17416                          * the requeue_pi() code acquired for us.
17417                          */
17418                         free_pi_state(q.pi_state);
17419 -                       spin_unlock(q.lock_ptr);
17420 +                       spin_unlock(&hb2->lock);
17421                 }
17422         } else {
17423                 /*
17424 @@ -2792,7 +2840,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17425                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
17426                 debug_rt_mutex_free_waiter(&rt_waiter);
17427  
17428 -               spin_lock(q.lock_ptr);
17429 +               spin_lock(&hb2->lock);
17430 +               BUG_ON(&hb2->lock != q.lock_ptr);
17431                 /*
17432                  * Fixup the pi_state owner and possibly acquire the lock if we
17433                  * haven't already.
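Besides switching pi_mutex.wait_lock to the irq-disabling lock variants, the futex.c changes add a careful dance in futex_wait_requeue_pi(): under pi_lock the waiter publishes PI_WAKEUP_INPROGRESS so a concurrent requeue backs off (returning -EAGAIN to the requeue loop above), only then blocks on hb->lock, and finally clears the marker again under pi_lock. The sketch below reproduces just that announce/block/clear ordering with pthread mutexes; the lock names and the WAKEUP_INPROGRESS value are illustrative stand-ins.

    #include <pthread.h>
    #include <stddef.h>

    static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;  /* current->pi_lock */
    static pthread_mutex_t hb_lock = PTHREAD_MUTEX_INITIALIZER;  /* hb->lock */
    static void *pi_blocked_on;                                  /* protected by pi_lock */

    #define WAKEUP_INPROGRESS ((void *)1)

    static int early_wakeup_check(void)
    {
        pthread_mutex_lock(&pi_lock);
        if (pi_blocked_on) {
            /* already requeued (or being requeued): do not touch hb_lock */
            pthread_mutex_unlock(&pi_lock);
            return 0;
        }
        pi_blocked_on = WAKEUP_INPROGRESS;      /* requeuers now back off */
        pthread_mutex_unlock(&pi_lock);

        pthread_mutex_lock(&hb_lock);           /* may block; that is safe now */

        pthread_mutex_lock(&pi_lock);
        pi_blocked_on = NULL;                   /* clean up the marker */
        pthread_mutex_unlock(&pi_lock);

        /* the handle_early_requeue_pi_wakeup() equivalent would run here */
        pthread_mutex_unlock(&hb_lock);
        return 1;
    }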
17434 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17435 index 57bff7857e87..6c65c9252991 100644
17436 --- a/kernel/irq/handle.c
17437 +++ b/kernel/irq/handle.c
17438 @@ -134,6 +134,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
17439  
17440  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17441  {
17442 +       struct pt_regs *regs = get_irq_regs();
17443 +       u64 ip = regs ? instruction_pointer(regs) : 0;
17444         irqreturn_t retval = IRQ_NONE;
17445         unsigned int flags = 0, irq = desc->irq_data.irq;
17446         struct irqaction *action = desc->action;
17447 @@ -176,7 +178,11 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17448                 action = action->next;
17449         }
17450  
17451 -       add_interrupt_randomness(irq, flags);
17452 +#ifdef CONFIG_PREEMPT_RT_FULL
17453 +       desc->random_ip = ip;
17454 +#else
17455 +       add_interrupt_randomness(irq, flags, ip);
17456 +#endif
17457  
17458         if (!noirqdebug)
17459                 note_interrupt(desc, retval);
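With PREEMPT_RT_FULL the hard-irq path above no longer mixes entropy directly; it only records the interrupted instruction pointer in desc->random_ip, and the threaded handler (see the kernel/irq/manage.c hunk further down) later feeds it to add_interrupt_randomness() with migration disabled. The user-space fragment below illustrates that split (cheap capture in the hot path, mixing deferred to thread context); the fake_desc type and the toy mixing function are stand-ins, not kernel code.

    #include <stdint.h>
    #include <time.h>

    struct fake_desc {
        uint64_t random_ip;             /* like the irq_desc::random_ip this patch adds */
    };

    static uint64_t entropy_pool;

    static void hard_irq_path(struct fake_desc *desc)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        desc->random_ip = (uint64_t)ts.tv_nsec;     /* just stash, no heavy work */
    }

    static void threaded_handler_path(struct fake_desc *desc, uint64_t cookie)
    {
        /* the deferred add_interrupt_randomness(): mix the stashed sample in */
        entropy_pool ^= desc->random_ip ^ cookie;
        entropy_pool = (entropy_pool << 7) | (entropy_pool >> 57);
    }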
17460 diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
17461 index 239e2ae2c947..0b73349a42d5 100644
17462 --- a/kernel/irq/irqdesc.c
17463 +++ b/kernel/irq/irqdesc.c
17464 @@ -24,10 +24,27 @@
17465  static struct lock_class_key irq_desc_lock_class;
17466  
17467  #if defined(CONFIG_SMP)
17468 +static int __init irq_affinity_setup(char *str)
17469 +{
17470 +       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17471 +       cpulist_parse(str, irq_default_affinity);
17472 +       /*
17473 +        * Set at least the boot cpu. We don't want to end up with
17474 +        * bug reports caused by random command line masks
17475 +        */
17476 +       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
17477 +       return 1;
17478 +}
17479 +__setup("irqaffinity=", irq_affinity_setup);
17480 +
17481  static void __init init_irq_default_affinity(void)
17482  {
17483 -       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17484 -       cpumask_setall(irq_default_affinity);
17485 +#ifdef CONFIG_CPUMASK_OFFSTACK
17486 +       if (!irq_default_affinity)
17487 +               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17488 +#endif
17489 +       if (cpumask_empty(irq_default_affinity))
17490 +               cpumask_setall(irq_default_affinity);
17491  }
17492  #else
17493  static void __init init_irq_default_affinity(void)
17494 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17495 index 6ead200370da..8e89554aa345 100644
17496 --- a/kernel/irq/manage.c
17497 +++ b/kernel/irq/manage.c
17498 @@ -22,6 +22,7 @@
17499  #include "internals.h"
17500  
17501  #ifdef CONFIG_IRQ_FORCED_THREADING
17502 +# ifndef CONFIG_PREEMPT_RT_BASE
17503  __read_mostly bool force_irqthreads;
17504  
17505  static int __init setup_forced_irqthreads(char *arg)
17506 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
17507         return 0;
17508  }
17509  early_param("threadirqs", setup_forced_irqthreads);
17510 +# endif
17511  #endif
17512  
17513  static void __synchronize_hardirq(struct irq_desc *desc)
17514 @@ -181,6 +183,62 @@ static inline void
17515  irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
17516  #endif
17517  
17518 +#ifdef CONFIG_PREEMPT_RT_FULL
17519 +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
17520 +static struct task_struct *set_affinity_helper;
17521 +static LIST_HEAD(affinity_list);
17522 +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
17523 +
17524 +static int set_affinity_thread(void *unused)
17525 +{
17526 +       while (1) {
17527 +               struct irq_affinity_notify *notify;
17528 +               int empty;
17529 +
17530 +               set_current_state(TASK_INTERRUPTIBLE);
17531 +
17532 +               raw_spin_lock_irq(&affinity_list_lock);
17533 +               empty = list_empty(&affinity_list);
17534 +               raw_spin_unlock_irq(&affinity_list_lock);
17535 +
17536 +               if (empty)
17537 +                       schedule();
17538 +               if (kthread_should_stop())
17539 +                       break;
17540 +               set_current_state(TASK_RUNNING);
17541 +try_next:
17542 +               notify = NULL;
17543 +
17544 +               raw_spin_lock_irq(&affinity_list_lock);
17545 +               if (!list_empty(&affinity_list)) {
17546 +                       notify = list_first_entry(&affinity_list,
17547 +                                       struct irq_affinity_notify, list);
17548 +                       list_del_init(&notify->list);
17549 +               }
17550 +               raw_spin_unlock_irq(&affinity_list_lock);
17551 +
17552 +               if (!notify)
17553 +                       continue;
17554 +               _irq_affinity_notify(notify);
17555 +               goto try_next;
17556 +       }
17557 +       return 0;
17558 +}
17559 +
17560 +static void init_helper_thread(void)
17561 +{
17562 +       if (set_affinity_helper)
17563 +               return;
17564 +       set_affinity_helper = kthread_run(set_affinity_thread, NULL,
17565 +                       "affinity-cb");
17566 +       WARN_ON(IS_ERR(set_affinity_helper));
17567 +}
17568 +#else
17569 +
17570 +static inline void init_helper_thread(void) { }
17571 +
17572 +#endif
17573 +
17574  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
17575                         bool force)
17576  {
17577 @@ -220,7 +278,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
17578  
17579         if (desc->affinity_notify) {
17580                 kref_get(&desc->affinity_notify->kref);
17581 +
17582 +#ifdef CONFIG_PREEMPT_RT_FULL
17583 +               raw_spin_lock(&affinity_list_lock);
17584 +               if (list_empty(&desc->affinity_notify->list))
17585 +                       list_add_tail(&affinity_list,
17586 +                                       &desc->affinity_notify->list);
17587 +               raw_spin_unlock(&affinity_list_lock);
17588 +               wake_up_process(set_affinity_helper);
17589 +#else
17590                 schedule_work(&desc->affinity_notify->work);
17591 +#endif
17592         }
17593         irqd_set(data, IRQD_AFFINITY_SET);
17594  
17595 @@ -258,10 +326,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
17596  }
17597  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17598  
17599 -static void irq_affinity_notify(struct work_struct *work)
17600 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
17601  {
17602 -       struct irq_affinity_notify *notify =
17603 -               container_of(work, struct irq_affinity_notify, work);
17604         struct irq_desc *desc = irq_to_desc(notify->irq);
17605         cpumask_var_t cpumask;
17606         unsigned long flags;
17607 @@ -283,6 +349,13 @@ out:
17608         kref_put(&notify->kref, notify->release);
17609  }
17610  
17611 +static void irq_affinity_notify(struct work_struct *work)
17612 +{
17613 +       struct irq_affinity_notify *notify =
17614 +               container_of(work, struct irq_affinity_notify, work);
17615 +       _irq_affinity_notify(notify);
17616 +}
17617 +
17618  /**
17619   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
17620   *     @irq:           Interrupt for which to enable/disable notification
17621 @@ -312,6 +385,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17622                 notify->irq = irq;
17623                 kref_init(&notify->kref);
17624                 INIT_WORK(&notify->work, irq_affinity_notify);
17625 +               INIT_LIST_HEAD(&notify->list);
17626 +               init_helper_thread();
17627         }
17628  
17629         raw_spin_lock_irqsave(&desc->lock, flags);
17630 @@ -865,7 +940,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17631         local_bh_disable();
17632         ret = action->thread_fn(action->irq, action->dev_id);
17633         irq_finalize_oneshot(desc, action);
17634 -       local_bh_enable();
17635 +       /*
17636 +        * Interrupts which have real time requirements can be set up
17637 +        * to avoid softirq processing in the thread handler. This is
17638 +        * safe as these interrupts do not raise soft interrupts.
17639 +        */
17640 +       if (irq_settings_no_softirq_call(desc))
17641 +               _local_bh_enable();
17642 +       else
17643 +               local_bh_enable();
17644         return ret;
17645  }
17646  
17647 @@ -962,6 +1045,12 @@ static int irq_thread(void *data)
17648                 if (action_ret == IRQ_WAKE_THREAD)
17649                         irq_wake_secondary(desc, action);
17650  
17651 +#ifdef CONFIG_PREEMPT_RT_FULL
17652 +               migrate_disable();
17653 +               add_interrupt_randomness(action->irq, 0,
17654 +                                desc->random_ip ^ (unsigned long) action);
17655 +               migrate_enable();
17656 +#endif
17657                 wake_threads_waitq(desc);
17658         }
17659  
17660 @@ -1315,6 +1404,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17661                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17662                 }
17663  
17664 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17665 +                       irq_settings_set_no_softirq_call(desc);
17666 +
17667                 /* Set default affinity mask once everything is setup */
17668                 setup_affinity(desc, mask);
17669  
17670 @@ -1968,7 +2060,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17671   *     This call sets the internal irqchip state of an interrupt,
17672   *     depending on the value of @which.
17673   *
17674 - *     This function should be called with preemption disabled if the
17675 + *     This function should be called with migration disabled if the
17676   *     interrupt controller has per-cpu registers.
17677   */
17678  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
17679 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17680 index 320579d89091..2df2d4445b1e 100644
17681 --- a/kernel/irq/settings.h
17682 +++ b/kernel/irq/settings.h
17683 @@ -16,6 +16,7 @@ enum {
17684         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
17685         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
17686         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
17687 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
17688         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
17689  };
17690  
17691 @@ -30,6 +31,7 @@ enum {
17692  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
17693  #define IRQ_IS_POLLED          GOT_YOU_MORON
17694  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
17695 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
17696  #undef IRQF_MODIFY_MASK
17697  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
17698  
17699 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17700         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17701  }
17702  
17703 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17704 +{
17705 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17706 +}
17707 +
17708 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17709 +{
17710 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17711 +}
17712 +
17713  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17714  {
17715         return desc->status_use_accessors & _IRQ_PER_CPU;
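Together, the manage.c and settings.h hunks plumb the new IRQF_NO_SOFTIRQ_CALL flag through to irq_forced_thread_fn(), which then re-enables bottom halves with _local_bh_enable() so no softirq processing runs in that handler thread. A driver-side sketch of requesting such an interrupt is shown below; it only builds against a tree with this patch applied, and the IRQ number, names and handlers are placeholders.

    #include <linux/interrupt.h>
    #include <linux/module.h>

    #define MYDEV_IRQ 42                    /* placeholder IRQ number */

    static irqreturn_t mydev_hardirq(int irq, void *dev_id)
    {
        return IRQ_WAKE_THREAD;
    }

    static irqreturn_t mydev_thread(int irq, void *dev_id)
    {
        /* latency-sensitive work; must not itself raise softirqs */
        return IRQ_HANDLED;
    }

    static int __init mydev_init(void)
    {
        return request_threaded_irq(MYDEV_IRQ, mydev_hardirq, mydev_thread,
                                    IRQF_NO_SOFTIRQ_CALL, "mydev", NULL);
    }

    static void __exit mydev_exit(void)
    {
        free_irq(MYDEV_IRQ, NULL);
    }

    module_init(mydev_init);
    module_exit(mydev_exit);
    MODULE_LICENSE("GPL");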
17716 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17717 index 32144175458d..ed26f2554972 100644
17718 --- a/kernel/irq/spurious.c
17719 +++ b/kernel/irq/spurious.c
17720 @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
17721  
17722  static int __init irqfixup_setup(char *str)
17723  {
17724 +#ifdef CONFIG_PREEMPT_RT_BASE
17725 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17726 +       return 1;
17727 +#endif
17728         irqfixup = 1;
17729         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17730         printk(KERN_WARNING "This may impact system performance.\n");
17731 @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
17732  
17733  static int __init irqpoll_setup(char *str)
17734  {
17735 +#ifdef CONFIG_PREEMPT_RT_BASE
17736 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17737 +       return 1;
17738 +#endif
17739         irqfixup = 2;
17740         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17741                                 "enabled\n");
17742 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17743 index bcf107ce0854..2899ba0d23d1 100644
17744 --- a/kernel/irq_work.c
17745 +++ b/kernel/irq_work.c
17746 @@ -17,6 +17,7 @@
17747  #include <linux/cpu.h>
17748  #include <linux/notifier.h>
17749  #include <linux/smp.h>
17750 +#include <linux/interrupt.h>
17751  #include <asm/processor.h>
17752  
17753  
17754 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
17755   */
17756  bool irq_work_queue_on(struct irq_work *work, int cpu)
17757  {
17758 +       struct llist_head *list;
17759 +
17760         /* All work should have been flushed before going offline */
17761         WARN_ON_ONCE(cpu_is_offline(cpu));
17762  
17763 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17764         if (!irq_work_claim(work))
17765                 return false;
17766  
17767 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17768 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17769 +               list = &per_cpu(lazy_list, cpu);
17770 +       else
17771 +               list = &per_cpu(raised_list, cpu);
17772 +
17773 +       if (llist_add(&work->llnode, list))
17774                 arch_send_call_function_single_ipi(cpu);
17775  
17776         return true;
17777 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17778  /* Enqueue the irq work @work on the current CPU */
17779  bool irq_work_queue(struct irq_work *work)
17780  {
17781 +       struct llist_head *list;
17782 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17783 +
17784         /* Only queue if not already pending */
17785         if (!irq_work_claim(work))
17786                 return false;
17787 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17788         /* Queue the entry and raise the IPI if needed. */
17789         preempt_disable();
17790  
17791 -       /* If the work is "lazy", handle it from next tick if any */
17792 -       if (work->flags & IRQ_WORK_LAZY) {
17793 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17794 -                   tick_nohz_tick_stopped())
17795 -                       arch_irq_work_raise();
17796 -       } else {
17797 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17798 +       lazy_work = work->flags & IRQ_WORK_LAZY;
17799 +
17800 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17801 +               list = this_cpu_ptr(&lazy_list);
17802 +       else
17803 +               list = this_cpu_ptr(&raised_list);
17804 +
17805 +       if (llist_add(&work->llnode, list)) {
17806 +               if (!lazy_work || tick_nohz_tick_stopped())
17807                         arch_irq_work_raise();
17808         }
17809  
17810 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17811         raised = this_cpu_ptr(&raised_list);
17812         lazy = this_cpu_ptr(&lazy_list);
17813  
17814 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
17815 -               if (llist_empty(lazy))
17816 -                       return false;
17817 +       if (llist_empty(raised) && llist_empty(lazy))
17818 +               return false;
17819  
17820         /* All work should have been flushed before going offline */
17821         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17822 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17823         struct irq_work *work;
17824         struct llist_node *llnode;
17825  
17826 -       BUG_ON(!irqs_disabled());
17827 +       BUG_ON_NONRT(!irqs_disabled());
17828  
17829         if (llist_empty(list))
17830                 return;
17831 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
17832  void irq_work_run(void)
17833  {
17834         irq_work_run_list(this_cpu_ptr(&raised_list));
17835 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
17836 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17837 +               /*
17838 +                * NOTE: we raise softirq via IPI for safety,
17839 +                * and execute in irq_work_tick() to move the
17840 +                * overhead from hard to soft irq context.
17841 +                */
17842 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
17843 +                       raise_softirq(TIMER_SOFTIRQ);
17844 +       } else
17845 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17846  }
17847  EXPORT_SYMBOL_GPL(irq_work_run);
17848  
17849 @@ -179,8 +200,17 @@ void irq_work_tick(void)
17850  
17851         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17852                 irq_work_run_list(raised);
17853 +
17854 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17855 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17856 +}
17857 +
17858 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17859 +void irq_work_tick_soft(void)
17860 +{
17861         irq_work_run_list(this_cpu_ptr(&lazy_list));
17862  }
17863 +#endif
17864  
17865  /*
17866   * Synchronize against the irq_work @entry, ensures the entry is not
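On RT the irq_work changes route everything that is not explicitly marked IRQ_WORK_HARD_IRQ onto the per-CPU lazy_list, which is then drained from irq_work_tick_soft() in TIMER_SOFTIRQ context rather than from the hard interrupt. The fragment below models just the routing decision from irq_work_queue(); the flag values, the list handling and the realtime constant are illustrative, not the kernel API.

    #include <stdbool.h>
    #include <stddef.h>

    #define WORK_LAZY       0x1
    #define WORK_HARD_IRQ   0x2

    struct work {
        unsigned int flags;
        void (*func)(struct work *);
        struct work *next;
    };

    static struct work *raised_list;    /* run from the IPI / hard irq */
    static struct work *lazy_list;      /* run later, from softirq on RT */
    static const bool realtime = true;  /* models IS_ENABLED(CONFIG_PREEMPT_RT_FULL) */

    static bool queue_work_item(struct work *w)
    {
        struct work **list;
        bool lazy_work = w->flags & WORK_LAZY;

        if (lazy_work || (realtime && !(w->flags & WORK_HARD_IRQ)))
            list = &lazy_list;
        else
            list = &raised_list;

        w->next = *list;
        *list = w;

        /* mirror the patch: only non-lazy work needs the interrupt kicked now
         * (the stopped-tick case handled by the kernel is ignored here) */
        return !lazy_work;
    }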
17867 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17868 index e83b26464061..c0e08d1cf33e 100644
17869 --- a/kernel/ksysfs.c
17870 +++ b/kernel/ksysfs.c
17871 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
17872  
17873  #endif /* CONFIG_KEXEC_CORE */
17874  
17875 +#if defined(CONFIG_PREEMPT_RT_FULL)
17876 +static ssize_t  realtime_show(struct kobject *kobj,
17877 +                             struct kobj_attribute *attr, char *buf)
17878 +{
17879 +       return sprintf(buf, "%d\n", 1);
17880 +}
17881 +KERNEL_ATTR_RO(realtime);
17882 +#endif
17883 +
17884  /* whether file capabilities are enabled */
17885  static ssize_t fscaps_show(struct kobject *kobj,
17886                                   struct kobj_attribute *attr, char *buf)
17887 @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = {
17888         &vmcoreinfo_attr.attr,
17889  #endif
17890         &rcu_expedited_attr.attr,
17891 +#ifdef CONFIG_PREEMPT_RT_FULL
17892 +       &realtime_attr.attr,
17893 +#endif
17894         NULL
17895  };
17896  
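The ksysfs.c hunk exposes a read-only marker under the kernel kobject, so a PREEMPT_RT_FULL kernel can be detected from user space by reading /sys/kernel/realtime, which contains "1". A small stand-alone checker:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        if (f) {
            if (fscanf(f, "%d", &rt) != 1)
                rt = 0;                 /* unreadable: treat as non-RT */
            fclose(f);
        }

        printf("PREEMPT_RT_FULL kernel: %s\n", rt == 1 ? "yes" : "no");
        return 0;
    }

On kernels without this patch the file is simply absent, so the check degrades to "no" rather than failing.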
17897 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17898 index 8e96f6cc2a4a..447b03082d88 100644
17899 --- a/kernel/locking/Makefile
17900 +++ b/kernel/locking/Makefile
17901 @@ -1,5 +1,5 @@
17902  
17903 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17904 +obj-y += semaphore.o percpu-rwsem.o
17905  
17906  ifdef CONFIG_FUNCTION_TRACER
17907  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17908 @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17909  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17910  endif
17911  
17912 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17913 +obj-y += mutex.o
17914  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17915 +obj-y += rwsem.o
17916 +endif
17917  obj-$(CONFIG_LOCKDEP) += lockdep.o
17918  ifeq ($(CONFIG_PROC_FS),y)
17919  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17920 @@ -22,7 +26,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17921  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17922  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17923  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17924 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17925  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17926  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17927 +endif
17928 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
17929  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17930  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17931 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
17932 index 951cfcd10b4a..57e0ea72c28a 100644
17933 --- a/kernel/locking/lglock.c
17934 +++ b/kernel/locking/lglock.c
17935 @@ -4,6 +4,15 @@
17936  #include <linux/cpu.h>
17937  #include <linux/string.h>
17938  
17939 +#ifndef CONFIG_PREEMPT_RT_FULL
17940 +# define lg_lock_ptr           arch_spinlock_t
17941 +# define lg_do_lock(l)         arch_spin_lock(l)
17942 +# define lg_do_unlock(l)       arch_spin_unlock(l)
17943 +#else
17944 +# define lg_lock_ptr           struct rt_mutex
17945 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
17946 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
17947 +#endif
17948  /*
17949   * Note there is no uninit, so lglocks cannot be defined in
17950   * modules (but it's fine to use them from there)
17951 @@ -12,51 +21,60 @@
17952  
17953  void lg_lock_init(struct lglock *lg, char *name)
17954  {
17955 +#ifdef CONFIG_PREEMPT_RT_FULL
17956 +       int i;
17957 +
17958 +       for_each_possible_cpu(i) {
17959 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
17960 +
17961 +               rt_mutex_init(lock);
17962 +       }
17963 +#endif
17964         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
17965  }
17966  EXPORT_SYMBOL(lg_lock_init);
17967  
17968  void lg_local_lock(struct lglock *lg)
17969  {
17970 -       arch_spinlock_t *lock;
17971 +       lg_lock_ptr *lock;
17972  
17973 -       preempt_disable();
17974 +       migrate_disable();
17975         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
17976         lock = this_cpu_ptr(lg->lock);
17977 -       arch_spin_lock(lock);
17978 +       lg_do_lock(lock);
17979  }
17980  EXPORT_SYMBOL(lg_local_lock);
17981  
17982  void lg_local_unlock(struct lglock *lg)
17983  {
17984 -       arch_spinlock_t *lock;
17985 +       lg_lock_ptr *lock;
17986  
17987         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
17988         lock = this_cpu_ptr(lg->lock);
17989 -       arch_spin_unlock(lock);
17990 -       preempt_enable();
17991 +       lg_do_unlock(lock);
17992 +       migrate_enable();
17993  }
17994  EXPORT_SYMBOL(lg_local_unlock);
17995  
17996  void lg_local_lock_cpu(struct lglock *lg, int cpu)
17997  {
17998 -       arch_spinlock_t *lock;
17999 +       lg_lock_ptr *lock;
18000  
18001 -       preempt_disable();
18002 +       preempt_disable_nort();
18003         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18004         lock = per_cpu_ptr(lg->lock, cpu);
18005 -       arch_spin_lock(lock);
18006 +       lg_do_lock(lock);
18007  }
18008  EXPORT_SYMBOL(lg_local_lock_cpu);
18009  
18010  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
18011  {
18012 -       arch_spinlock_t *lock;
18013 +       lg_lock_ptr *lock;
18014  
18015         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18016         lock = per_cpu_ptr(lg->lock, cpu);
18017 -       arch_spin_unlock(lock);
18018 -       preempt_enable();
18019 +       lg_do_unlock(lock);
18020 +       preempt_enable_nort();
18021  }
18022  EXPORT_SYMBOL(lg_local_unlock_cpu);
18023  
18024 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
18025         if (cpu2 < cpu1)
18026                 swap(cpu1, cpu2);
18027  
18028 -       preempt_disable();
18029 +       preempt_disable_nort();
18030         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18031 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
18032 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
18033 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
18034 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
18035  }
18036  
18037  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
18038  {
18039         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18040 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
18041 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
18042 -       preempt_enable();
18043 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
18044 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
18045 +       preempt_enable_nort();
18046  }
18047  
18048  void lg_global_lock(struct lglock *lg)
18049  {
18050         int i;
18051  
18052 -       preempt_disable();
18053 +       preempt_disable_nort();
18054         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18055         for_each_possible_cpu(i) {
18056 -               arch_spinlock_t *lock;
18057 +               lg_lock_ptr *lock;
18058                 lock = per_cpu_ptr(lg->lock, i);
18059 -               arch_spin_lock(lock);
18060 +               lg_do_lock(lock);
18061         }
18062  }
18063  EXPORT_SYMBOL(lg_global_lock);
18064 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
18065  
18066         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18067         for_each_possible_cpu(i) {
18068 -               arch_spinlock_t *lock;
18069 +               lg_lock_ptr *lock;
18070                 lock = per_cpu_ptr(lg->lock, i);
18071 -               arch_spin_unlock(lock);
18072 +               lg_do_unlock(lock);
18073         }
18074 -       preempt_enable();
18075 +       preempt_enable_nort();
18076  }
18077  EXPORT_SYMBOL(lg_global_unlock);
18078 +
18079 +#ifdef CONFIG_PREEMPT_RT_FULL
18080 +/*
18081 + * HACK: If you use this, you get to keep the pieces.
18082 + * Used in queue_stop_cpus_work() when stop machinery
18083 + * is called from inactive CPU, so we can't schedule.
18084 + */
18085 +# define lg_do_trylock_relax(l)                        \
18086 +       do {                                    \
18087 +               while (!__rt_spin_trylock(l))   \
18088 +                       cpu_relax();            \
18089 +       } while (0)
18090 +
18091 +void lg_global_trylock_relax(struct lglock *lg)
18092 +{
18093 +       int i;
18094 +
18095 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18096 +       for_each_possible_cpu(i) {
18097 +               lg_lock_ptr *lock;
18098 +               lock = per_cpu_ptr(lg->lock, i);
18099 +               lg_do_trylock_relax(lock);
18100 +       }
18101 +}
18102 +#endif
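Editor's note: callers of the lglock API are untouched by this conversion; only the backing per-CPU lock changes from an arch spinlock to an rt_mutex, so the critical sections become preemptible on RT. A minimal caller-side sketch under that assumption — example_lglock, example_init and example_update are illustrative names, not part of the patch:

#include <linux/init.h>
#include <linux/lglock.h>

DEFINE_STATIC_LGLOCK(example_lglock);

static int __init example_init(void)
{
        lg_lock_init(&example_lglock, "example_lglock");
        return 0;
}
core_initcall(example_init);

static void example_update(void)
{
        lg_local_lock(&example_lglock);   /* arch_spin_lock, or rt_mutex on RT */
        /* modify this CPU's share of the protected data */
        lg_local_unlock(&example_lglock);
}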
18103 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
18104 index 60ace56618f6..e98ee958a353 100644
18105 --- a/kernel/locking/lockdep.c
18106 +++ b/kernel/locking/lockdep.c
18107 @@ -3525,6 +3525,7 @@ static void check_flags(unsigned long flags)
18108                 }
18109         }
18110  
18111 +#ifndef CONFIG_PREEMPT_RT_FULL
18112         /*
18113          * We dont accurately track softirq state in e.g.
18114          * hardirq contexts (such as on 4KSTACKS), so only
18115 @@ -3539,6 +3540,7 @@ static void check_flags(unsigned long flags)
18116                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
18117                 }
18118         }
18119 +#endif
18120  
18121         if (!debug_locks)
18122                 print_irqtrace_events(current);
18123 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
18124 index 8ef1919d63b2..291fc19e28e0 100644
18125 --- a/kernel/locking/locktorture.c
18126 +++ b/kernel/locking/locktorture.c
18127 @@ -26,7 +26,6 @@
18128  #include <linux/kthread.h>
18129  #include <linux/sched/rt.h>
18130  #include <linux/spinlock.h>
18131 -#include <linux/rwlock.h>
18132  #include <linux/mutex.h>
18133  #include <linux/rwsem.h>
18134  #include <linux/smp.h>
18135 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
18136 new file mode 100644
18137 index 000000000000..d4ab61c1848b
18138 --- /dev/null
18139 +++ b/kernel/locking/rt.c
18140 @@ -0,0 +1,474 @@
18141 +/*
18142 + * kernel/rt.c
18143 + *
18144 + * Real-Time Preemption Support
18145 + *
18146 + * started by Ingo Molnar:
18147 + *
18148 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18149 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18150 + *
18151 + * historic credit for proving that Linux spinlocks can be implemented via
18152 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
18153 + * and others) who prototyped it on 2.4 and did lots of comparative
18154 + * research and analysis; TimeSys, for proving that you can implement a
18155 + * fully preemptible kernel via the use of IRQ threading and mutexes;
18156 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
18157 + * right one; and to MontaVista, who ported pmutexes to 2.6.
18158 + *
18159 + * This code is a from-scratch implementation and is not based on pmutexes,
18160 + * but the idea of converting spinlocks to mutexes is used here too.
18161 + *
18162 + * lock debugging, locking tree, deadlock detection:
18163 + *
18164 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
18165 + *  Released under the General Public License (GPL).
18166 + *
18167 + * Includes portions of the generic R/W semaphore implementation from:
18168 + *
18169 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
18170 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
18171 + *  - Derived also from comments by Linus
18172 + *
18173 + * Pending ownership of locks and ownership stealing:
18174 + *
18175 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
18176 + *
18177 + *   (also by Steven Rostedt)
18178 + *    - Converted single pi_lock to individual task locks.
18179 + *
18180 + * By Esben Nielsen:
18181 + *    Doing priority inheritance with help of the scheduler.
18182 + *
18183 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18184 + *  - major rework based on Esben Nielsen's initial patch
18185 + *  - replaced thread_info references by task_struct refs
18186 + *  - removed task->pending_owner dependency
18187 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
18188 + *    in the scheduler return path as discussed with Steven Rostedt
18189 + *
18190 + *  Copyright (C) 2006, Kihon Technologies Inc.
18191 + *    Steven Rostedt <rostedt@goodmis.org>
18192 + *  - debugged and patched Thomas Gleixner's rework.
18193 + *  - added back the cmpxchg to the rework.
18194 + *  - turned atomic require back on for SMP.
18195 + */
18196 +
18197 +#include <linux/spinlock.h>
18198 +#include <linux/rtmutex.h>
18199 +#include <linux/sched.h>
18200 +#include <linux/delay.h>
18201 +#include <linux/module.h>
18202 +#include <linux/kallsyms.h>
18203 +#include <linux/syscalls.h>
18204 +#include <linux/interrupt.h>
18205 +#include <linux/plist.h>
18206 +#include <linux/fs.h>
18207 +#include <linux/futex.h>
18208 +#include <linux/hrtimer.h>
18209 +
18210 +#include "rtmutex_common.h"
18211 +
18212 +/*
18213 + * struct mutex functions
18214 + */
18215 +void __mutex_do_init(struct mutex *mutex, const char *name,
18216 +                    struct lock_class_key *key)
18217 +{
18218 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18219 +       /*
18220 +        * Make sure we are not reinitializing a held lock:
18221 +        */
18222 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18223 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
18224 +#endif
18225 +       mutex->lock.save_state = 0;
18226 +}
18227 +EXPORT_SYMBOL(__mutex_do_init);
18228 +
18229 +void __lockfunc _mutex_lock(struct mutex *lock)
18230 +{
18231 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18232 +       rt_mutex_lock(&lock->lock);
18233 +}
18234 +EXPORT_SYMBOL(_mutex_lock);
18235 +
18236 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18237 +{
18238 +       int ret;
18239 +
18240 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18241 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18242 +       if (ret)
18243 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18244 +       return ret;
18245 +}
18246 +EXPORT_SYMBOL(_mutex_lock_interruptible);
18247 +
18248 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
18249 +{
18250 +       int ret;
18251 +
18252 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18253 +       ret = rt_mutex_lock_killable(&lock->lock);
18254 +       if (ret)
18255 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18256 +       return ret;
18257 +}
18258 +EXPORT_SYMBOL(_mutex_lock_killable);
18259 +
18260 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18261 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18262 +{
18263 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18264 +       rt_mutex_lock(&lock->lock);
18265 +}
18266 +EXPORT_SYMBOL(_mutex_lock_nested);
18267 +
18268 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18269 +{
18270 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18271 +       rt_mutex_lock(&lock->lock);
18272 +}
18273 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
18274 +
18275 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18276 +{
18277 +       int ret;
18278 +
18279 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18280 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18281 +       if (ret)
18282 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18283 +       return ret;
18284 +}
18285 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18286 +
18287 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18288 +{
18289 +       int ret;
18290 +
18291 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18292 +       ret = rt_mutex_lock_killable(&lock->lock);
18293 +       if (ret)
18294 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18295 +       return ret;
18296 +}
18297 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
18298 +#endif
18299 +
18300 +int __lockfunc _mutex_trylock(struct mutex *lock)
18301 +{
18302 +       int ret = rt_mutex_trylock(&lock->lock);
18303 +
18304 +       if (ret)
18305 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18306 +
18307 +       return ret;
18308 +}
18309 +EXPORT_SYMBOL(_mutex_trylock);
18310 +
18311 +void __lockfunc _mutex_unlock(struct mutex *lock)
18312 +{
18313 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
18314 +       rt_mutex_unlock(&lock->lock);
18315 +}
18316 +EXPORT_SYMBOL(_mutex_unlock);
18317 +
18318 +/*
18319 + * rwlock_t functions
18320 + */
18321 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
18322 +{
18323 +       int ret;
18324 +
18325 +       migrate_disable();
18326 +       ret = rt_mutex_trylock(&rwlock->lock);
18327 +       if (ret)
18328 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18329 +       else
18330 +               migrate_enable();
18331 +
18332 +       return ret;
18333 +}
18334 +EXPORT_SYMBOL(rt_write_trylock);
18335 +
18336 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
18337 +{
18338 +       int ret;
18339 +
18340 +       *flags = 0;
18341 +       ret = rt_write_trylock(rwlock);
18342 +       return ret;
18343 +}
18344 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
18345 +
18346 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
18347 +{
18348 +       struct rt_mutex *lock = &rwlock->lock;
18349 +       int ret = 1;
18350 +
18351 +       /*
18352 +        * recursive read locks succeed when current owns the lock,
18353 +        * but not when read_depth == 0 which means that the lock is
18354 +        * write locked.
18355 +        */
18356 +       if (rt_mutex_owner(lock) != current) {
18357 +               migrate_disable();
18358 +               ret = rt_mutex_trylock(lock);
18359 +               if (ret)
18360 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18361 +               else
18362 +                       migrate_enable();
18363 +
18364 +       } else if (!rwlock->read_depth) {
18365 +               ret = 0;
18366 +       }
18367 +
18368 +       if (ret)
18369 +               rwlock->read_depth++;
18370 +
18371 +       return ret;
18372 +}
18373 +EXPORT_SYMBOL(rt_read_trylock);
18374 +
18375 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
18376 +{
18377 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18378 +       __rt_spin_lock(&rwlock->lock);
18379 +}
18380 +EXPORT_SYMBOL(rt_write_lock);
18381 +
18382 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
18383 +{
18384 +       struct rt_mutex *lock = &rwlock->lock;
18385 +
18386 +
18387 +       /*
18388 +        * recursive read locks succeed when current owns the lock
18389 +        */
18390 +       if (rt_mutex_owner(lock) != current) {
18391 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18392 +               __rt_spin_lock(lock);
18393 +       }
18394 +       rwlock->read_depth++;
18395 +}
18396 +
18397 +EXPORT_SYMBOL(rt_read_lock);
18398 +
18399 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
18400 +{
18401 +       /* NOTE: we always pass in '1' for nested, for simplicity */
18402 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18403 +       __rt_spin_unlock(&rwlock->lock);
18404 +       migrate_enable();
18405 +}
18406 +EXPORT_SYMBOL(rt_write_unlock);
18407 +
18408 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
18409 +{
18410 +       /* Release the lock only when read_depth is down to 0 */
18411 +       if (--rwlock->read_depth == 0) {
18412 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18413 +               __rt_spin_unlock(&rwlock->lock);
18414 +               migrate_enable();
18415 +       }
18416 +}
18417 +EXPORT_SYMBOL(rt_read_unlock);
18418 +
18419 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
18420 +{
18421 +       rt_write_lock(rwlock);
18422 +
18423 +       return 0;
18424 +}
18425 +EXPORT_SYMBOL(rt_write_lock_irqsave);
18426 +
18427 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
18428 +{
18429 +       rt_read_lock(rwlock);
18430 +
18431 +       return 0;
18432 +}
18433 +EXPORT_SYMBOL(rt_read_lock_irqsave);
18434 +
18435 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
18436 +{
18437 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18438 +       /*
18439 +        * Make sure we are not reinitializing a held lock:
18440 +        */
18441 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
18442 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
18443 +#endif
18444 +       rwlock->lock.save_state = 1;
18445 +       rwlock->read_depth = 0;
18446 +}
18447 +EXPORT_SYMBOL(__rt_rwlock_init);
18448 +
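Editor's note: because of the read_depth accounting above, rwlock_t readers stay recursive for the owning task on RT, matching the recursive-reader semantics rwlocks already have on non-RT kernels, while writers become plain rt_mutex acquisitions. A hedged caller-side sketch — example_rwlock and example_reader are illustrative names, not part of the patch:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_rwlock);

static void example_reader(void)
{
        read_lock(&example_rwlock);
        read_lock(&example_rwlock);     /* recursion by the same task is allowed */
        /* read the protected data */
        read_unlock(&example_rwlock);
        read_unlock(&example_rwlock);   /* on RT, released once read_depth hits 0 */
}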
18449 +/*
18450 + * rw_semaphores
18451 + */
18452 +
18453 +void  rt_up_write(struct rw_semaphore *rwsem)
18454 +{
18455 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18456 +       rt_mutex_unlock(&rwsem->lock);
18457 +}
18458 +EXPORT_SYMBOL(rt_up_write);
18459 +
18460 +void __rt_up_read(struct rw_semaphore *rwsem)
18461 +{
18462 +       if (--rwsem->read_depth == 0)
18463 +               rt_mutex_unlock(&rwsem->lock);
18464 +}
18465 +
18466 +void  rt_up_read(struct rw_semaphore *rwsem)
18467 +{
18468 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18469 +       __rt_up_read(rwsem);
18470 +}
18471 +EXPORT_SYMBOL(rt_up_read);
18472 +
18473 +/*
18474 + * downgrade a write lock into a read lock
18475 + * - just wake up any readers at the front of the queue
18476 + */
18477 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
18478 +{
18479 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
18480 +       rwsem->read_depth = 1;
18481 +}
18482 +EXPORT_SYMBOL(rt_downgrade_write);
18483 +
18484 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
18485 +{
18486 +       int ret = rt_mutex_trylock(&rwsem->lock);
18487 +
18488 +       if (ret)
18489 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18490 +       return ret;
18491 +}
18492 +EXPORT_SYMBOL(rt_down_write_trylock);
18493 +
18494 +void  rt_down_write(struct rw_semaphore *rwsem)
18495 +{
18496 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
18497 +       rt_mutex_lock(&rwsem->lock);
18498 +}
18499 +EXPORT_SYMBOL(rt_down_write);
18500 +
18501 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
18502 +{
18503 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
18504 +       rt_mutex_lock(&rwsem->lock);
18505 +}
18506 +EXPORT_SYMBOL(rt_down_write_nested);
18507 +
18508 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
18509 +                              struct lockdep_map *nest)
18510 +{
18511 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
18512 +       rt_mutex_lock(&rwsem->lock);
18513 +}
18514 +EXPORT_SYMBOL(rt_down_write_nested_lock);
18515 +
18516 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
18517 +{
18518 +       struct rt_mutex *lock = &rwsem->lock;
18519 +       int ret = 1;
18520 +
18521 +       /*
18522 +        * recursive read locks succeed when current owns the rwsem,
18523 +        * but not when read_depth == 0 which means that the rwsem is
18524 +        * write locked.
18525 +        */
18526 +       if (rt_mutex_owner(lock) != current)
18527 +               ret = rt_mutex_trylock(&rwsem->lock);
18528 +       else if (!rwsem->read_depth)
18529 +               ret = 0;
18530 +
18531 +       if (ret)
18532 +               rwsem->read_depth++;
18533 +       return ret;
18534 +
18535 +}
18536 +
18537 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
18538 +{
18539 +       int ret;
18540 +
18541 +       ret = rt__down_read_trylock(rwsem);
18542 +       if (ret)
18543 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18544 +
18545 +       return ret;
18546 +}
18547 +EXPORT_SYMBOL(rt_down_read_trylock);
18548 +
18549 +void rt__down_read(struct rw_semaphore *rwsem)
18550 +{
18551 +       struct rt_mutex *lock = &rwsem->lock;
18552 +
18553 +       if (rt_mutex_owner(lock) != current)
18554 +               rt_mutex_lock(&rwsem->lock);
18555 +       rwsem->read_depth++;
18556 +}
18557 +EXPORT_SYMBOL(rt__down_read);
18558 +
18559 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
18560 +{
18561 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
18562 +       rt__down_read(rwsem);
18563 +}
18564 +
18565 +void  rt_down_read(struct rw_semaphore *rwsem)
18566 +{
18567 +       __rt_down_read(rwsem, 0);
18568 +}
18569 +EXPORT_SYMBOL(rt_down_read);
18570 +
18571 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
18572 +{
18573 +       __rt_down_read(rwsem, subclass);
18574 +}
18575 +EXPORT_SYMBOL(rt_down_read_nested);
18576 +
18577 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
18578 +                             struct lock_class_key *key)
18579 +{
18580 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18581 +       /*
18582 +        * Make sure we are not reinitializing a held lock:
18583 +        */
18584 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
18585 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
18586 +#endif
18587 +       rwsem->read_depth = 0;
18588 +       rwsem->lock.save_state = 0;
18589 +}
18590 +EXPORT_SYMBOL(__rt_rwsem_init);
18591 +
18592 +/**
18593 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18594 + * @cnt: the atomic which we are to dec
18595 + * @lock: the mutex to return holding if we dec to 0
18596 + *
18597 + * return true and hold lock if we dec to 0, return false otherwise
18598 + */
18599 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18600 +{
18601 +       /* dec if we can't possibly hit 0 */
18602 +       if (atomic_add_unless(cnt, -1, 1))
18603 +               return 0;
18604 +       /* we might hit 0, so take the lock */
18605 +       mutex_lock(lock);
18606 +       if (!atomic_dec_and_test(cnt)) {
18607 +               /* when we actually did the dec, we didn't hit 0 */
18608 +               mutex_unlock(lock);
18609 +               return 0;
18610 +       }
18611 +       /* we hit 0, and we hold the lock */
18612 +       return 1;
18613 +}
18614 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
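Editor's note: atomic_dec_and_mutex_lock() above is the mutex counterpart of atomic_dec_and_lock(); the usual caller is a reference drop that must unlink the object under a lock before freeing it. A hedged usage sketch — struct example, example_lock, example_list and example_put are illustrative names, not part of the patch:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct example {
        struct list_head node;
        atomic_t refcount;
};

static DEFINE_MUTEX(example_lock);      /* protects example_list */
static LIST_HEAD(example_list);

static void example_put(struct example *e)
{
        /* fast path: the count stays above zero, no lock is taken */
        if (!atomic_dec_and_mutex_lock(&e->refcount, &example_lock))
                return;

        /* the count hit zero and example_lock is now held */
        list_del(&e->node);
        mutex_unlock(&example_lock);
        kfree(e);
}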
18615 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18616 index 8251e75dd9c0..fde5e54f1096 100644
18617 --- a/kernel/locking/rtmutex.c
18618 +++ b/kernel/locking/rtmutex.c
18619 @@ -7,6 +7,11 @@
18620   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18621   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18622   *  Copyright (C) 2006 Esben Nielsen
18623 + *  Adaptive Spinlocks:
18624 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18625 + *                                  and Peter Morreale,
18626 + * Adaptive Spinlocks simplification:
18627 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
18628   *
18629   *  See Documentation/locking/rt-mutex-design.txt for details.
18630   */
18631 @@ -16,6 +21,7 @@
18632  #include <linux/sched/rt.h>
18633  #include <linux/sched/deadline.h>
18634  #include <linux/timer.h>
18635 +#include <linux/ww_mutex.h>
18636  
18637  #include "rtmutex_common.h"
18638  
18639 @@ -69,6 +75,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
18640                 clear_rt_mutex_waiters(lock);
18641  }
18642  
18643 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18644 +{
18645 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18646 +               waiter != PI_REQUEUE_INPROGRESS;
18647 +}
18648 +
18649  /*
18650   * We can speed up the acquire/release, if there's no debugging state to be
18651   * set up.
18652 @@ -99,13 +111,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18653   * 2) Drop lock->wait_lock
18654   * 3) Try to unlock the lock with cmpxchg
18655   */
18656 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18657 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18658 +                                       unsigned long flags)
18659         __releases(lock->wait_lock)
18660  {
18661         struct task_struct *owner = rt_mutex_owner(lock);
18662  
18663         clear_rt_mutex_waiters(lock);
18664 -       raw_spin_unlock(&lock->wait_lock);
18665 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18666         /*
18667          * If a new waiter comes in between the unlock and the cmpxchg
18668          * we have two situations:
18669 @@ -147,11 +160,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18670  /*
18671   * Simple slow path only version: lock->owner is protected by lock->wait_lock.
18672   */
18673 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18674 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18675 +                                       unsigned long flags)
18676         __releases(lock->wait_lock)
18677  {
18678         lock->owner = NULL;
18679 -       raw_spin_unlock(&lock->wait_lock);
18680 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18681         return true;
18682  }
18683  #endif
18684 @@ -348,6 +362,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
18685         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18686  }
18687  
18688 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18689 +{
18690 +       if (waiter->savestate)
18691 +               wake_up_lock_sleeper(waiter->task);
18692 +       else
18693 +               wake_up_process(waiter->task);
18694 +}
18695 +
18696  /*
18697   * Max number of times we'll walk the boosting chain:
18698   */
18699 @@ -355,7 +377,8 @@ int max_lock_depth = 1024;
18700  
18701  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18702  {
18703 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18704 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
18705 +               p->pi_blocked_on->lock : NULL;
18706  }
18707  
18708  /*
18709 @@ -433,7 +456,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18710         int ret = 0, depth = 0;
18711         struct rt_mutex *lock;
18712         bool detect_deadlock;
18713 -       unsigned long flags;
18714         bool requeue = true;
18715  
18716         detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
18717 @@ -476,7 +498,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18718         /*
18719          * [1] Task cannot go away as we did a get_task() before !
18720          */
18721 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18722 +       raw_spin_lock_irq(&task->pi_lock);
18723  
18724         /*
18725          * [2] Get the waiter on which @task is blocked on.
18726 @@ -492,7 +514,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18727          * reached or the state of the chain has changed while we
18728          * dropped the locks.
18729          */
18730 -       if (!waiter)
18731 +       if (!rt_mutex_real_waiter(waiter))
18732                 goto out_unlock_pi;
18733  
18734         /*
18735 @@ -560,7 +582,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18736          * operations.
18737          */
18738         if (!raw_spin_trylock(&lock->wait_lock)) {
18739 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18740 +               raw_spin_unlock_irq(&task->pi_lock);
18741                 cpu_relax();
18742                 goto retry;
18743         }
18744 @@ -591,7 +613,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18745                 /*
18746                  * No requeue[7] here. Just release @task [8]
18747                  */
18748 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18749 +               raw_spin_unlock(&task->pi_lock);
18750                 put_task_struct(task);
18751  
18752                 /*
18753 @@ -599,14 +621,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18754                  * If there is no owner of the lock, end of chain.
18755                  */
18756                 if (!rt_mutex_owner(lock)) {
18757 -                       raw_spin_unlock(&lock->wait_lock);
18758 +                       raw_spin_unlock_irq(&lock->wait_lock);
18759                         return 0;
18760                 }
18761  
18762                 /* [10] Grab the next task, i.e. owner of @lock */
18763                 task = rt_mutex_owner(lock);
18764                 get_task_struct(task);
18765 -               raw_spin_lock_irqsave(&task->pi_lock, flags);
18766 +               raw_spin_lock(&task->pi_lock);
18767  
18768                 /*
18769                  * No requeue [11] here. We just do deadlock detection.
18770 @@ -621,8 +643,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18771                 top_waiter = rt_mutex_top_waiter(lock);
18772  
18773                 /* [13] Drop locks */
18774 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18775 -               raw_spin_unlock(&lock->wait_lock);
18776 +               raw_spin_unlock(&task->pi_lock);
18777 +               raw_spin_unlock_irq(&lock->wait_lock);
18778  
18779                 /* If owner is not blocked, end of chain. */
18780                 if (!next_lock)
18781 @@ -643,7 +665,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18782         rt_mutex_enqueue(lock, waiter);
18783  
18784         /* [8] Release the task */
18785 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18786 +       raw_spin_unlock(&task->pi_lock);
18787         put_task_struct(task);
18788  
18789         /*
18790 @@ -654,21 +676,24 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18791          * follow here. This is the end of the chain we are walking.
18792          */
18793         if (!rt_mutex_owner(lock)) {
18794 +               struct rt_mutex_waiter *lock_top_waiter;
18795 +
18796                 /*
18797                  * If the requeue [7] above changed the top waiter,
18798                  * then we need to wake the new top waiter up to try
18799                  * to get the lock.
18800                  */
18801 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18802 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
18803 -               raw_spin_unlock(&lock->wait_lock);
18804 +               lock_top_waiter = rt_mutex_top_waiter(lock);
18805 +               if (prerequeue_top_waiter != lock_top_waiter)
18806 +                       rt_mutex_wake_waiter(lock_top_waiter);
18807 +               raw_spin_unlock_irq(&lock->wait_lock);
18808                 return 0;
18809         }
18810  
18811         /* [10] Grab the next task, i.e. the owner of @lock */
18812         task = rt_mutex_owner(lock);
18813         get_task_struct(task);
18814 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18815 +       raw_spin_lock(&task->pi_lock);
18816  
18817         /* [11] requeue the pi waiters if necessary */
18818         if (waiter == rt_mutex_top_waiter(lock)) {
18819 @@ -722,8 +747,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18820         top_waiter = rt_mutex_top_waiter(lock);
18821  
18822         /* [13] Drop the locks */
18823 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18824 -       raw_spin_unlock(&lock->wait_lock);
18825 +       raw_spin_unlock(&task->pi_lock);
18826 +       raw_spin_unlock_irq(&lock->wait_lock);
18827  
18828         /*
18829          * Make the actual exit decisions [12], based on the stored
18830 @@ -746,28 +771,46 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18831         goto again;
18832  
18833   out_unlock_pi:
18834 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18835 +       raw_spin_unlock_irq(&task->pi_lock);
18836   out_put_task:
18837         put_task_struct(task);
18838  
18839         return ret;
18840  }
18841  
18842 +
18843 +#define STEAL_NORMAL  0
18844 +#define STEAL_LATERAL 1
18845 +
18846 +/*
18847 + * Note that RT tasks are excluded from lateral-steals to prevent the
18848 + * introduction of an unbounded latency
18849 + */
18850 +static inline int lock_is_stealable(struct task_struct *task,
18851 +                                   struct task_struct *pendowner, int mode)
18852 +{
18853 +    if (mode == STEAL_NORMAL || rt_task(task)) {
18854 +           if (task->prio >= pendowner->prio)
18855 +                   return 0;
18856 +    } else if (task->prio > pendowner->prio)
18857 +           return 0;
18858 +    return 1;
18859 +}
18860 +
18861  /*
18862   * Try to take an rt-mutex
18863   *
18864 - * Must be called with lock->wait_lock held.
18865 + * Must be called with lock->wait_lock held and interrupts disabled
18866   *
18867   * @lock:   The lock to be acquired.
18868   * @task:   The task which wants to acquire the lock
18869   * @waiter: The waiter that is queued to the lock's wait tree if the
18870   *         callsite called task_blocked_on_lock(), otherwise NULL
18871   */
18872 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18873 -                               struct rt_mutex_waiter *waiter)
18874 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18875 +                                 struct task_struct *task,
18876 +                                 struct rt_mutex_waiter *waiter, int mode)
18877  {
18878 -       unsigned long flags;
18879 -
18880         /*
18881          * Before testing whether we can acquire @lock, we set the
18882          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
18883 @@ -803,8 +846,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18884                  * If waiter is not the highest priority waiter of
18885                  * @lock, give up.
18886                  */
18887 -               if (waiter != rt_mutex_top_waiter(lock))
18888 +               if (waiter != rt_mutex_top_waiter(lock)) {
18889 +                       /* XXX lock_is_stealable() ? */
18890                         return 0;
18891 +               }
18892  
18893                 /*
18894                  * We can acquire the lock. Remove the waiter from the
18895 @@ -822,14 +867,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18896                  * not need to be dequeued.
18897                  */
18898                 if (rt_mutex_has_waiters(lock)) {
18899 -                       /*
18900 -                        * If @task->prio is greater than or equal to
18901 -                        * the top waiter priority (kernel view),
18902 -                        * @task lost.
18903 -                        */
18904 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
18905 -                               return 0;
18906 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
18907  
18908 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
18909 +                               return 0;
18910                         /*
18911                          * The current top waiter stays enqueued. We
18912                          * don't have to change anything in the lock
18913 @@ -852,7 +893,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18914          * case, but conditionals are more expensive than a redundant
18915          * store.
18916          */
18917 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18918 +       raw_spin_lock(&task->pi_lock);
18919         task->pi_blocked_on = NULL;
18920         /*
18921          * Finish the lock acquisition. @task is the new owner. If
18922 @@ -861,7 +902,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18923          */
18924         if (rt_mutex_has_waiters(lock))
18925                 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
18926 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18927 +       raw_spin_unlock(&task->pi_lock);
18928  
18929  takeit:
18930         /* We got the lock. */
18931 @@ -878,12 +919,405 @@ takeit:
18932         return 1;
18933  }
18934  
18935 +#ifdef CONFIG_PREEMPT_RT_FULL
18936 +/*
18937 + * preemptible spin_lock functions:
18938 + */
18939 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18940 +                                        void  (*slowfn)(struct rt_mutex *lock,
18941 +                                                        bool mg_off),
18942 +                                        bool do_mig_dis)
18943 +{
18944 +       might_sleep_no_state_check();
18945 +
18946 +       if (do_mig_dis)
18947 +               migrate_disable();
18948 +
18949 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18950 +               rt_mutex_deadlock_account_lock(lock, current);
18951 +       else
18952 +               slowfn(lock, do_mig_dis);
18953 +}
18954 +
18955 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
18956 +                                          void  (*slowfn)(struct rt_mutex *lock))
18957 +{
18958 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18959 +               rt_mutex_deadlock_account_unlock(current);
18960 +       else
18961 +               slowfn(lock);
18962 +}
18963 +#ifdef CONFIG_SMP
18964 +/*
18965 + * Note that owner is a speculative pointer and dereferencing relies
18966 + * on rcu_read_lock() and the check against the lock owner.
18967 + */
18968 +static int adaptive_wait(struct rt_mutex *lock,
18969 +                        struct task_struct *owner)
18970 +{
18971 +       int res = 0;
18972 +
18973 +       rcu_read_lock();
18974 +       for (;;) {
18975 +               if (owner != rt_mutex_owner(lock))
18976 +                       break;
18977 +               /*
18978 +                * Ensure that owner->on_cpu is dereferenced _after_
18979 +                * checking the above to be valid.
18980 +                */
18981 +               barrier();
18982 +               if (!owner->on_cpu) {
18983 +                       res = 1;
18984 +                       break;
18985 +               }
18986 +               cpu_relax();
18987 +       }
18988 +       rcu_read_unlock();
18989 +       return res;
18990 +}
18991 +#else
18992 +static int adaptive_wait(struct rt_mutex *lock,
18993 +                        struct task_struct *orig_owner)
18994 +{
18995 +       return 1;
18996 +}
18997 +#endif
18998 +
18999 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19000 +                                  struct rt_mutex_waiter *waiter,
19001 +                                  struct task_struct *task,
19002 +                                  enum rtmutex_chainwalk chwalk);
19003 +/*
19004 + * Slow path lock function spin_lock style: this variant is very
19005 + * careful not to miss any non-lock wakeups.
19006 + *
19007 + * We store the current state under p->pi_lock in p->saved_state and
19008 + * the try_to_wake_up() code handles this accordingly.
19009 + */
19010 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
19011 +                                                   bool mg_off)
19012 +{
19013 +       struct task_struct *lock_owner, *self = current;
19014 +       struct rt_mutex_waiter waiter, *top_waiter;
19015 +       unsigned long flags;
19016 +       int ret;
19017 +
19018 +       rt_mutex_init_waiter(&waiter, true);
19019 +
19020 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19021 +
19022 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
19023 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19024 +               return;
19025 +       }
19026 +
19027 +       BUG_ON(rt_mutex_owner(lock) == self);
19028 +
19029 +       /*
19030 +        * We save whatever state the task is in and we'll restore it
19031 +        * after acquiring the lock taking real wakeups into account
19032 +        * as well. We are serialized via pi_lock against wakeups. See
19033 +        * try_to_wake_up().
19034 +        */
19035 +       raw_spin_lock(&self->pi_lock);
19036 +       self->saved_state = self->state;
19037 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19038 +       raw_spin_unlock(&self->pi_lock);
19039 +
19040 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
19041 +       BUG_ON(ret);
19042 +
19043 +       for (;;) {
19044 +               /* Try to acquire the lock again. */
19045 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
19046 +                       break;
19047 +
19048 +               top_waiter = rt_mutex_top_waiter(lock);
19049 +               lock_owner = rt_mutex_owner(lock);
19050 +
19051 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19052 +
19053 +               debug_rt_mutex_print_deadlock(&waiter);
19054 +
19055 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
19056 +                       if (mg_off)
19057 +                               migrate_enable();
19058 +                       schedule();
19059 +                       if (mg_off)
19060 +                               migrate_disable();
19061 +               }
19062 +
19063 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19064 +
19065 +               raw_spin_lock(&self->pi_lock);
19066 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19067 +               raw_spin_unlock(&self->pi_lock);
19068 +       }
19069 +
19070 +       /*
19071 +        * Restore the task state to current->saved_state. We set it
19072 +        * to the original state above and the try_to_wake_up() code
19073 +        * has possibly updated it when a real (non-rtmutex) wakeup
19074 +        * happened while we were blocked. Clear saved_state so
19075 +        * try_to_wakeup() does not get confused.
19076 +        */
19077 +       raw_spin_lock(&self->pi_lock);
19078 +       __set_current_state_no_track(self->saved_state);
19079 +       self->saved_state = TASK_RUNNING;
19080 +       raw_spin_unlock(&self->pi_lock);
19081 +
19082 +       /*
19083 +        * try_to_take_rt_mutex() sets the waiter bit
19084 +        * unconditionally. We might have to fix that up:
19085 +        */
19086 +       fixup_rt_mutex_waiters(lock);
19087 +
19088 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
19089 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
19090 +
19091 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19092 +
19093 +       debug_rt_mutex_free_waiter(&waiter);
19094 +}
19095 +
19096 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19097 +                                   struct wake_q_head *wake_sleeper_q,
19098 +                                   struct rt_mutex *lock);
19099 +/*
19100 + * Slow path to release a rt_mutex spin_lock style
19101 + */
19102 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
19103 +{
19104 +       unsigned long flags;
19105 +       WAKE_Q(wake_q);
19106 +       WAKE_Q(wake_sleeper_q);
19107 +
19108 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19109 +
19110 +       debug_rt_mutex_unlock(lock);
19111 +
19112 +       rt_mutex_deadlock_account_unlock(current);
19113 +
19114 +       if (!rt_mutex_has_waiters(lock)) {
19115 +               lock->owner = NULL;
19116 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19117 +               return;
19118 +       }
19119 +
19120 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19121 +
19122 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19123 +       wake_up_q(&wake_q);
19124 +       wake_up_q_sleeper(&wake_sleeper_q);
19125 +
19126 +       /* Undo pi boosting when necessary */
19127 +       rt_mutex_adjust_prio(current);
19128 +}
19129 +
19130 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
19131 +{
19132 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
19133 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19134 +}
19135 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
19136 +
19137 +void __lockfunc rt_spin_lock(spinlock_t *lock)
19138 +{
19139 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19140 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19141 +}
19142 +EXPORT_SYMBOL(rt_spin_lock);
19143 +
19144 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
19145 +{
19146 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
19147 +}
19148 +EXPORT_SYMBOL(__rt_spin_lock);
19149 +
19150 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
19151 +{
19152 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
19153 +}
19154 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
19155 +
19156 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19157 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
19158 +{
19159 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19160 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19161 +}
19162 +EXPORT_SYMBOL(rt_spin_lock_nested);
19163 +#endif
19164 +
19165 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
19166 +{
19167 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19168 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19169 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19170 +}
19171 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
19172 +
19173 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
19174 +{
19175 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19176 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19177 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19178 +       migrate_enable();
19179 +}
19180 +EXPORT_SYMBOL(rt_spin_unlock);
19181 +
19182 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
19183 +{
19184 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
19185 +}
19186 +EXPORT_SYMBOL(__rt_spin_unlock);
19187 +
19188 +/*
19189 + * Wait for the lock to get unlocked: instead of polling for an unlock
19190 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
19191 + * schedule if there's contention:
19192 + */
19193 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
19194 +{
19195 +       spin_lock(lock);
19196 +       spin_unlock(lock);
19197 +}
19198 +EXPORT_SYMBOL(rt_spin_unlock_wait);
19199 +
19200 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
19201 +{
19202 +       return rt_mutex_trylock(lock);
19203 +}
19204 +
19205 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
19206 +{
19207 +       int ret;
19208 +
19209 +       ret = rt_mutex_trylock(&lock->lock);
19210 +       if (ret)
19211 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19212 +       return ret;
19213 +}
19214 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
19215 +
19216 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
19217 +{
19218 +       int ret;
19219 +
19220 +       migrate_disable();
19221 +       ret = rt_mutex_trylock(&lock->lock);
19222 +       if (ret)
19223 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19224 +       else
19225 +               migrate_enable();
19226 +       return ret;
19227 +}
19228 +EXPORT_SYMBOL(rt_spin_trylock);
19229 +
19230 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
19231 +{
19232 +       int ret;
19233 +
19234 +       local_bh_disable();
19235 +       ret = rt_mutex_trylock(&lock->lock);
19236 +       if (ret) {
19237 +               migrate_disable();
19238 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19239 +       } else
19240 +               local_bh_enable();
19241 +       return ret;
19242 +}
19243 +EXPORT_SYMBOL(rt_spin_trylock_bh);
19244 +
19245 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
19246 +{
19247 +       int ret;
19248 +
19249 +       *flags = 0;
19250 +       ret = rt_mutex_trylock(&lock->lock);
19251 +       if (ret) {
19252 +               migrate_disable();
19253 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19254 +       }
19255 +       return ret;
19256 +}
19257 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
19258 +
19259 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
19260 +{
19261 +       /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
19262 +       if (atomic_add_unless(atomic, -1, 1))
19263 +               return 0;
19264 +       rt_spin_lock(lock);
19265 +       if (atomic_dec_and_test(atomic))
19266 +               return 1;
19267 +       rt_spin_unlock(lock);
19268 +       return 0;
19269 +}
19270 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
19271 +
19272 +       void
19273 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
19274 +{
19275 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19276 +       /*
19277 +        * Make sure we are not reinitializing a held lock:
19278 +        */
19279 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19280 +       lockdep_init_map(&lock->dep_map, name, key, 0);
19281 +#endif
19282 +}
19283 +EXPORT_SYMBOL(__rt_spin_lock_init);
19284 +
19285 +#endif /* PREEMPT_RT_FULL */
19286 +
19287 +#ifdef CONFIG_PREEMPT_RT_FULL
19288 +       static inline int __sched
19289 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19290 +{
19291 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19292 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
19293 +
19294 +       if (!hold_ctx)
19295 +               return 0;
19296 +
19297 +       if (unlikely(ctx == hold_ctx))
19298 +               return -EALREADY;
19299 +
19300 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
19301 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
19302 +#ifdef CONFIG_DEBUG_MUTEXES
19303 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
19304 +               ctx->contending_lock = ww;
19305 +#endif
19306 +               return -EDEADLK;
19307 +       }
19308 +
19309 +       return 0;
19310 +}
19311 +#else
19312 +       static inline int __sched
19313 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19314 +{
19315 +       BUG();
19316 +       return 0;
19317 +}
19318 +
19319 +#endif
19320 +
19321 +static inline int
19322 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19323 +                    struct rt_mutex_waiter *waiter)
19324 +{
19325 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
19326 +}
19327 +
19328  /*
19329   * Task blocks on lock.
19330   *
19331   * Prepare waiter and propagate pi chain
19332   *
19333 - * This must be called with lock->wait_lock held.
19334 + * This must be called with lock->wait_lock held and interrupts disabled
19335   */
19336  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19337                                    struct rt_mutex_waiter *waiter,
19338 @@ -894,7 +1328,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19339         struct rt_mutex_waiter *top_waiter = waiter;
19340         struct rt_mutex *next_lock;
19341         int chain_walk = 0, res;
19342 -       unsigned long flags;
19343  
19344         /*
19345          * Early deadlock detection. We really don't want the task to
19346 @@ -908,7 +1341,24 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19347         if (owner == task)
19348                 return -EDEADLK;
19349  
19350 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19351 +       raw_spin_lock(&task->pi_lock);
19352 +
19353 +       /*
19354 +        * In the case of futex requeue PI, this will be a proxy
19355 +        * lock. The task will wake unaware that it is enqueued on
19356 +        * this lock. Avoid blocking on two locks and corrupting
19357 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
19358 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
19359 +        * before requeue (due to a signal or timeout). Do not enqueue
19360 +        * the task if PI_WAKEUP_INPROGRESS is set.
19361 +        */
19362 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
19363 +               raw_spin_unlock(&task->pi_lock);
19364 +               return -EAGAIN;
19365 +       }
19366 +
19367 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
19368 +
19369         __rt_mutex_adjust_prio(task);
19370         waiter->task = task;
19371         waiter->lock = lock;
19372 @@ -921,18 +1371,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19373  
19374         task->pi_blocked_on = waiter;
19375  
19376 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19377 +       raw_spin_unlock(&task->pi_lock);
19378  
19379         if (!owner)
19380                 return 0;
19381  
19382 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19383 +       raw_spin_lock(&owner->pi_lock);
19384         if (waiter == rt_mutex_top_waiter(lock)) {
19385                 rt_mutex_dequeue_pi(owner, top_waiter);
19386                 rt_mutex_enqueue_pi(owner, waiter);
19387  
19388                 __rt_mutex_adjust_prio(owner);
19389 -               if (owner->pi_blocked_on)
19390 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
19391                         chain_walk = 1;
19392         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
19393                 chain_walk = 1;
19394 @@ -941,7 +1391,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19395         /* Store the lock on which owner is blocked or NULL */
19396         next_lock = task_blocked_on_lock(owner);
19397  
19398 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19399 +       raw_spin_unlock(&owner->pi_lock);
19400         /*
19401          * Even if full deadlock detection is on, if the owner is not
19402          * blocked itself, we can avoid finding this out in the chain
19403 @@ -957,12 +1407,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19404          */
19405         get_task_struct(owner);
19406  
19407 -       raw_spin_unlock(&lock->wait_lock);
19408 +       raw_spin_unlock_irq(&lock->wait_lock);
19409  
19410         res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
19411                                          next_lock, waiter, task);
19412  
19413 -       raw_spin_lock(&lock->wait_lock);
19414 +       raw_spin_lock_irq(&lock->wait_lock);
19415  
19416         return res;
19417  }
19418 @@ -971,15 +1421,15 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19419   * Remove the top waiter from the current tasks pi waiter tree and
19420   * queue it up.
19421   *
19422 - * Called with lock->wait_lock held.
19423 + * Called with lock->wait_lock held and interrupts disabled.
19424   */
19425  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19426 +                                   struct wake_q_head *wake_sleeper_q,
19427                                     struct rt_mutex *lock)
19428  {
19429         struct rt_mutex_waiter *waiter;
19430 -       unsigned long flags;
19431  
19432 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19433 +       raw_spin_lock(&current->pi_lock);
19434  
19435         waiter = rt_mutex_top_waiter(lock);
19436  
19437 @@ -1001,15 +1451,18 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19438          */
19439         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
19440  
19441 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19442 +       raw_spin_unlock(&current->pi_lock);
19443  
19444 -       wake_q_add(wake_q, waiter->task);
19445 +       if (waiter->savestate)
19446 +               wake_q_add(wake_sleeper_q, waiter->task);
19447 +       else
19448 +               wake_q_add(wake_q, waiter->task);
19449  }
19450  
19451  /*
19452   * Remove a waiter from a lock and give up
19453   *
19454 - * Must be called with lock->wait_lock held and
19455 + * Must be called with lock->wait_lock held and interrupts disabled. It must
19456   * have just failed to try_to_take_rt_mutex().
19457   */
19458  static void remove_waiter(struct rt_mutex *lock,
19459 @@ -1017,13 +1470,12 @@ static void remove_waiter(struct rt_mutex *lock,
19460  {
19461         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
19462         struct task_struct *owner = rt_mutex_owner(lock);
19463 -       struct rt_mutex *next_lock;
19464 -       unsigned long flags;
19465 +       struct rt_mutex *next_lock = NULL;
19466  
19467 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19468 +       raw_spin_lock(&current->pi_lock);
19469         rt_mutex_dequeue(lock, waiter);
19470         current->pi_blocked_on = NULL;
19471 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19472 +       raw_spin_unlock(&current->pi_lock);
19473  
19474         /*
19475          * Only update priority if the waiter was the highest priority
19476 @@ -1032,7 +1484,7 @@ static void remove_waiter(struct rt_mutex *lock,
19477         if (!owner || !is_top_waiter)
19478                 return;
19479  
19480 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19481 +       raw_spin_lock(&owner->pi_lock);
19482  
19483         rt_mutex_dequeue_pi(owner, waiter);
19484  
19485 @@ -1042,9 +1494,10 @@ static void remove_waiter(struct rt_mutex *lock,
19486         __rt_mutex_adjust_prio(owner);
19487  
19488         /* Store the lock on which owner is blocked or NULL */
19489 -       next_lock = task_blocked_on_lock(owner);
19490 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
19491 +               next_lock = task_blocked_on_lock(owner);
19492  
19493 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19494 +       raw_spin_unlock(&owner->pi_lock);
19495  
19496         /*
19497          * Don't walk the chain, if the owner task is not blocked
19498 @@ -1056,12 +1509,12 @@ static void remove_waiter(struct rt_mutex *lock,
19499         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19500         get_task_struct(owner);
19501  
19502 -       raw_spin_unlock(&lock->wait_lock);
19503 +       raw_spin_unlock_irq(&lock->wait_lock);
19504  
19505         rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
19506                                    next_lock, NULL, current);
19507  
19508 -       raw_spin_lock(&lock->wait_lock);
19509 +       raw_spin_lock_irq(&lock->wait_lock);
19510  }
19511  
19512  /*
19513 @@ -1078,17 +1531,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19514         raw_spin_lock_irqsave(&task->pi_lock, flags);
19515  
19516         waiter = task->pi_blocked_on;
19517 -       if (!waiter || (waiter->prio == task->prio &&
19518 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
19519                         !dl_prio(task->prio))) {
19520                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19521                 return;
19522         }
19523         next_lock = waiter->lock;
19524 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19525  
19526         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19527         get_task_struct(task);
19528  
19529 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19530         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
19531                                    next_lock, NULL, task);
19532  }
19533 @@ -1097,16 +1550,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19534   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
19535   * @lock:               the rt_mutex to take
19536   * @state:              the state the task should block in (TASK_INTERRUPTIBLE
19537 - *                      or TASK_UNINTERRUPTIBLE)
19538 + *                      or TASK_UNINTERRUPTIBLE)
19539   * @timeout:            the pre-initialized and started timer, or NULL for none
19540   * @waiter:             the pre-initialized rt_mutex_waiter
19541   *
19542 - * lock->wait_lock must be held by the caller.
19543 + * Must be called with lock->wait_lock held and interrupts disabled
19544   */
19545  static int __sched
19546  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19547                     struct hrtimer_sleeper *timeout,
19548 -                   struct rt_mutex_waiter *waiter)
19549 +                   struct rt_mutex_waiter *waiter,
19550 +                   struct ww_acquire_ctx *ww_ctx)
19551  {
19552         int ret = 0;
19553  
19554 @@ -1129,13 +1583,19 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19555                                 break;
19556                 }
19557  
19558 -               raw_spin_unlock(&lock->wait_lock);
19559 +               if (ww_ctx && ww_ctx->acquired > 0) {
19560 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
19561 +                       if (ret)
19562 +                               break;
19563 +               }
19564 +
19565 +               raw_spin_unlock_irq(&lock->wait_lock);
19566  
19567                 debug_rt_mutex_print_deadlock(waiter);
19568  
19569                 schedule();
19570  
19571 -               raw_spin_lock(&lock->wait_lock);
19572 +               raw_spin_lock_irq(&lock->wait_lock);
19573                 set_current_state(state);
19574         }
19575  
19576 @@ -1163,26 +1623,112 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
19577         }
19578  }
19579  
19580 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
19581 +                                                  struct ww_acquire_ctx *ww_ctx)
19582 +{
19583 +#ifdef CONFIG_DEBUG_MUTEXES
19584 +       /*
19585 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
19586 +        * but released with a normal mutex_unlock in this call.
19587 +        *
19588 +        * This should never happen, always use ww_mutex_unlock.
19589 +        */
19590 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
19591 +
19592 +       /*
19593 +        * Not quite done after calling ww_acquire_done() ?
19594 +        */
19595 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
19596 +
19597 +       if (ww_ctx->contending_lock) {
19598 +               /*
19599 +                * After -EDEADLK you tried to
19600 +                * acquire a different ww_mutex? Bad!
19601 +                */
19602 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
19603 +
19604 +               /*
19605 +                * You called ww_mutex_lock after receiving -EDEADLK,
19606 +                * but 'forgot' to unlock everything else first?
19607 +                */
19608 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
19609 +               ww_ctx->contending_lock = NULL;
19610 +       }
19611 +
19612 +       /*
19613 +        * Naughty, using a different class will lead to undefined behavior!
19614 +        */
19615 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
19616 +#endif
19617 +       ww_ctx->acquired++;
19618 +}
19619 +
19620 +#ifdef CONFIG_PREEMPT_RT_FULL
19621 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19622 +                                 struct ww_acquire_ctx *ww_ctx)
19623 +{
19624 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19625 +       struct rt_mutex_waiter *waiter, *n;
19626 +
19627 +       /*
19628 +        * This branch gets optimized out for the common case,
19629 +        * and is only important for ww_mutex_lock.
19630 +        */
19631 +       ww_mutex_lock_acquired(ww, ww_ctx);
19632 +       ww->ctx = ww_ctx;
19633 +
19634 +       /*
19635 +        * Give any possible sleeping processes the chance to wake up,
19636 +        * so they can recheck if they have to back off.
19637 +        */
19638 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
19639 +                                            tree_entry) {
19640 +               /* XXX debug rt mutex waiter wakeup */
19641 +
19642 +               BUG_ON(waiter->lock != lock);
19643 +               rt_mutex_wake_waiter(waiter);
19644 +       }
19645 +}
19646 +
19647 +#else
19648 +
19649 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19650 +                                 struct ww_acquire_ctx *ww_ctx)
19651 +{
19652 +       BUG();
19653 +}
19654 +#endif
19655 +
19656  /*
19657   * Slow path lock function:
19658   */
19659  static int __sched
19660  rt_mutex_slowlock(struct rt_mutex *lock, int state,
19661                   struct hrtimer_sleeper *timeout,
19662 -                 enum rtmutex_chainwalk chwalk)
19663 +                 enum rtmutex_chainwalk chwalk,
19664 +                 struct ww_acquire_ctx *ww_ctx)
19665  {
19666         struct rt_mutex_waiter waiter;
19667 +       unsigned long flags;
19668         int ret = 0;
19669  
19670 -       debug_rt_mutex_init_waiter(&waiter);
19671 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
19672 -       RB_CLEAR_NODE(&waiter.tree_entry);
19673 +       rt_mutex_init_waiter(&waiter, false);
19674  
19675 -       raw_spin_lock(&lock->wait_lock);
19676 +       /*
19677 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
19678 +        * be called in early boot if the cmpxchg() fast path is disabled
19679 +        * (debug, no architecture support). In this case we will acquire the
19680 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
19681 +        * enable interrupts in that early boot case. So we need to use the
19682 +        * irqsave/restore variants.
19683 +        */
19684 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19685  
19686         /* Try to acquire the lock again: */
19687         if (try_to_take_rt_mutex(lock, current, NULL)) {
19688 -               raw_spin_unlock(&lock->wait_lock);
19689 +               if (ww_ctx)
19690 +                       ww_mutex_account_lock(lock, ww_ctx);
19691 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19692                 return 0;
19693         }
19694  
19695 @@ -1196,13 +1742,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19696  
19697         if (likely(!ret))
19698                 /* sleep on the mutex */
19699 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
19700 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
19701 +                                         ww_ctx);
19702 +       else if (ww_ctx) {
19703 +               /* ww_mutex received EDEADLK, let it become EALREADY */
19704 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
19705 +               BUG_ON(!ret);
19706 +       }
19707  
19708         if (unlikely(ret)) {
19709                 __set_current_state(TASK_RUNNING);
19710                 if (rt_mutex_has_waiters(lock))
19711                         remove_waiter(lock, &waiter);
19712 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19713 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
19714 +               if (!ww_ctx)
19715 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19716 +       } else if (ww_ctx) {
19717 +               ww_mutex_account_lock(lock, ww_ctx);
19718         }
19719  
19720         /*
19721 @@ -1211,7 +1767,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19722          */
19723         fixup_rt_mutex_waiters(lock);
19724  
19725 -       raw_spin_unlock(&lock->wait_lock);
19726 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19727  
19728         /* Remove pending timer: */
19729         if (unlikely(timeout))
19730 @@ -1227,6 +1783,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19731   */
19732  static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19733  {
19734 +       unsigned long flags;
19735         int ret;
19736  
19737         /*
19738 @@ -1238,10 +1795,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19739                 return 0;
19740  
19741         /*
19742 -        * The mutex has currently no owner. Lock the wait lock and
19743 -        * try to acquire the lock.
19744 +        * The mutex has currently no owner. Lock the wait lock and try to
19745 +        * acquire the lock. We use irqsave here to support early boot calls.
19746          */
19747 -       raw_spin_lock(&lock->wait_lock);
19748 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19749  
19750         ret = try_to_take_rt_mutex(lock, current, NULL);
19751  
19752 @@ -1251,7 +1808,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19753          */
19754         fixup_rt_mutex_waiters(lock);
19755  
19756 -       raw_spin_unlock(&lock->wait_lock);
19757 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19758  
19759         return ret;
19760  }
19761 @@ -1261,9 +1818,13 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19762   * Return whether the current task needs to undo a potential priority boosting.
19763   */
19764  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19765 -                                       struct wake_q_head *wake_q)
19766 +                                       struct wake_q_head *wake_q,
19767 +                                       struct wake_q_head *wake_sleeper_q)
19768  {
19769 -       raw_spin_lock(&lock->wait_lock);
19770 +       unsigned long flags;
19771 +
19772 +       /* irqsave required to support early boot calls */
19773 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19774  
19775         debug_rt_mutex_unlock(lock);
19776  
19777 @@ -1302,10 +1863,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19778          */
19779         while (!rt_mutex_has_waiters(lock)) {
19780                 /* Drops lock->wait_lock ! */
19781 -               if (unlock_rt_mutex_safe(lock) == true)
19782 +               if (unlock_rt_mutex_safe(lock, flags) == true)
19783                         return false;
19784                 /* Relock the rtmutex and try again */
19785 -               raw_spin_lock(&lock->wait_lock);
19786 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19787         }
19788  
19789         /*
19790 @@ -1314,9 +1875,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19791          *
19792          * Queue the next waiter for wakeup once we release the wait_lock.
19793          */
19794 -       mark_wakeup_next_waiter(wake_q, lock);
19795 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19796  
19797 -       raw_spin_unlock(&lock->wait_lock);
19798 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19799  
19800         /* check PI boosting */
19801         return true;
19802 @@ -1330,31 +1891,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19803   */
19804  static inline int
19805  rt_mutex_fastlock(struct rt_mutex *lock, int state,
19806 +                 struct ww_acquire_ctx *ww_ctx,
19807                   int (*slowfn)(struct rt_mutex *lock, int state,
19808                                 struct hrtimer_sleeper *timeout,
19809 -                               enum rtmutex_chainwalk chwalk))
19810 +                               enum rtmutex_chainwalk chwalk,
19811 +                               struct ww_acquire_ctx *ww_ctx))
19812  {
19813         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19814                 rt_mutex_deadlock_account_lock(lock, current);
19815                 return 0;
19816         } else
19817 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19818 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
19819 +                             ww_ctx);
19820  }
19821  
19822  static inline int
19823  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19824                         struct hrtimer_sleeper *timeout,
19825                         enum rtmutex_chainwalk chwalk,
19826 +                       struct ww_acquire_ctx *ww_ctx,
19827                         int (*slowfn)(struct rt_mutex *lock, int state,
19828                                       struct hrtimer_sleeper *timeout,
19829 -                                     enum rtmutex_chainwalk chwalk))
19830 +                                     enum rtmutex_chainwalk chwalk,
19831 +                                     struct ww_acquire_ctx *ww_ctx))
19832  {
19833         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19834             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19835                 rt_mutex_deadlock_account_lock(lock, current);
19836                 return 0;
19837         } else
19838 -               return slowfn(lock, state, timeout, chwalk);
19839 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
19840  }
19841  
19842  static inline int
19843 @@ -1371,17 +1937,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19844  static inline void
19845  rt_mutex_fastunlock(struct rt_mutex *lock,
19846                     bool (*slowfn)(struct rt_mutex *lock,
19847 -                                  struct wake_q_head *wqh))
19848 +                                  struct wake_q_head *wqh,
19849 +                                  struct wake_q_head *wq_sleeper))
19850  {
19851         WAKE_Q(wake_q);
19852 +       WAKE_Q(wake_sleeper_q);
19853  
19854         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19855                 rt_mutex_deadlock_account_unlock(current);
19856  
19857         } else {
19858 -               bool deboost = slowfn(lock, &wake_q);
19859 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
19860  
19861                 wake_up_q(&wake_q);
19862 +               wake_up_q_sleeper(&wake_sleeper_q);
19863  
19864                 /* Undo pi boosting if necessary: */
19865                 if (deboost)
19866 @@ -1398,7 +1967,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
19867  {
19868         might_sleep();
19869  
19870 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19871 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
19872  }
19873  EXPORT_SYMBOL_GPL(rt_mutex_lock);
19874  
19875 @@ -1415,7 +1984,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19876  {
19877         might_sleep();
19878  
19879 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19880 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
19881  }
19882  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19883  
19884 @@ -1428,11 +1997,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
19885         might_sleep();
19886  
19887         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19888 -                                      RT_MUTEX_FULL_CHAINWALK,
19889 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
19890                                        rt_mutex_slowlock);
19891  }
19892  
19893  /**
19894 + * rt_mutex_lock_killable - lock a rt_mutex killable
19895 + *
19896 + * @lock:              the rt_mutex to be locked
19897 + *
19898 + *
19899 + * Returns:
19900 + *  0          on success
19901 + * -EINTR      when interrupted by a signal
19902 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
19903 + */
19904 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19905 +{
19906 +       might_sleep();
19907 +
19908 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
19909 +}
19910 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
19911 +
19912 +/**
19913   * rt_mutex_timed_lock - lock a rt_mutex interruptible
19914   *                     the timeout structure is provided
19915   *                     by the caller
19916 @@ -1452,6 +2040,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19917  
19918         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19919                                        RT_MUTEX_MIN_CHAINWALK,
19920 +                                      NULL,
19921                                        rt_mutex_slowlock);
19922  }
19923  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19924 @@ -1469,7 +2058,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19925   */
19926  int __sched rt_mutex_trylock(struct rt_mutex *lock)
19927  {
19928 +#ifdef CONFIG_PREEMPT_RT_FULL
19929 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
19930 +#else
19931         if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
19932 +#endif
19933                 return 0;
19934  
19935         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19936 @@ -1495,13 +2088,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
19937   * required or not.
19938   */
19939  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
19940 -                                  struct wake_q_head *wqh)
19941 +                                  struct wake_q_head *wqh,
19942 +                                  struct wake_q_head *wq_sleeper)
19943  {
19944         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19945                 rt_mutex_deadlock_account_unlock(current);
19946                 return false;
19947         }
19948 -       return rt_mutex_slowunlock(lock, wqh);
19949 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
19950  }
19951  
19952  /**
19953 @@ -1534,13 +2128,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
19954  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
19955  {
19956         lock->owner = NULL;
19957 -       raw_spin_lock_init(&lock->wait_lock);
19958         lock->waiters = RB_ROOT;
19959         lock->waiters_leftmost = NULL;
19960  
19961         debug_rt_mutex_init(lock, name);
19962  }
19963 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
19964 +EXPORT_SYMBOL(__rt_mutex_init);
19965  
19966  /**
19967   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
19968 @@ -1555,7 +2148,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
19969  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19970                                 struct task_struct *proxy_owner)
19971  {
19972 -       __rt_mutex_init(lock, NULL);
19973 +       rt_mutex_init(lock);
19974         debug_rt_mutex_proxy_lock(lock, proxy_owner);
19975         rt_mutex_set_owner(lock, proxy_owner);
19976         rt_mutex_deadlock_account_lock(lock, proxy_owner);
19977 @@ -1596,13 +2189,42 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19978  {
19979         int ret;
19980  
19981 -       raw_spin_lock(&lock->wait_lock);
19982 +       raw_spin_lock_irq(&lock->wait_lock);
19983  
19984         if (try_to_take_rt_mutex(lock, task, NULL)) {
19985 -               raw_spin_unlock(&lock->wait_lock);
19986 +               raw_spin_unlock_irq(&lock->wait_lock);
19987                 return 1;
19988         }
19989  
19990 +#ifdef CONFIG_PREEMPT_RT_FULL
19991 +       /*
19992 +        * In PREEMPT_RT there's an added race.
19993 +        * If the task, that we are about to requeue, times out,
19994 +        * it can set the PI_WAKEUP_INPROGRESS flag. This tells the
19995 +        * requeue code to skip this task. But right after the task sets
19996 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
19997 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
19998 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
19999 +        * lock that it blocks on. We *must not* place this task
20000 +        * on this proxy lock in that case.
20001 +        *
20002 +        * To prevent this race, we first take the task's pi_lock
20003 +        * and check if it has updated its pi_blocked_on. If it has,
20004 +        * we assume that it woke up and we return -EAGAIN.
20005 +        * Otherwise, we set the task's pi_blocked_on to
20006 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
20007 +        * it will know that we are in the process of requeuing it.
20008 +        */
20009 +       raw_spin_lock(&task->pi_lock);
20010 +       if (task->pi_blocked_on) {
20011 +               raw_spin_unlock(&task->pi_lock);
20012 +               raw_spin_unlock_irq(&lock->wait_lock);
20013 +               return -EAGAIN;
20014 +       }
20015 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
20016 +       raw_spin_unlock(&task->pi_lock);
20017 +#endif
20018 +
20019         /* We enforce deadlock detection for futexes */
20020         ret = task_blocks_on_rt_mutex(lock, waiter, task,
20021                                       RT_MUTEX_FULL_CHAINWALK);
20022 @@ -1617,10 +2239,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20023                 ret = 0;
20024         }
20025  
20026 -       if (unlikely(ret))
20027 +       if (ret && rt_mutex_has_waiters(lock))
20028                 remove_waiter(lock, waiter);
20029  
20030 -       raw_spin_unlock(&lock->wait_lock);
20031 +       raw_spin_unlock_irq(&lock->wait_lock);
20032  
20033         debug_rt_mutex_print_deadlock(waiter);
20034  
20035 @@ -1668,12 +2290,12 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20036  {
20037         int ret;
20038  
20039 -       raw_spin_lock(&lock->wait_lock);
20040 +       raw_spin_lock_irq(&lock->wait_lock);
20041  
20042         set_current_state(TASK_INTERRUPTIBLE);
20043  
20044         /* sleep on the mutex */
20045 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
20046 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
20047  
20048         if (unlikely(ret))
20049                 remove_waiter(lock, waiter);
20050 @@ -1684,7 +2306,93 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20051          */
20052         fixup_rt_mutex_waiters(lock);
20053  
20054 -       raw_spin_unlock(&lock->wait_lock);
20055 +       raw_spin_unlock_irq(&lock->wait_lock);
20056  
20057         return ret;
20058  }
20059 +
20060 +static inline int
20061 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
20062 +{
20063 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
20064 +       unsigned tmp;
20065 +
20066 +       if (ctx->deadlock_inject_countdown-- == 0) {
20067 +               tmp = ctx->deadlock_inject_interval;
20068 +               if (tmp > UINT_MAX/4)
20069 +                       tmp = UINT_MAX;
20070 +               else
20071 +                       tmp = tmp*2 + tmp + tmp/2;
20072 +
20073 +               ctx->deadlock_inject_interval = tmp;
20074 +               ctx->deadlock_inject_countdown = tmp;
20075 +               ctx->contending_lock = lock;
20076 +
20077 +               ww_mutex_unlock(lock);
20078 +
20079 +               return -EDEADLK;
20080 +       }
20081 +#endif
20082 +
20083 +       return 0;
20084 +}
20085 +
20086 +#ifdef CONFIG_PREEMPT_RT_FULL
20087 +int __sched
20088 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20089 +{
20090 +       int ret;
20091 +
20092 +       might_sleep();
20093 +
20094 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20095 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
20096 +       if (ret)
20097 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20098 +       else if (!ret && ww_ctx->acquired > 1)
20099 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20100 +
20101 +       return ret;
20102 +}
20103 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
20104 +
20105 +int __sched
20106 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20107 +{
20108 +       int ret;
20109 +
20110 +       might_sleep();
20111 +
20112 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20113 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
20114 +       if (ret)
20115 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20116 +       else if (!ret && ww_ctx->acquired > 1)
20117 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20118 +
20119 +       return ret;
20120 +}
20121 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
20122 +
20123 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
20124 +{
20125 +       int nest = !!lock->ctx;
20126 +
20127 +       /*
20128 +        * The unlocking fastpath is the 0->1 transition from 'locked'
20129 +        * into 'unlocked' state:
20130 +        */
20131 +       if (nest) {
20132 +#ifdef CONFIG_DEBUG_MUTEXES
20133 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
20134 +#endif
20135 +               if (lock->ctx->acquired > 0)
20136 +                       lock->ctx->acquired--;
20137 +               lock->ctx = NULL;
20138 +       }
20139 +
20140 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
20141 +       rt_mutex_unlock(&lock->base.lock);
20142 +}
20143 +EXPORT_SYMBOL(ww_mutex_unlock);
20144 +#endif
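
The deadlock-injection helper above grows the injection interval by roughly 3.5x each time it fires (tmp*2 + tmp + tmp/2), saturating near UINT_MAX. A rough user-space model of that backoff arithmetic is sketched below; the struct, its initial values and the maybe_inject() name are assumptions made for illustration only, not the kernel's ww_acquire_ctx API.

    /* Hedged user-space model of the CONFIG_DEBUG_WW_MUTEX_SLOWPATH backoff. */
    #include <limits.h>
    #include <stdio.h>

    struct inject_ctx {
        unsigned int deadlock_inject_interval;
        unsigned int deadlock_inject_countdown;
    };

    /* Returns 1 when a fake -EDEADLK would be injected, 0 otherwise. */
    static int maybe_inject(struct inject_ctx *ctx)
    {
        unsigned int tmp;

        if (ctx->deadlock_inject_countdown-- != 0)
            return 0;

        /* Same saturating ~3.5x growth as the patch: tmp*2 + tmp + tmp/2 */
        tmp = ctx->deadlock_inject_interval;
        if (tmp > UINT_MAX / 4)
            tmp = UINT_MAX;
        else
            tmp = tmp * 2 + tmp + tmp / 2;

        ctx->deadlock_inject_interval = tmp;
        ctx->deadlock_inject_countdown = tmp;
        return 1;
    }

    int main(void)
    {
        struct inject_ctx ctx = { .deadlock_inject_interval = 1,
                                  .deadlock_inject_countdown = 1 };

        for (int i = 0; i < 40; i++)
            if (maybe_inject(&ctx))
                printf("inject at acquisition #%d, next interval %u\n",
                       i, ctx.deadlock_inject_interval);
        return 0;
    }

Printing each injection point shows the interval growing geometrically, which keeps the fake -EDEADLK pressure high early on and negligible once a workload has survived many acquisitions.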
20145 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
20146 index 4f5f83c7d2d3..289f062f26cd 100644
20147 --- a/kernel/locking/rtmutex_common.h
20148 +++ b/kernel/locking/rtmutex_common.h
20149 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
20150         struct rb_node          pi_tree_entry;
20151         struct task_struct      *task;
20152         struct rt_mutex         *lock;
20153 +       bool                    savestate;
20154  #ifdef CONFIG_DEBUG_RT_MUTEXES
20155         unsigned long           ip;
20156         struct pid              *deadlock_task_pid;
20157 @@ -97,6 +98,9 @@ enum rtmutex_chainwalk {
20158  /*
20159   * PI-futex support (proxy locking functions, etc.):
20160   */
20161 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
20162 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
20163 +
20164  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
20165  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20166                                        struct task_struct *proxy_owner);
20167 @@ -110,7 +114,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20168                                       struct rt_mutex_waiter *waiter);
20169  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
20170  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
20171 -                                 struct wake_q_head *wqh);
20172 +                                 struct wake_q_head *wqh,
20173 +                                 struct wake_q_head *wq_sleeper);
20174  extern void rt_mutex_adjust_prio(struct task_struct *task);
20175  
20176  #ifdef CONFIG_DEBUG_RT_MUTEXES
20177 @@ -119,4 +124,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
20178  # include "rtmutex.h"
20179  #endif
20180  
20181 +static inline void
20182 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
20183 +{
20184 +       debug_rt_mutex_init_waiter(waiter);
20185 +       waiter->task = NULL;
20186 +       waiter->savestate = savestate;
20187 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
20188 +       RB_CLEAR_NODE(&waiter->tree_entry);
20189 +}
20190 +
20191  #endif
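
PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS overload the pi_blocked_on pointer: the constants 1 and 2 can never be real waiter addresses, so callers such as rt_mutex_real_waiter() in the rtmutex.c hunks above can tell transient futex-requeue state apart from a genuine waiter with plain comparisons. A self-contained sketch of the same sentinel-pointer idiom follows; struct waiter, the macro names and real_waiter() are simplified stand-ins, not the kernel definitions.

    /* Sentinel-pointer sketch: 1 and 2 are never valid object addresses,
     * so they can mark transient states in a field that normally stores
     * a waiter pointer. */
    #include <stdbool.h>
    #include <stdio.h>

    struct waiter { int prio; };

    #define WAKEUP_INPROGRESS   ((struct waiter *)1)
    #define REQUEUE_INPROGRESS  ((struct waiter *)2)

    /* Only NULL-and-sentinel-free values count as real waiters. */
    static bool real_waiter(struct waiter *w)
    {
        return w && w != WAKEUP_INPROGRESS && w != REQUEUE_INPROGRESS;
    }

    int main(void)
    {
        struct waiter w = { .prio = 10 };

        printf("%d %d %d %d\n",
               real_waiter(NULL),                 /* 0: not blocked        */
               real_waiter(WAKEUP_INPROGRESS),    /* 0: wakeup in progress */
               real_waiter(REQUEUE_INPROGRESS),   /* 0: requeue in flight  */
               real_waiter(&w));                  /* 1: genuine waiter     */
        return 0;
    }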
20192 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20193 index db3ccb1dd614..909779647bd1 100644
20194 --- a/kernel/locking/spinlock.c
20195 +++ b/kernel/locking/spinlock.c
20196 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
20197   *         __[spin|read|write]_lock_bh()
20198   */
20199  BUILD_LOCK_OPS(spin, raw_spinlock);
20200 +
20201 +#ifndef CONFIG_PREEMPT_RT_FULL
20202  BUILD_LOCK_OPS(read, rwlock);
20203  BUILD_LOCK_OPS(write, rwlock);
20204 +#endif
20205  
20206  #endif
20207  
20208 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20209  EXPORT_SYMBOL(_raw_spin_unlock_bh);
20210  #endif
20211  
20212 +#ifndef CONFIG_PREEMPT_RT_FULL
20213 +
20214  #ifndef CONFIG_INLINE_READ_TRYLOCK
20215  int __lockfunc _raw_read_trylock(rwlock_t *lock)
20216  {
20217 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20218  EXPORT_SYMBOL(_raw_write_unlock_bh);
20219  #endif
20220  
20221 +#endif /* !PREEMPT_RT_FULL */
20222 +
20223  #ifdef CONFIG_DEBUG_LOCK_ALLOC
20224  
20225  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20226 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20227 index 0374a596cffa..94970338d518 100644
20228 --- a/kernel/locking/spinlock_debug.c
20229 +++ b/kernel/locking/spinlock_debug.c
20230 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20231  
20232  EXPORT_SYMBOL(__raw_spin_lock_init);
20233  
20234 +#ifndef CONFIG_PREEMPT_RT_FULL
20235  void __rwlock_init(rwlock_t *lock, const char *name,
20236                    struct lock_class_key *key)
20237  {
20238 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20239  }
20240  
20241  EXPORT_SYMBOL(__rwlock_init);
20242 +#endif
20243  
20244  static void spin_dump(raw_spinlock_t *lock, const char *msg)
20245  {
20246 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20247         arch_spin_unlock(&lock->raw_lock);
20248  }
20249  
20250 +#ifndef CONFIG_PREEMPT_RT_FULL
20251  static void rwlock_bug(rwlock_t *lock, const char *msg)
20252  {
20253         if (!debug_locks_off())
20254 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20255         debug_write_unlock(lock);
20256         arch_write_unlock(&lock->raw_lock);
20257  }
20258 +
20259 +#endif
20260 diff --git a/kernel/panic.c b/kernel/panic.c
20261 index 41e2b54f36b5..3535f802953a 100644
20262 --- a/kernel/panic.c
20263 +++ b/kernel/panic.c
20264 @@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
20265                 cpu_relax();
20266  }
20267  
20268 +/*
20269 + * Stop ourselves in NMI context if another CPU has already panicked. Arch code
20270 + * may override this to prepare for crash dumping, e.g. save regs info.
20271 + */
20272 +void __weak nmi_panic_self_stop(struct pt_regs *regs)
20273 +{
20274 +       panic_smp_self_stop();
20275 +}
20276 +
20277 +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
20278 +
20279 +/*
20280 + * A variant of panic() called from NMI context. We return if we've already
20281 + * panicked on this CPU. If another CPU already panicked, loop in
20282 + * nmi_panic_self_stop() which can provide architecture dependent code such
20283 + * as saving register state for crash dump.
20284 + */
20285 +void nmi_panic(struct pt_regs *regs, const char *msg)
20286 +{
20287 +       int old_cpu, cpu;
20288 +
20289 +       cpu = raw_smp_processor_id();
20290 +       old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
20291 +
20292 +       if (old_cpu == PANIC_CPU_INVALID)
20293 +               panic("%s", msg);
20294 +       else if (old_cpu != cpu)
20295 +               nmi_panic_self_stop(regs);
20296 +}
20297 +EXPORT_SYMBOL(nmi_panic);
20298 +
20299  /**
20300   *     panic - halt the system
20301   *     @fmt: The text string to print
20302 @@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
20303   */
20304  void panic(const char *fmt, ...)
20305  {
20306 -       static DEFINE_SPINLOCK(panic_lock);
20307         static char buf[1024];
20308         va_list args;
20309         long i, i_next = 0;
20310         int state = 0;
20311 +       int old_cpu, this_cpu;
20312  
20313         /*
20314          * Disable local interrupts. This will prevent panic_smp_self_stop
20315          * from deadlocking the first cpu that invokes the panic, since
20316          * there is nothing to prevent an interrupt handler (that runs
20317 -        * after the panic_lock is acquired) from invoking panic again.
20318 +        * after setting panic_cpu) from invoking panic() again.
20319          */
20320         local_irq_disable();
20321  
20322 @@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
20323          * multiple parallel invocations of panic, all other CPUs either
20324          * stop themself or will wait until they are stopped by the 1st CPU
20325          * with smp_send_stop().
20326 +        *
20327 +        * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
20328 +        * comes here, so go ahead.
20329 +        * `old_cpu == this_cpu' means we came from nmi_panic() which sets
20330 +        * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
20331          */
20332 -       if (!spin_trylock(&panic_lock))
20333 +       this_cpu = raw_smp_processor_id();
20334 +       old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
20335 +
20336 +       if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
20337                 panic_smp_self_stop();
20338  
20339         console_verbose();
20340 @@ -400,9 +439,11 @@ static u64 oops_id;
20341  
20342  static int init_oops_id(void)
20343  {
20344 +#ifndef CONFIG_PREEMPT_RT_FULL
20345         if (!oops_id)
20346                 get_random_bytes(&oops_id, sizeof(oops_id));
20347         else
20348 +#endif
20349                 oops_id++;
20350  
20351         return 0;
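
The panic() rework above replaces the old panic_lock with an atomic election: the first CPU to cmpxchg its id into panic_cpu wins, a losing CPU parks itself, and a CPU that already won (the nmi_panic() re-entry case) simply returns. A user-space model of that election, with C11 atomics and threads standing in for CPUs, is sketched below; the thread setup and printouts are illustrative only.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define PANIC_CPU_INVALID -1

    static atomic_int panic_cpu = ATOMIC_VAR_INIT(PANIC_CPU_INVALID);

    /* Only the first caller "panics"; later callers from other CPUs would
     * stop themselves, and re-entry on the winning CPU just returns. */
    static void try_panic(int cpu)
    {
        int old = PANIC_CPU_INVALID;

        if (atomic_compare_exchange_strong(&panic_cpu, &old, cpu))
            printf("cpu %d wins the panic election\n", cpu);
        else if (old != cpu)
            printf("cpu %d loses, would call panic_smp_self_stop()\n", cpu);
        else
            printf("cpu %d re-entered panic, returning\n", cpu);
    }

    static void *worker(void *arg)
    {
        try_panic((int)(long)arg);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];

        for (long i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;
    }

The lock-free election matters because nmi_panic() runs in NMI context, where taking even a trylock-style spinlock that another CPU might hold forever is not an option.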
20352 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20353 index 3124cebaec31..c1b981521dd0 100644
20354 --- a/kernel/power/hibernate.c
20355 +++ b/kernel/power/hibernate.c
20356 @@ -285,6 +285,8 @@ static int create_image(int platform_mode)
20357  
20358         local_irq_disable();
20359  
20360 +       system_state = SYSTEM_SUSPEND;
20361 +
20362         error = syscore_suspend();
20363         if (error) {
20364                 printk(KERN_ERR "PM: Some system devices failed to power down, "
20365 @@ -314,6 +316,7 @@ static int create_image(int platform_mode)
20366         syscore_resume();
20367  
20368   Enable_irqs:
20369 +       system_state = SYSTEM_RUNNING;
20370         local_irq_enable();
20371  
20372   Enable_cpus:
20373 @@ -438,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
20374                 goto Enable_cpus;
20375  
20376         local_irq_disable();
20377 +       system_state = SYSTEM_SUSPEND;
20378  
20379         error = syscore_suspend();
20380         if (error)
20381 @@ -471,6 +475,7 @@ static int resume_target_kernel(bool platform_mode)
20382         syscore_resume();
20383  
20384   Enable_irqs:
20385 +       system_state = SYSTEM_RUNNING;
20386         local_irq_enable();
20387  
20388   Enable_cpus:
20389 @@ -556,6 +561,7 @@ int hibernation_platform_enter(void)
20390                 goto Enable_cpus;
20391  
20392         local_irq_disable();
20393 +       system_state = SYSTEM_SUSPEND;
20394         syscore_suspend();
20395         if (pm_wakeup_pending()) {
20396                 error = -EAGAIN;
20397 @@ -568,6 +574,7 @@ int hibernation_platform_enter(void)
20398  
20399   Power_up:
20400         syscore_resume();
20401 +       system_state = SYSTEM_RUNNING;
20402         local_irq_enable();
20403  
20404   Enable_cpus:
20405 @@ -642,6 +649,10 @@ static void power_down(void)
20406                 cpu_relax();
20407  }
20408  
20409 +#ifndef CONFIG_SUSPEND
20410 +bool pm_in_action;
20411 +#endif
20412 +
20413  /**
20414   * hibernate - Carry out system hibernation, including saving the image.
20415   */
20416 @@ -654,6 +665,8 @@ int hibernate(void)
20417                 return -EPERM;
20418         }
20419  
20420 +       pm_in_action = true;
20421 +
20422         lock_system_sleep();
20423         /* The snapshot device should not be opened while we're running */
20424         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
20425 @@ -719,6 +732,7 @@ int hibernate(void)
20426         atomic_inc(&snapshot_device_available);
20427   Unlock:
20428         unlock_system_sleep();
20429 +       pm_in_action = false;
20430         return error;
20431  }
20432  
20433 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20434 index f9fe133c13e2..393bc342c586 100644
20435 --- a/kernel/power/suspend.c
20436 +++ b/kernel/power/suspend.c
20437 @@ -359,6 +359,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20438         arch_suspend_disable_irqs();
20439         BUG_ON(!irqs_disabled());
20440  
20441 +       system_state = SYSTEM_SUSPEND;
20442 +
20443         error = syscore_suspend();
20444         if (!error) {
20445                 *wakeup = pm_wakeup_pending();
20446 @@ -375,6 +377,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20447                 syscore_resume();
20448         }
20449  
20450 +       system_state = SYSTEM_RUNNING;
20451 +
20452         arch_suspend_enable_irqs();
20453         BUG_ON(irqs_disabled());
20454  
20455 @@ -518,6 +522,8 @@ static int enter_state(suspend_state_t state)
20456         return error;
20457  }
20458  
20459 +bool pm_in_action;
20460 +
20461  /**
20462   * pm_suspend - Externally visible function for suspending the system.
20463   * @state: System sleep state to enter.
20464 @@ -532,6 +538,8 @@ int pm_suspend(suspend_state_t state)
20465         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20466                 return -EINVAL;
20467  
20468 +       pm_in_action = true;
20469 +
20470         error = enter_state(state);
20471         if (error) {
20472                 suspend_stats.fail++;
20473 @@ -539,6 +547,7 @@ int pm_suspend(suspend_state_t state)
20474         } else {
20475                 suspend_stats.success++;
20476         }
20477 +       pm_in_action = false;
20478         return error;
20479  }
20480  EXPORT_SYMBOL(pm_suspend);
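
Both the hibernate and suspend paths bracket syscore_suspend()/syscore_resume() with system_state = SYSTEM_SUSPEND ... SYSTEM_RUNNING, and the top-level entry points additionally set pm_in_action for the whole operation, so other code can tell that a suspend transition is in flight. The sketch below models only that bracketing pattern; enter_low_level() and do_suspend() are invented stand-ins for the kernel functions.

    #include <stdbool.h>
    #include <stdio.h>

    enum sys_state { SYSTEM_RUNNING, SYSTEM_SUSPEND };

    static enum sys_state system_state = SYSTEM_RUNNING;
    static bool pm_in_action;

    /* Stand-in for the syscore phase: code running here can consult
     * system_state to relax checks that only make sense while running. */
    static int enter_low_level(void)
    {
        printf("low-level suspend, system_state=%d\n", system_state);
        return 0;
    }

    static int do_suspend(void)
    {
        int error;

        pm_in_action = true;            /* flag for the whole operation  */

        system_state = SYSTEM_SUSPEND;  /* bracket the critical section  */
        error = enter_low_level();
        system_state = SYSTEM_RUNNING;

        pm_in_action = false;
        return error;
    }

    int main(void)
    {
        return do_suspend();
    }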
20481 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20482 index c048e34b177f..c747bdfa199e 100644
20483 --- a/kernel/printk/printk.c
20484 +++ b/kernel/printk/printk.c
20485 @@ -241,6 +241,65 @@ struct printk_log {
20486   */
20487  static DEFINE_RAW_SPINLOCK(logbuf_lock);
20488  
20489 +#ifdef CONFIG_EARLY_PRINTK
20490 +struct console *early_console;
20491 +
20492 +static void early_vprintk(const char *fmt, va_list ap)
20493 +{
20494 +       if (early_console) {
20495 +               char buf[512];
20496 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20497 +
20498 +               early_console->write(early_console, buf, n);
20499 +       }
20500 +}
20501 +
20502 +asmlinkage void early_printk(const char *fmt, ...)
20503 +{
20504 +       va_list ap;
20505 +
20506 +       va_start(ap, fmt);
20507 +       early_vprintk(fmt, ap);
20508 +       va_end(ap);
20509 +}
20510 +
20511 +/*
20512 + * This is independent of any log levels - a global
20513 + * kill switch that turns off all of printk.
20514 + *
20515 + * Used by the NMI watchdog if early-printk is enabled.
20516 + */
20517 +static bool __read_mostly printk_killswitch;
20518 +
20519 +static int __init force_early_printk_setup(char *str)
20520 +{
20521 +       printk_killswitch = true;
20522 +       return 0;
20523 +}
20524 +early_param("force_early_printk", force_early_printk_setup);
20525 +
20526 +void printk_kill(void)
20527 +{
20528 +       printk_killswitch = true;
20529 +}
20530 +
20531 +#ifdef CONFIG_PRINTK
20532 +static int forced_early_printk(const char *fmt, va_list ap)
20533 +{
20534 +       if (!printk_killswitch)
20535 +               return 0;
20536 +       early_vprintk(fmt, ap);
20537 +       return 1;
20538 +}
20539 +#endif
20540 +
20541 +#else
20542 +static inline int forced_early_printk(const char *fmt, va_list ap)
20543 +{
20544 +       return 0;
20545 +}
20546 +#endif
20547 +
20548  #ifdef CONFIG_PRINTK
20549  DECLARE_WAIT_QUEUE_HEAD(log_wait);
20550  /* the next printk record to read by syslog(READ) or /proc/kmsg */
20551 @@ -1203,6 +1262,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20552  {
20553         char *text;
20554         int len = 0;
20555 +       int attempts = 0;
20556  
20557         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20558         if (!text)
20559 @@ -1214,7 +1274,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20560                 u64 seq;
20561                 u32 idx;
20562                 enum log_flags prev;
20563 -
20564 +               int num_msg;
20565 +try_again:
20566 +               attempts++;
20567 +               if (attempts > 10) {
20568 +                       len = -EBUSY;
20569 +                       goto out;
20570 +               }
20571 +               num_msg = 0;
20572                 if (clear_seq < log_first_seq) {
20573                         /* messages are gone, move to first available one */
20574                         clear_seq = log_first_seq;
20575 @@ -1235,6 +1302,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20576                         prev = msg->flags;
20577                         idx = log_next(idx);
20578                         seq++;
20579 +                       num_msg++;
20580 +                       if (num_msg > 5) {
20581 +                               num_msg = 0;
20582 +                               raw_spin_unlock_irq(&logbuf_lock);
20583 +                               raw_spin_lock_irq(&logbuf_lock);
20584 +                               if (clear_seq < log_first_seq)
20585 +                                       goto try_again;
20586 +                       }
20587                 }
20588  
20589                 /* move first record forward until length fits into the buffer */
20590 @@ -1248,6 +1323,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20591                         prev = msg->flags;
20592                         idx = log_next(idx);
20593                         seq++;
20594 +                       num_msg++;
20595 +                       if (num_msg > 5) {
20596 +                               num_msg = 0;
20597 +                               raw_spin_unlock_irq(&logbuf_lock);
20598 +                               raw_spin_lock_irq(&logbuf_lock);
20599 +                               if (clear_seq < log_first_seq)
20600 +                                       goto try_again;
20601 +                       }
20602                 }
20603  
20604                 /* last message fitting into this dump */
20605 @@ -1288,6 +1371,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20606                 clear_seq = log_next_seq;
20607                 clear_idx = log_next_idx;
20608         }
20609 +out:
20610         raw_spin_unlock_irq(&logbuf_lock);
20611  
20612         kfree(text);
20613 @@ -1443,6 +1527,12 @@ static void call_console_drivers(int level,
20614         if (!console_drivers)
20615                 return;
20616  
20617 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20618 +               if (in_irq() || in_nmi())
20619 +                       return;
20620 +       }
20621 +
20622 +       migrate_disable();
20623         for_each_console(con) {
20624                 if (exclusive_console && con != exclusive_console)
20625                         continue;
20626 @@ -1458,6 +1548,7 @@ static void call_console_drivers(int level,
20627                 else
20628                         con->write(con, text, len);
20629         }
20630 +       migrate_enable();
20631  }
20632  
20633  /*
20634 @@ -1518,6 +1609,15 @@ static inline int can_use_console(unsigned int cpu)
20635  static int console_trylock_for_printk(void)
20636  {
20637         unsigned int cpu = smp_processor_id();
20638 +#ifdef CONFIG_PREEMPT_RT_FULL
20639 +       int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
20640 +               !irqs_disabled();
20641 +#else
20642 +       int lock = 1;
20643 +#endif
20644 +
20645 +       if (!lock)
20646 +               return 0;
20647  
20648         if (!console_trylock())
20649                 return 0;
20650 @@ -1672,6 +1772,13 @@ asmlinkage int vprintk_emit(int facility, int level,
20651         /* cpu currently holding logbuf_lock in this function */
20652         static unsigned int logbuf_cpu = UINT_MAX;
20653  
20654 +       /*
20655 +        * Fall back to early_printk if a debugging subsystem has
20656 +        * killed printk output
20657 +        */
20658 +       if (unlikely(forced_early_printk(fmt, args)))
20659 +               return 1;
20660 +
20661         if (level == LOGLEVEL_SCHED) {
20662                 level = LOGLEVEL_DEFAULT;
20663                 in_sched = true;
20664 @@ -1813,8 +1920,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20665                  * console_sem which would prevent anyone from printing to
20666                  * console
20667                  */
20668 -               preempt_disable();
20669 -
20670 +               migrate_disable();
20671                 /*
20672                  * Try to acquire and then immediately release the console
20673                  * semaphore.  The release will print out buffers and wake up
20674 @@ -1822,7 +1928,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20675                  */
20676                 if (console_trylock_for_printk())
20677                         console_unlock();
20678 -               preempt_enable();
20679 +               migrate_enable();
20680                 lockdep_on();
20681         }
20682  
20683 @@ -1961,26 +2067,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
20684  
20685  #endif /* CONFIG_PRINTK */
20686  
20687 -#ifdef CONFIG_EARLY_PRINTK
20688 -struct console *early_console;
20689 -
20690 -asmlinkage __visible void early_printk(const char *fmt, ...)
20691 -{
20692 -       va_list ap;
20693 -       char buf[512];
20694 -       int n;
20695 -
20696 -       if (!early_console)
20697 -               return;
20698 -
20699 -       va_start(ap, fmt);
20700 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
20701 -       va_end(ap);
20702 -
20703 -       early_console->write(early_console, buf, n);
20704 -}
20705 -#endif
20706 -
20707  static int __add_preferred_console(char *name, int idx, char *options,
20708                                    char *brl_options)
20709  {
20710 @@ -2202,11 +2288,16 @@ static void console_cont_flush(char *text, size_t size)
20711                 goto out;
20712  
20713         len = cont_print_text(text, size);
20714 +#ifdef CONFIG_PREEMPT_RT_FULL
20715 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20716 +       call_console_drivers(cont.level, NULL, 0, text, len);
20717 +#else
20718         raw_spin_unlock(&logbuf_lock);
20719         stop_critical_timings();
20720         call_console_drivers(cont.level, NULL, 0, text, len);
20721         start_critical_timings();
20722         local_irq_restore(flags);
20723 +#endif
20724         return;
20725  out:
20726         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20727 @@ -2316,13 +2407,17 @@ skip:
20728                 console_idx = log_next(console_idx);
20729                 console_seq++;
20730                 console_prev = msg->flags;
20731 +#ifdef CONFIG_PREEMPT_RT_FULL
20732 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20733 +               call_console_drivers(level, ext_text, ext_len, text, len);
20734 +#else
20735                 raw_spin_unlock(&logbuf_lock);
20736  
20737                 stop_critical_timings();        /* don't trace print latency */
20738                 call_console_drivers(level, ext_text, ext_len, text, len);
20739                 start_critical_timings();
20740                 local_irq_restore(flags);
20741 -
20742 +#endif
20743                 if (do_cond_resched)
20744                         cond_resched();
20745         }
20746 @@ -2374,6 +2469,11 @@ void console_unblank(void)
20747  {
20748         struct console *c;
20749  
20750 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20751 +               if (in_irq() || in_nmi())
20752 +                       return;
20753 +       }
20754 +
20755         /*
20756          * console_unblank can no longer be called in interrupt context unless
20757          * oops_in_progress is set to 1..
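
syslog_print_all() now drops and immediately re-takes logbuf_lock every few records and restarts (at most 10 times, returning -EBUSY) if the log head moved while the lock was released, which bounds how long the raw lock is held with interrupts off. A user-space model of that lock-break-and-revalidate loop is sketched below using a pthread mutex; the record walk, names and limits are illustrative assumptions.

    #include <pthread.h>

    static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long log_first_seq;     /* advanced by a producer elsewhere */

    /* Walk `total` records, breaking the lock every `chunk` records.  If the
     * records we still need were overwritten while the lock was dropped,
     * start over; give up after max_attempts to stay bounded. */
    static int walk_log(unsigned long start_seq, int total, int chunk,
                        int max_attempts)
    {
        int attempts = 0;

        pthread_mutex_lock(&log_lock);
    retry:
        if (++attempts > max_attempts) {
            pthread_mutex_unlock(&log_lock);
            return -1;                      /* -EBUSY in the patch */
        }

        unsigned long seq = start_seq;
        if (seq < log_first_seq)
            seq = log_first_seq;            /* oldest records are gone */

        for (int done = 0; done < total; done++, seq++) {
            /* ... format/copy one record here ... */
            if ((done + 1) % chunk == 0) {
                pthread_mutex_unlock(&log_lock);
                /* latency-sensitive work can run here */
                pthread_mutex_lock(&log_lock);
                if (seq < log_first_seq)
                    goto retry;             /* head moved past us */
            }
        }

        pthread_mutex_unlock(&log_lock);
        return 0;
    }

    int main(void)
    {
        return walk_log(0, 20, 5, 10);
    }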
20758 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20759 index 3189e51db7e8..1004af706be7 100644
20760 --- a/kernel/ptrace.c
20761 +++ b/kernel/ptrace.c
20762 @@ -129,7 +129,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
20763  
20764         spin_lock_irq(&task->sighand->siglock);
20765         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20766 -               task->state = __TASK_TRACED;
20767 +               unsigned long flags;
20768 +
20769 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
20770 +               if (task->state & __TASK_TRACED)
20771 +                       task->state = __TASK_TRACED;
20772 +               else
20773 +                       task->saved_state = __TASK_TRACED;
20774 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20775                 ret = true;
20776         }
20777         spin_unlock_irq(&task->sighand->siglock);
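
Under RT a traced task may currently be sleeping on a converted spinlock, in which case its real state has been parked in task->saved_state. The ptrace_freeze_traced() change therefore writes __TASK_TRACED into whichever field currently holds the traced state. The fragment below models only that two-field update; the struct and constants are simplified stand-ins for task_struct, not the kernel's values.

    #include <stdio.h>

    #define TASK_TRACED     0x0008  /* illustrative values, not the kernel's */
    #define TASK_LOCKSLEEP  0x0002  /* stand-in for sleeping-spinlock state  */

    struct task {
        unsigned int state;         /* what the scheduler sees right now     */
        unsigned int saved_state;   /* real state parked by RT spinlocks     */
    };

    /* Freeze the tracee: if it is really TRACED right now, pin state;
     * otherwise it is blocked on an RT lock and TRACED lives in saved_state. */
    static void freeze_traced(struct task *t)
    {
        if (t->state & TASK_TRACED)
            t->state = TASK_TRACED;
        else
            t->saved_state = TASK_TRACED;
    }

    int main(void)
    {
        struct task on_lock = { .state = TASK_LOCKSLEEP,
                                .saved_state = TASK_TRACED };

        freeze_traced(&on_lock);
        printf("state=%#x saved_state=%#x\n",
               on_lock.state, on_lock.saved_state);
        return 0;
    }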
20778 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20779 index d89328e260df..5bb3364a6284 100644
20780 --- a/kernel/rcu/rcutorture.c
20781 +++ b/kernel/rcu/rcutorture.c
20782 @@ -390,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = {
20783         .name           = "rcu"
20784  };
20785  
20786 +#ifndef CONFIG_PREEMPT_RT_FULL
20787  /*
20788   * Definitions for rcu_bh torture testing.
20789   */
20790 @@ -429,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
20791         .name           = "rcu_bh"
20792  };
20793  
20794 +#else
20795 +static struct rcu_torture_ops rcu_bh_ops = {
20796 +       .ttype          = INVALID_RCU_FLAVOR,
20797 +};
20798 +#endif
20799 +
20800  /*
20801   * Don't even think about trying any of these in real life!!!
20802   * The names includes "busted", and they really means it!
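
Since rcu_bh does not exist as a separate flavor under PREEMPT_RT_FULL, rcutorture keeps the rcu_bh_ops symbol but marks it with an invalid type so the generic torture code skips it without further #ifdefs. A minimal sketch of that stub-descriptor-under-config pattern is shown below; CONFIG_RT_FULL_MODEL and torture_ops are invented names used only for illustration.

    #include <stdio.h>

    enum flavor { INVALID_FLAVOR = 0, REAL_FLAVOR };

    struct torture_ops {
        enum flavor type;
        void (*run)(void);
    };

    #ifndef CONFIG_RT_FULL_MODEL    /* stand-in for CONFIG_PREEMPT_RT_FULL */
    static void real_run(void) { printf("running real flavor test\n"); }
    static struct torture_ops bh_ops = { .type = REAL_FLAVOR, .run = real_run };
    #else
    /* Flavor compiled out: keep the symbol, mark it invalid. */
    static struct torture_ops bh_ops = { .type = INVALID_FLAVOR };
    #endif

    int main(void)
    {
        if (bh_ops.type != INVALID_FLAVOR)  /* generic code checks the type */
            bh_ops.run();
        else
            printf("flavor not available in this configuration\n");
        return 0;
    }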
20803 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
20804 index f07343b54fe5..d862a203fce0 100644
20805 --- a/kernel/rcu/tree.c
20806 +++ b/kernel/rcu/tree.c
20807 @@ -56,6 +56,11 @@
20808  #include <linux/random.h>
20809  #include <linux/trace_events.h>
20810  #include <linux/suspend.h>
20811 +#include <linux/delay.h>
20812 +#include <linux/gfp.h>
20813 +#include <linux/oom.h>
20814 +#include <linux/smpboot.h>
20815 +#include "../time/tick-internal.h"
20816  
20817  #include "tree.h"
20818  #include "rcu.h"
20819 @@ -266,6 +271,19 @@ void rcu_sched_qs(void)
20820         }
20821  }
20822  
20823 +#ifdef CONFIG_PREEMPT_RT_FULL
20824 +static void rcu_preempt_qs(void);
20825 +
20826 +void rcu_bh_qs(void)
20827 +{
20828 +       unsigned long flags;
20829 +
20830 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
20831 +       local_irq_save(flags);
20832 +       rcu_preempt_qs();
20833 +       local_irq_restore(flags);
20834 +}
20835 +#else
20836  void rcu_bh_qs(void)
20837  {
20838         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
20839 @@ -275,6 +293,7 @@ void rcu_bh_qs(void)
20840                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20841         }
20842  }
20843 +#endif
20844  
20845  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
20846  
20847 @@ -435,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
20848  /*
20849   * Return the number of RCU BH batches started thus far for debug & stats.
20850   */
20851 +#ifndef CONFIG_PREEMPT_RT_FULL
20852  unsigned long rcu_batches_started_bh(void)
20853  {
20854         return rcu_bh_state.gpnum;
20855  }
20856  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20857 +#endif
20858  
20859  /*
20860   * Return the number of RCU batches completed thus far for debug & stats.
20861 @@ -459,6 +480,7 @@ unsigned long rcu_batches_completed_sched(void)
20862  }
20863  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20864  
20865 +#ifndef CONFIG_PREEMPT_RT_FULL
20866  /*
20867   * Return the number of RCU BH batches completed thus far for debug & stats.
20868   */
20869 @@ -486,6 +508,13 @@ void rcu_bh_force_quiescent_state(void)
20870  }
20871  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
20872  
20873 +#else
20874 +void rcu_force_quiescent_state(void)
20875 +{
20876 +}
20877 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
20878 +#endif
20879 +
20880  /*
20881   * Force a quiescent state for RCU-sched.
20882   */
20883 @@ -536,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
20884         case RCU_FLAVOR:
20885                 rsp = rcu_state_p;
20886                 break;
20887 +#ifndef CONFIG_PREEMPT_RT_FULL
20888         case RCU_BH_FLAVOR:
20889                 rsp = &rcu_bh_state;
20890                 break;
20891 +#endif
20892         case RCU_SCHED_FLAVOR:
20893                 rsp = &rcu_sched_state;
20894                 break;
20895 @@ -1590,7 +1621,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
20896         int needmore;
20897         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
20898  
20899 -       rcu_nocb_gp_cleanup(rsp, rnp);
20900         rnp->need_future_gp[c & 0x1] = 0;
20901         needmore = rnp->need_future_gp[(c + 1) & 0x1];
20902         trace_rcu_future_gp(rnp, rdp, c,
20903 @@ -1611,7 +1641,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
20904             !READ_ONCE(rsp->gp_flags) ||
20905             !rsp->gp_kthread)
20906                 return;
20907 -       wake_up(&rsp->gp_wq);
20908 +       swake_up(&rsp->gp_wq);
20909  }
20910  
20911  /*
20912 @@ -1991,6 +2021,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
20913         int nocb = 0;
20914         struct rcu_data *rdp;
20915         struct rcu_node *rnp = rcu_get_root(rsp);
20916 +       struct swait_queue_head *sq;
20917  
20918         WRITE_ONCE(rsp->gp_activity, jiffies);
20919         raw_spin_lock_irq(&rnp->lock);
20920 @@ -2029,7 +2060,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
20921                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
20922                 /* smp_mb() provided by prior unlock-lock pair. */
20923                 nocb += rcu_future_gp_cleanup(rsp, rnp);
20924 +               sq = rcu_nocb_gp_get(rnp);
20925                 raw_spin_unlock_irq(&rnp->lock);
20926 +               rcu_nocb_gp_cleanup(sq);
20927                 cond_resched_rcu_qs();
20928                 WRITE_ONCE(rsp->gp_activity, jiffies);
20929                 rcu_gp_slow(rsp, gp_cleanup_delay);
20930 @@ -2076,7 +2109,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
20931                                                READ_ONCE(rsp->gpnum),
20932                                                TPS("reqwait"));
20933                         rsp->gp_state = RCU_GP_WAIT_GPS;
20934 -                       wait_event_interruptible(rsp->gp_wq,
20935 +                       swait_event_interruptible(rsp->gp_wq,
20936                                                  READ_ONCE(rsp->gp_flags) &
20937                                                  RCU_GP_FLAG_INIT);
20938                         rsp->gp_state = RCU_GP_DONE_GPS;
20939 @@ -2106,7 +2139,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
20940                                                READ_ONCE(rsp->gpnum),
20941                                                TPS("fqswait"));
20942                         rsp->gp_state = RCU_GP_WAIT_FQS;
20943 -                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
20944 +                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
20945                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
20946                         rsp->gp_state = RCU_GP_DOING_FQS;
20947                         /* Locking provides needed memory barriers. */
20948 @@ -2230,7 +2263,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
20949         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
20950         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
20951         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
20952 -       rcu_gp_kthread_wake(rsp);
20953 +       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
20954  }
20955  
20956  /*
20957 @@ -2891,7 +2924,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
20958         }
20959         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
20960         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
20961 -       rcu_gp_kthread_wake(rsp);
20962 +       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
20963  }
20964  
20965  /*
20966 @@ -2934,18 +2967,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
20967  /*
20968   * Do RCU core processing for the current CPU.
20969   */
20970 -static void rcu_process_callbacks(struct softirq_action *unused)
20971 +static void rcu_process_callbacks(void)
20972  {
20973         struct rcu_state *rsp;
20974  
20975         if (cpu_is_offline(smp_processor_id()))
20976                 return;
20977 -       trace_rcu_utilization(TPS("Start RCU core"));
20978         for_each_rcu_flavor(rsp)
20979                 __rcu_process_callbacks(rsp);
20980 -       trace_rcu_utilization(TPS("End RCU core"));
20981  }
20982  
20983 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
20984  /*
20985   * Schedule RCU callback invocation.  If the specified type of RCU
20986   * does not support RCU priority boosting, just do a direct call,
20987 @@ -2957,18 +2989,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
20988  {
20989         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
20990                 return;
20991 -       if (likely(!rsp->boost)) {
20992 -               rcu_do_batch(rsp, rdp);
20993 -               return;
20994 -       }
20995 -       invoke_rcu_callbacks_kthread();
20996 +       rcu_do_batch(rsp, rdp);
20997  }
20998  
20999 +static void rcu_wake_cond(struct task_struct *t, int status)
21000 +{
21001 +       /*
21002 +        * If the thread is yielding, only wake it when this
21003 +        * is invoked from idle.
21004 +        */
21005 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21006 +               wake_up_process(t);
21007 +}
21008 +
21009 +/*
21010 + * Wake up this CPU's rcuc kthread to do RCU core processing.
21011 + */
21012  static void invoke_rcu_core(void)
21013  {
21014 -       if (cpu_online(smp_processor_id()))
21015 -               raise_softirq(RCU_SOFTIRQ);
21016 +       unsigned long flags;
21017 +       struct task_struct *t;
21018 +
21019 +       if (!cpu_online(smp_processor_id()))
21020 +               return;
21021 +       local_irq_save(flags);
21022 +       __this_cpu_write(rcu_cpu_has_work, 1);
21023 +       t = __this_cpu_read(rcu_cpu_kthread_task);
21024 +       if (t != NULL && current != t)
21025 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21026 +       local_irq_restore(flags);
21027 +}
21028 +
21029 +static void rcu_cpu_kthread_park(unsigned int cpu)
21030 +{
21031 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21032 +}
21033 +
21034 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
21035 +{
21036 +       return __this_cpu_read(rcu_cpu_has_work);
21037 +}
21038 +
21039 +/*
21040 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21041 + * RCU softirq used in flavors and configurations of RCU that do not
21042 + * support RCU priority boosting.
21043 + */
21044 +static void rcu_cpu_kthread(unsigned int cpu)
21045 +{
21046 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21047 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21048 +       int spincnt;
21049 +
21050 +       for (spincnt = 0; spincnt < 10; spincnt++) {
21051 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21052 +               local_bh_disable();
21053 +               *statusp = RCU_KTHREAD_RUNNING;
21054 +               this_cpu_inc(rcu_cpu_kthread_loops);
21055 +               local_irq_disable();
21056 +               work = *workp;
21057 +               *workp = 0;
21058 +               local_irq_enable();
21059 +               if (work)
21060 +                       rcu_process_callbacks();
21061 +               local_bh_enable();
21062 +               if (*workp == 0) {
21063 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21064 +                       *statusp = RCU_KTHREAD_WAITING;
21065 +                       return;
21066 +               }
21067 +       }
21068 +       *statusp = RCU_KTHREAD_YIELDING;
21069 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21070 +       schedule_timeout_interruptible(2);
21071 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21072 +       *statusp = RCU_KTHREAD_WAITING;
21073 +}
21074 +
21075 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21076 +       .store                  = &rcu_cpu_kthread_task,
21077 +       .thread_should_run      = rcu_cpu_kthread_should_run,
21078 +       .thread_fn              = rcu_cpu_kthread,
21079 +       .thread_comm            = "rcuc/%u",
21080 +       .setup                  = rcu_cpu_kthread_setup,
21081 +       .park                   = rcu_cpu_kthread_park,
21082 +};
21083 +
21084 +/*
21085 + * Spawn per-CPU RCU core processing kthreads.
21086 + */
21087 +static int __init rcu_spawn_core_kthreads(void)
21088 +{
21089 +       int cpu;
21090 +
21091 +       for_each_possible_cpu(cpu)
21092 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
21093 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21094 +       return 0;
21095  }
21096 +early_initcall(rcu_spawn_core_kthreads);
21097  
21098  /*
21099   * Handle any core-RCU processing required by a call_rcu() invocation.
21100 @@ -3114,6 +3233,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21101  }
21102  EXPORT_SYMBOL_GPL(call_rcu_sched);
21103  
21104 +#ifndef CONFIG_PREEMPT_RT_FULL
21105  /*
21106   * Queue an RCU callback for invocation after a quicker grace period.
21107   */
21108 @@ -3122,6 +3242,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21109         __call_rcu(head, func, &rcu_bh_state, -1, 0);
21110  }
21111  EXPORT_SYMBOL_GPL(call_rcu_bh);
21112 +#endif
21113  
21114  /*
21115   * Queue an RCU callback for lazy invocation after a grace period.
21116 @@ -3213,6 +3334,7 @@ void synchronize_sched(void)
21117  }
21118  EXPORT_SYMBOL_GPL(synchronize_sched);
21119  
21120 +#ifndef CONFIG_PREEMPT_RT_FULL
21121  /**
21122   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21123   *
21124 @@ -3239,6 +3361,7 @@ void synchronize_rcu_bh(void)
21125                 wait_rcu_gp(call_rcu_bh);
21126  }
21127  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21128 +#endif
21129  
21130  /**
21131   * get_state_synchronize_rcu - Snapshot current RCU state
21132 @@ -3524,7 +3647,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
21133                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21134                         if (wake) {
21135                                 smp_mb(); /* EGP done before wake_up(). */
21136 -                               wake_up(&rsp->expedited_wq);
21137 +                               swake_up(&rsp->expedited_wq);
21138                         }
21139                         break;
21140                 }
21141 @@ -3781,7 +3904,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21142         jiffies_start = jiffies;
21143  
21144         for (;;) {
21145 -               ret = wait_event_interruptible_timeout(
21146 +               ret = swait_event_timeout(
21147                                 rsp->expedited_wq,
21148                                 sync_rcu_preempt_exp_done(rnp_root),
21149                                 jiffies_stall);
21150 @@ -3789,7 +3912,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21151                         return;
21152                 if (ret < 0) {
21153                         /* Hit a signal, disable CPU stall warnings. */
21154 -                       wait_event(rsp->expedited_wq,
21155 +                       swait_event(rsp->expedited_wq,
21156                                    sync_rcu_preempt_exp_done(rnp_root));
21157                         return;
21158                 }
21159 @@ -4101,6 +4224,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21160         mutex_unlock(&rsp->barrier_mutex);
21161  }
21162  
21163 +#ifndef CONFIG_PREEMPT_RT_FULL
21164  /**
21165   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21166   */
21167 @@ -4109,6 +4233,7 @@ void rcu_barrier_bh(void)
21168         _rcu_barrier(&rcu_bh_state);
21169  }
21170  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21171 +#endif
21172  
21173  /**
21174   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21175 @@ -4455,8 +4580,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
21176                 }
21177         }
21178  
21179 -       init_waitqueue_head(&rsp->gp_wq);
21180 -       init_waitqueue_head(&rsp->expedited_wq);
21181 +       init_swait_queue_head(&rsp->gp_wq);
21182 +       init_swait_queue_head(&rsp->expedited_wq);
21183         rnp = rsp->level[rcu_num_lvls - 1];
21184         for_each_possible_cpu(i) {
21185                 while (i > rnp->grphi)
21186 @@ -4576,12 +4701,13 @@ void __init rcu_init(void)
21187  
21188         rcu_bootup_announce();
21189         rcu_init_geometry();
21190 +#ifndef CONFIG_PREEMPT_RT_FULL
21191         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
21192 +#endif
21193         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
21194         if (dump_tree)
21195                 rcu_dump_rcu_node_tree(&rcu_sched_state);
21196         __rcu_init_preempt();
21197 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21198  
21199         /*
21200          * We don't need protection against CPU-hotplug here because
21201 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21202 index 9fb4e238d4dc..c75834d8de24 100644
21203 --- a/kernel/rcu/tree.h
21204 +++ b/kernel/rcu/tree.h
21205 @@ -27,6 +27,7 @@
21206  #include <linux/threads.h>
21207  #include <linux/cpumask.h>
21208  #include <linux/seqlock.h>
21209 +#include <linux/swait.h>
21210  #include <linux/stop_machine.h>
21211  
21212  /*
21213 @@ -241,7 +242,7 @@ struct rcu_node {
21214                                 /* Refused to boost: not sure why, though. */
21215                                 /*  This can happen due to race conditions. */
21216  #ifdef CONFIG_RCU_NOCB_CPU
21217 -       wait_queue_head_t nocb_gp_wq[2];
21218 +       struct swait_queue_head nocb_gp_wq[2];
21219                                 /* Place for rcu_nocb_kthread() to wait GP. */
21220  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
21221         int need_future_gp[2];
21222 @@ -393,7 +394,7 @@ struct rcu_data {
21223         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
21224         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
21225         struct rcu_head **nocb_follower_tail;
21226 -       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
21227 +       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
21228         struct task_struct *nocb_kthread;
21229         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
21230  
21231 @@ -472,7 +473,7 @@ struct rcu_state {
21232         unsigned long gpnum;                    /* Current gp number. */
21233         unsigned long completed;                /* # of last completed gp. */
21234         struct task_struct *gp_kthread;         /* Task for grace periods. */
21235 -       wait_queue_head_t gp_wq;                /* Where GP task waits. */
21236 +       struct swait_queue_head gp_wq;          /* Where GP task waits. */
21237         short gp_flags;                         /* Commands for GP task. */
21238         short gp_state;                         /* GP kthread sleep state. */
21239  
21240 @@ -504,7 +505,7 @@ struct rcu_state {
21241         atomic_long_t expedited_workdone3;      /* # done by others #3. */
21242         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
21243         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
21244 -       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
21245 +       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
21246         int ncpus_snap;                         /* # CPUs seen last time. */
21247  
21248         unsigned long jiffies_force_qs;         /* Time at which to invoke */
21249 @@ -556,18 +557,18 @@ extern struct list_head rcu_struct_flavors;
21250   */
21251  extern struct rcu_state rcu_sched_state;
21252  
21253 +#ifndef CONFIG_PREEMPT_RT_FULL
21254  extern struct rcu_state rcu_bh_state;
21255 +#endif
21256  
21257  #ifdef CONFIG_PREEMPT_RCU
21258  extern struct rcu_state rcu_preempt_state;
21259  #endif /* #ifdef CONFIG_PREEMPT_RCU */
21260  
21261 -#ifdef CONFIG_RCU_BOOST
21262  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21263  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21264  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21265  DECLARE_PER_CPU(char, rcu_cpu_has_work);
21266 -#endif /* #ifdef CONFIG_RCU_BOOST */
21267  
21268  #ifndef RCU_TREE_NONCORE
21269  
21270 @@ -587,10 +588,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
21271  static void __init __rcu_init_preempt(void);
21272  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21273  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21274 -static void invoke_rcu_callbacks_kthread(void);
21275  static bool rcu_is_callbacks_kthread(void);
21276 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21277  #ifdef CONFIG_RCU_BOOST
21278 -static void rcu_preempt_do_callbacks(void);
21279  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21280                                                  struct rcu_node *rnp);
21281  #endif /* #ifdef CONFIG_RCU_BOOST */
21282 @@ -607,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
21283  static void increment_cpu_stall_ticks(void);
21284  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
21285  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
21286 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
21287 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
21288 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
21289  static void rcu_init_one_nocb(struct rcu_node *rnp);
21290  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
21291                             bool lazy, unsigned long flags);
21292 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21293 index 630c19772630..8e119cf647ba 100644
21294 --- a/kernel/rcu/tree_plugin.h
21295 +++ b/kernel/rcu/tree_plugin.h
21296 @@ -24,25 +24,10 @@
21297   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21298   */
21299  
21300 -#include <linux/delay.h>
21301 -#include <linux/gfp.h>
21302 -#include <linux/oom.h>
21303 -#include <linux/smpboot.h>
21304 -#include "../time/tick-internal.h"
21305 -
21306  #ifdef CONFIG_RCU_BOOST
21307  
21308  #include "../locking/rtmutex_common.h"
21309  
21310 -/*
21311 - * Control variables for per-CPU and per-rcu_node kthreads.  These
21312 - * handle all flavors of RCU.
21313 - */
21314 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21315 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21316 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21317 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
21318 -
21319  #else /* #ifdef CONFIG_RCU_BOOST */
21320  
21321  /*
21322 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
21323  
21324  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21325  
21326 +/*
21327 + * Control variables for per-CPU and per-rcu_node kthreads.  These
21328 + * handle all flavors of RCU.
21329 + */
21330 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21331 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21332 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
21333 +
21334  #ifdef CONFIG_RCU_NOCB_CPU
21335  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21336  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
21337 @@ -432,7 +425,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21338         }
21339  
21340         /* Hardware IRQ handlers cannot block, complain if they get here. */
21341 -       if (in_irq() || in_serving_softirq()) {
21342 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21343                 lockdep_rcu_suspicious(__FILE__, __LINE__,
21344                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21345                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21346 @@ -645,15 +638,6 @@ static void rcu_preempt_check_callbacks(void)
21347                 t->rcu_read_unlock_special.b.need_qs = true;
21348  }
21349  
21350 -#ifdef CONFIG_RCU_BOOST
21351 -
21352 -static void rcu_preempt_do_callbacks(void)
21353 -{
21354 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21355 -}
21356 -
21357 -#endif /* #ifdef CONFIG_RCU_BOOST */
21358 -
21359  /*
21360   * Queue a preemptible-RCU callback for invocation after a grace period.
21361   */
21362 @@ -930,6 +914,19 @@ void exit_rcu(void)
21363  
21364  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21365  
21366 +/*
21367 + * If boosting, set rcuc kthreads to realtime priority.
21368 + */
21369 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21370 +{
21371 +#ifdef CONFIG_RCU_BOOST
21372 +       struct sched_param sp;
21373 +
21374 +       sp.sched_priority = kthread_prio;
21375 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21376 +#endif /* #ifdef CONFIG_RCU_BOOST */
21377 +}
21378 +
21379  #ifdef CONFIG_RCU_BOOST
21380  
21381  #include "../locking/rtmutex_common.h"
21382 @@ -961,16 +958,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
21383  
21384  #endif /* #else #ifdef CONFIG_RCU_TRACE */
21385  
21386 -static void rcu_wake_cond(struct task_struct *t, int status)
21387 -{
21388 -       /*
21389 -        * If the thread is yielding, only wake it when this
21390 -        * is invoked from idle
21391 -        */
21392 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21393 -               wake_up_process(t);
21394 -}
21395 -
21396  /*
21397   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21398   * or ->boost_tasks, advancing the pointer to the next task in the
21399 @@ -1115,23 +1102,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21400  }
21401  
21402  /*
21403 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21404 - */
21405 -static void invoke_rcu_callbacks_kthread(void)
21406 -{
21407 -       unsigned long flags;
21408 -
21409 -       local_irq_save(flags);
21410 -       __this_cpu_write(rcu_cpu_has_work, 1);
21411 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21412 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
21413 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21414 -                             __this_cpu_read(rcu_cpu_kthread_status));
21415 -       }
21416 -       local_irq_restore(flags);
21417 -}
21418 -
21419 -/*
21420   * Is the current CPU running the RCU-callbacks kthread?
21421   * Caller must have preemption disabled.
21422   */
21423 @@ -1186,67 +1156,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21424         return 0;
21425  }
21426  
21427 -static void rcu_kthread_do_work(void)
21428 -{
21429 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21430 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21431 -       rcu_preempt_do_callbacks();
21432 -}
21433 -
21434 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21435 -{
21436 -       struct sched_param sp;
21437 -
21438 -       sp.sched_priority = kthread_prio;
21439 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21440 -}
21441 -
21442 -static void rcu_cpu_kthread_park(unsigned int cpu)
21443 -{
21444 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21445 -}
21446 -
21447 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21448 -{
21449 -       return __this_cpu_read(rcu_cpu_has_work);
21450 -}
21451 -
21452 -/*
21453 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21454 - * RCU softirq used in flavors and configurations of RCU that do not
21455 - * support RCU priority boosting.
21456 - */
21457 -static void rcu_cpu_kthread(unsigned int cpu)
21458 -{
21459 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21460 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21461 -       int spincnt;
21462 -
21463 -       for (spincnt = 0; spincnt < 10; spincnt++) {
21464 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21465 -               local_bh_disable();
21466 -               *statusp = RCU_KTHREAD_RUNNING;
21467 -               this_cpu_inc(rcu_cpu_kthread_loops);
21468 -               local_irq_disable();
21469 -               work = *workp;
21470 -               *workp = 0;
21471 -               local_irq_enable();
21472 -               if (work)
21473 -                       rcu_kthread_do_work();
21474 -               local_bh_enable();
21475 -               if (*workp == 0) {
21476 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21477 -                       *statusp = RCU_KTHREAD_WAITING;
21478 -                       return;
21479 -               }
21480 -       }
21481 -       *statusp = RCU_KTHREAD_YIELDING;
21482 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21483 -       schedule_timeout_interruptible(2);
21484 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21485 -       *statusp = RCU_KTHREAD_WAITING;
21486 -}
21487 -
21488  /*
21489   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21490   * served by the rcu_node in question.  The CPU hotplug lock is still
21491 @@ -1276,26 +1185,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
21492         free_cpumask_var(cm);
21493  }
21494  
21495 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21496 -       .store                  = &rcu_cpu_kthread_task,
21497 -       .thread_should_run      = rcu_cpu_kthread_should_run,
21498 -       .thread_fn              = rcu_cpu_kthread,
21499 -       .thread_comm            = "rcuc/%u",
21500 -       .setup                  = rcu_cpu_kthread_setup,
21501 -       .park                   = rcu_cpu_kthread_park,
21502 -};
21503 -
21504  /*
21505   * Spawn boost kthreads -- called as soon as the scheduler is running.
21506   */
21507  static void __init rcu_spawn_boost_kthreads(void)
21508  {
21509         struct rcu_node *rnp;
21510 -       int cpu;
21511 -
21512 -       for_each_possible_cpu(cpu)
21513 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
21514 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21515         rcu_for_each_leaf_node(rcu_state_p, rnp)
21516                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21517  }
21518 @@ -1318,11 +1213,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21519         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21520  }
21521  
21522 -static void invoke_rcu_callbacks_kthread(void)
21523 -{
21524 -       WARN_ON_ONCE(1);
21525 -}
21526 -
21527  static bool rcu_is_callbacks_kthread(void)
21528  {
21529         return false;
21530 @@ -1346,7 +1236,7 @@ static void rcu_prepare_kthreads(int cpu)
21531  
21532  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21533  
21534 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21535 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21536  
21537  /*
21538   * Check to see if any future RCU-related work will need to be done
21539 @@ -1363,7 +1253,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21540         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
21541                ? 0 : rcu_cpu_has_callbacks(NULL);
21542  }
21543 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21544  
21545 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21546  /*
21547   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21548   * after it.
21549 @@ -1459,6 +1351,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
21550         return cbs_ready;
21551  }
21552  
21553 +#ifndef CONFIG_PREEMPT_RT_FULL
21554 +
21555  /*
21556   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21557   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
21558 @@ -1504,6 +1398,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21559         *nextevt = basemono + dj * TICK_NSEC;
21560         return 0;
21561  }
21562 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21563  
21564  /*
21565   * Prepare a CPU for idle from an RCU perspective.  The first major task
21566 @@ -1822,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
21567   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
21568   * grace period.
21569   */
21570 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21571 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21572  {
21573 -       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
21574 +       swake_up_all(sq);
21575  }
21576  
21577  /*
21578 @@ -1840,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21579         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
21580  }
21581  
21582 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21583 +{
21584 +       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
21585 +}
21586 +
21587  static void rcu_init_one_nocb(struct rcu_node *rnp)
21588  {
21589 -       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
21590 -       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
21591 +       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
21592 +       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
21593  }
21594  
21595  #ifndef CONFIG_RCU_NOCB_CPU_ALL
21596 @@ -1868,7 +1768,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
21597         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
21598                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
21599                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
21600 -               wake_up(&rdp_leader->nocb_wq);
21601 +               swake_up(&rdp_leader->nocb_wq);
21602         }
21603  }
21604  
21605 @@ -2081,7 +1981,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
21606          */
21607         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
21608         for (;;) {
21609 -               wait_event_interruptible(
21610 +               swait_event_interruptible(
21611                         rnp->nocb_gp_wq[c & 0x1],
21612                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
21613                 if (likely(d))
21614 @@ -2109,7 +2009,7 @@ wait_again:
21615         /* Wait for callbacks to appear. */
21616         if (!rcu_nocb_poll) {
21617                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
21618 -               wait_event_interruptible(my_rdp->nocb_wq,
21619 +               swait_event_interruptible(my_rdp->nocb_wq,
21620                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
21621                 /* Memory barrier handled by smp_mb() calls below and repoll. */
21622         } else if (firsttime) {
21623 @@ -2184,7 +2084,7 @@ wait_again:
21624                          * List was empty, wake up the follower.
21625                          * Memory barriers supplied by atomic_long_add().
21626                          */
21627 -                       wake_up(&rdp->nocb_wq);
21628 +                       swake_up(&rdp->nocb_wq);
21629                 }
21630         }
21631  
21632 @@ -2205,7 +2105,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
21633                 if (!rcu_nocb_poll) {
21634                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
21635                                             "FollowerSleep");
21636 -                       wait_event_interruptible(rdp->nocb_wq,
21637 +                       swait_event_interruptible(rdp->nocb_wq,
21638                                                  READ_ONCE(rdp->nocb_follower_head));
21639                 } else if (firsttime) {
21640                         /* Don't drown trace log with "Poll"! */
21641 @@ -2364,7 +2264,7 @@ void __init rcu_init_nohz(void)
21642  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
21643  {
21644         rdp->nocb_tail = &rdp->nocb_head;
21645 -       init_waitqueue_head(&rdp->nocb_wq);
21646 +       init_swait_queue_head(&rdp->nocb_wq);
21647         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
21648  }
21649  
21650 @@ -2514,7 +2414,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
21651         return false;
21652  }
21653  
21654 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21655 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21656  {
21657  }
21658  
21659 @@ -2522,6 +2422,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21660  {
21661  }
21662  
21663 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21664 +{
21665 +       return NULL;
21666 +}
21667 +
21668  static void rcu_init_one_nocb(struct rcu_node *rnp)
21669  {
21670  }
21671 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21672 index 5f748c5a40f0..9a3904603ff6 100644
21673 --- a/kernel/rcu/update.c
21674 +++ b/kernel/rcu/update.c
21675 @@ -276,6 +276,7 @@ int rcu_read_lock_held(void)
21676  }
21677  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21678  
21679 +#ifndef CONFIG_PREEMPT_RT_FULL
21680  /**
21681   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21682   *
21683 @@ -302,6 +303,7 @@ int rcu_read_lock_bh_held(void)
21684         return in_softirq() || irqs_disabled();
21685  }
21686  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21687 +#endif
21688  
21689  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21690  
21691 diff --git a/kernel/relay.c b/kernel/relay.c
21692 index 0b4570cfacae..60684be39f22 100644
21693 --- a/kernel/relay.c
21694 +++ b/kernel/relay.c
21695 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
21696  {
21697         struct rchan_buf *buf = (struct rchan_buf *)data;
21698         wake_up_interruptible(&buf->read_wait);
21699 +       /*
21700 +        * Stupid polling for now:
21701 +        */
21702 +       mod_timer(&buf->timer, jiffies + 1);
21703  }
21704  
21705  /**
21706 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
21707                 init_waitqueue_head(&buf->read_wait);
21708                 kref_init(&buf->kref);
21709                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
21710 +               mod_timer(&buf->timer, jiffies + 1);
21711         } else
21712                 del_timer_sync(&buf->timer);
21713  
21714 @@ -736,15 +741,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
21715                 else
21716                         buf->early_bytes += buf->chan->subbuf_size -
21717                                             buf->padding[old_subbuf];
21718 -               smp_mb();
21719 -               if (waitqueue_active(&buf->read_wait))
21720 -                       /*
21721 -                        * Calling wake_up_interruptible() from here
21722 -                        * will deadlock if we happen to be logging
21723 -                        * from the scheduler (trying to re-grab
21724 -                        * rq->lock), so defer it.
21725 -                        */
21726 -                       mod_timer(&buf->timer, jiffies + 1);
21727         }
21728  
21729         old = buf->data;
21730 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
21731 index 67687973ce80..01b9994b367a 100644
21732 --- a/kernel/sched/Makefile
21733 +++ b/kernel/sched/Makefile
21734 @@ -13,7 +13,7 @@ endif
21735  
21736  obj-y += core.o loadavg.o clock.o cputime.o
21737  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
21738 -obj-y += wait.o completion.o idle.o
21739 +obj-y += wait.o swait.o swork.o completion.o idle.o
21740  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
21741  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
21742  obj-$(CONFIG_SCHEDSTATS) += stats.o
21743 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
21744 index 8d0f35debf35..b62cf6400fe0 100644
21745 --- a/kernel/sched/completion.c
21746 +++ b/kernel/sched/completion.c
21747 @@ -30,10 +30,10 @@ void complete(struct completion *x)
21748  {
21749         unsigned long flags;
21750  
21751 -       spin_lock_irqsave(&x->wait.lock, flags);
21752 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21753         x->done++;
21754 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21755 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21756 +       swake_up_locked(&x->wait);
21757 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21758  }
21759  EXPORT_SYMBOL(complete);
21760  
21761 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
21762  {
21763         unsigned long flags;
21764  
21765 -       spin_lock_irqsave(&x->wait.lock, flags);
21766 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21767         x->done += UINT_MAX/2;
21768 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21769 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21770 +       swake_up_all_locked(&x->wait);
21771 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21772  }
21773  EXPORT_SYMBOL(complete_all);
21774  
21775 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
21776                    long (*action)(long), long timeout, int state)
21777  {
21778         if (!x->done) {
21779 -               DECLARE_WAITQUEUE(wait, current);
21780 +               DECLARE_SWAITQUEUE(wait);
21781  
21782 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
21783 +               __prepare_to_swait(&x->wait, &wait);
21784                 do {
21785                         if (signal_pending_state(state, current)) {
21786                                 timeout = -ERESTARTSYS;
21787                                 break;
21788                         }
21789                         __set_current_state(state);
21790 -                       spin_unlock_irq(&x->wait.lock);
21791 +                       raw_spin_unlock_irq(&x->wait.lock);
21792                         timeout = action(timeout);
21793 -                       spin_lock_irq(&x->wait.lock);
21794 +                       raw_spin_lock_irq(&x->wait.lock);
21795                 } while (!x->done && timeout);
21796 -               __remove_wait_queue(&x->wait, &wait);
21797 +               __finish_swait(&x->wait, &wait);
21798                 if (!x->done)
21799                         return timeout;
21800         }
21801 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
21802  {
21803         might_sleep();
21804  
21805 -       spin_lock_irq(&x->wait.lock);
21806 +       raw_spin_lock_irq(&x->wait.lock);
21807         timeout = do_wait_for_common(x, action, timeout, state);
21808 -       spin_unlock_irq(&x->wait.lock);
21809 +       raw_spin_unlock_irq(&x->wait.lock);
21810         return timeout;
21811  }
21812  
21813 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
21814         if (!READ_ONCE(x->done))
21815                 return 0;
21816  
21817 -       spin_lock_irqsave(&x->wait.lock, flags);
21818 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21819         if (!x->done)
21820                 ret = 0;
21821         else
21822                 x->done--;
21823 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21824 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21825         return ret;
21826  }
21827  EXPORT_SYMBOL(try_wait_for_completion);
21828 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
21829          * after it's acquired the lock.
21830          */
21831         smp_rmb();
21832 -       spin_unlock_wait(&x->wait.lock);
21833 +       raw_spin_unlock_wait(&x->wait.lock);
21834         return true;
21835  }
21836  EXPORT_SYMBOL(completion_done);
21837 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
21838 index 20253dbc8610..e9b8d518202e 100644
21839 --- a/kernel/sched/core.c
21840 +++ b/kernel/sched/core.c
21841 @@ -260,7 +260,11 @@ late_initcall(sched_init_debug);
21842   * Number of tasks to iterate in a single balance run.
21843   * Limited because this is done with IRQs disabled.
21844   */
21845 +#ifndef CONFIG_PREEMPT_RT_FULL
21846  const_debug unsigned int sysctl_sched_nr_migrate = 32;
21847 +#else
21848 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
21849 +#endif
21850  
21851  /*
21852   * period over which we average the RT time consumption, measured
21853 @@ -438,6 +442,7 @@ static void init_rq_hrtick(struct rq *rq)
21854  
21855         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21856         rq->hrtick_timer.function = hrtick;
21857 +       rq->hrtick_timer.irqsafe = 1;
21858  }
21859  #else  /* CONFIG_SCHED_HRTICK */
21860  static inline void hrtick_clear(struct rq *rq)
21861 @@ -542,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21862         head->lastp = &node->next;
21863  }
21864  
21865 -void wake_up_q(struct wake_q_head *head)
21866 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
21867  {
21868         struct wake_q_node *node = head->first;
21869  
21870 @@ -559,7 +564,10 @@ void wake_up_q(struct wake_q_head *head)
21871                  * wake_up_process() implies a wmb() to pair with the queueing
21872                  * in wake_q_add() so as not to miss wakeups.
21873                  */
21874 -               wake_up_process(task);
21875 +               if (sleeper)
21876 +                       wake_up_lock_sleeper(task);
21877 +               else
21878 +                       wake_up_process(task);
21879                 put_task_struct(task);
21880         }
21881  }
21882 @@ -595,6 +603,38 @@ void resched_curr(struct rq *rq)
21883                 trace_sched_wake_idle_without_ipi(cpu);
21884  }
21885  
21886 +#ifdef CONFIG_PREEMPT_LAZY
21887 +void resched_curr_lazy(struct rq *rq)
21888 +{
21889 +       struct task_struct *curr = rq->curr;
21890 +       int cpu;
21891 +
21892 +       if (!sched_feat(PREEMPT_LAZY)) {
21893 +               resched_curr(rq);
21894 +               return;
21895 +       }
21896 +
21897 +       lockdep_assert_held(&rq->lock);
21898 +
21899 +       if (test_tsk_need_resched(curr))
21900 +               return;
21901 +
21902 +       if (test_tsk_need_resched_lazy(curr))
21903 +               return;
21904 +
21905 +       set_tsk_need_resched_lazy(curr);
21906 +
21907 +       cpu = cpu_of(rq);
21908 +       if (cpu == smp_processor_id())
21909 +               return;
21910 +
21911 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
21912 +       smp_mb();
21913 +       if (!tsk_is_polling(curr))
21914 +               smp_send_reschedule(cpu);
21915 +}
21916 +#endif
21917 +
21918  void resched_cpu(int cpu)
21919  {
21920         struct rq *rq = cpu_rq(cpu);
21921 @@ -618,11 +658,14 @@ void resched_cpu(int cpu)
21922   */
21923  int get_nohz_timer_target(void)
21924  {
21925 -       int i, cpu = smp_processor_id();
21926 +       int i, cpu;
21927         struct sched_domain *sd;
21928  
21929 +       preempt_disable_rt();
21930 +       cpu = smp_processor_id();
21931 +
21932         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
21933 -               return cpu;
21934 +               goto preempt_en_rt;
21935  
21936         rcu_read_lock();
21937         for_each_domain(cpu, sd) {
21938 @@ -641,6 +684,8 @@ int get_nohz_timer_target(void)
21939                 cpu = housekeeping_any_cpu();
21940  unlock:
21941         rcu_read_unlock();
21942 +preempt_en_rt:
21943 +       preempt_enable_rt();
21944         return cpu;
21945  }
21946  /*
21947 @@ -1174,6 +1219,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21948  
21949         lockdep_assert_held(&p->pi_lock);
21950  
21951 +       if (__migrate_disabled(p)) {
21952 +               cpumask_copy(&p->cpus_allowed, new_mask);
21953 +               return;
21954 +       }
21955 +
21956         queued = task_on_rq_queued(p);
21957         running = task_current(rq, p);
21958  
21959 @@ -1196,6 +1246,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
21960                 enqueue_task(rq, p, ENQUEUE_RESTORE);
21961  }
21962  
21963 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
21964 +static DEFINE_MUTEX(sched_down_mutex);
21965 +static cpumask_t sched_down_cpumask;
21966 +
21967 +void tell_sched_cpu_down_begin(int cpu)
21968 +{
21969 +       mutex_lock(&sched_down_mutex);
21970 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
21971 +       mutex_unlock(&sched_down_mutex);
21972 +}
21973 +
21974 +void tell_sched_cpu_down_done(int cpu)
21975 +{
21976 +       mutex_lock(&sched_down_mutex);
21977 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
21978 +       mutex_unlock(&sched_down_mutex);
21979 +}
21980 +
21981 +/**
21982 + * migrate_me - try to move the current task off this cpu
21983 + *
21984 + * Used by the pin_current_cpu() code to try to get tasks
21985 + * to move off the current CPU as it is going down.
21986 + * It will only move the task if the task isn't pinned to
21987 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
21988 + * and the task is in a RUNNING state; otherwise moving the
21989 + * task would wake it up (change its state to running) when
21990 + * the task does not expect it.
21991 + *
21992 + * Returns 1 if it succeeded in moving the current task
21993 + *         0 otherwise.
21994 + */
21995 +int migrate_me(void)
21996 +{
21997 +       struct task_struct *p = current;
21998 +       struct migration_arg arg;
21999 +       struct cpumask *cpumask;
22000 +       struct cpumask *mask;
22001 +       unsigned long flags;
22002 +       unsigned int dest_cpu;
22003 +       struct rq *rq;
22004 +
22005 +       /*
22006 +        * We cannot migrate tasks bound to a CPU or tasks that are not
22007 +        * running; moving such a task would wake it up.
22008 +        */
22009 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
22010 +               return 0;
22011 +
22012 +       mutex_lock(&sched_down_mutex);
22013 +       rq = task_rq_lock(p, &flags);
22014 +
22015 +       cpumask = this_cpu_ptr(&sched_cpumasks);
22016 +       mask = &p->cpus_allowed;
22017 +
22018 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
22019 +
22020 +       if (!cpumask_weight(cpumask)) {
22021 +               /* It's only on this CPU? */
22022 +               task_rq_unlock(rq, p, &flags);
22023 +               mutex_unlock(&sched_down_mutex);
22024 +               return 0;
22025 +       }
22026 +
22027 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
22028 +
22029 +       arg.task = p;
22030 +       arg.dest_cpu = dest_cpu;
22031 +
22032 +       task_rq_unlock(rq, p, &flags);
22033 +
22034 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
22035 +       tlb_migrate_finish(p->mm);
22036 +       mutex_unlock(&sched_down_mutex);
22037 +
22038 +       return 1;
22039 +}
22040 +
22041  /*
22042   * Change a given task's CPU affinity. Migrate the thread to a
22043   * proper CPU and schedule it away if the CPU it's executing on
22044 @@ -1235,7 +1363,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22045         do_set_cpus_allowed(p, new_mask);
22046  
22047         /* Can the task run on the task's current CPU? If so, we're done */
22048 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
22049 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22050                 goto out;
22051  
22052         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
22053 @@ -1411,6 +1539,18 @@ out:
22054         return ret;
22055  }
22056  
22057 +static bool check_task_state(struct task_struct *p, long match_state)
22058 +{
22059 +       bool match = false;
22060 +
22061 +       raw_spin_lock_irq(&p->pi_lock);
22062 +       if (p->state == match_state || p->saved_state == match_state)
22063 +               match = true;
22064 +       raw_spin_unlock_irq(&p->pi_lock);
22065 +
22066 +       return match;
22067 +}
22068 +
22069  /*
22070   * wait_task_inactive - wait for a thread to unschedule.
22071   *
22072 @@ -1455,7 +1595,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22073                  * is actually now running somewhere else!
22074                  */
22075                 while (task_running(rq, p)) {
22076 -                       if (match_state && unlikely(p->state != match_state))
22077 +                       if (match_state && !check_task_state(p, match_state))
22078                                 return 0;
22079                         cpu_relax();
22080                 }
22081 @@ -1470,7 +1610,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22082                 running = task_running(rq, p);
22083                 queued = task_on_rq_queued(p);
22084                 ncsw = 0;
22085 -               if (!match_state || p->state == match_state)
22086 +               if (!match_state || p->state == match_state ||
22087 +                   p->saved_state == match_state)
22088                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22089                 task_rq_unlock(rq, p, &flags);
22090  
22091 @@ -1627,7 +1768,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22092  {
22093         lockdep_assert_held(&p->pi_lock);
22094  
22095 -       if (p->nr_cpus_allowed > 1)
22096 +       if (tsk_nr_cpus_allowed(p) > 1)
22097                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22098  
22099         /*
22100 @@ -1707,10 +1848,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22101  {
22102         activate_task(rq, p, en_flags);
22103         p->on_rq = TASK_ON_RQ_QUEUED;
22104 -
22105 -       /* if a worker is waking up, notify workqueue */
22106 -       if (p->flags & PF_WQ_WORKER)
22107 -               wq_worker_waking_up(p, cpu_of(rq));
22108  }
22109  
22110  /*
22111 @@ -1937,8 +2074,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22112          */
22113         smp_mb__before_spinlock();
22114         raw_spin_lock_irqsave(&p->pi_lock, flags);
22115 -       if (!(p->state & state))
22116 +       if (!(p->state & state)) {
22117 +               /*
22118 +                * The task might be running due to a spinlock sleeper
22119 +                * wakeup. Check the saved state and set it to running
22120 +                * if the wakeup condition is true.
22121 +                */
22122 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
22123 +                       if (p->saved_state & state) {
22124 +                               p->saved_state = TASK_RUNNING;
22125 +                               success = 1;
22126 +                       }
22127 +               }
22128                 goto out;
22129 +       }
22130 +
22131 +       /*
22132 +        * If this is a regular wakeup, then we can unconditionally
22133 +        * clear the saved state of a "lock sleeper".
22134 +        */
22135 +       if (!(wake_flags & WF_LOCK_SLEEPER))
22136 +               p->saved_state = TASK_RUNNING;
22137  
22138         trace_sched_waking(p);
22139  
22140 @@ -2030,52 +2186,6 @@ out:
22141  }
22142  
22143  /**
22144 - * try_to_wake_up_local - try to wake up a local task with rq lock held
22145 - * @p: the thread to be awakened
22146 - *
22147 - * Put @p on the run-queue if it's not already there. The caller must
22148 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
22149 - * the current task.
22150 - */
22151 -static void try_to_wake_up_local(struct task_struct *p)
22152 -{
22153 -       struct rq *rq = task_rq(p);
22154 -
22155 -       if (WARN_ON_ONCE(rq != this_rq()) ||
22156 -           WARN_ON_ONCE(p == current))
22157 -               return;
22158 -
22159 -       lockdep_assert_held(&rq->lock);
22160 -
22161 -       if (!raw_spin_trylock(&p->pi_lock)) {
22162 -               /*
22163 -                * This is OK, because current is on_cpu, which avoids it being
22164 -                * picked for load-balance and preemption/IRQs are still
22165 -                * disabled avoiding further scheduler activity on it and we've
22166 -                * not yet picked a replacement task.
22167 -                */
22168 -               lockdep_unpin_lock(&rq->lock);
22169 -               raw_spin_unlock(&rq->lock);
22170 -               raw_spin_lock(&p->pi_lock);
22171 -               raw_spin_lock(&rq->lock);
22172 -               lockdep_pin_lock(&rq->lock);
22173 -       }
22174 -
22175 -       if (!(p->state & TASK_NORMAL))
22176 -               goto out;
22177 -
22178 -       trace_sched_waking(p);
22179 -
22180 -       if (!task_on_rq_queued(p))
22181 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
22182 -
22183 -       ttwu_do_wakeup(rq, p, 0);
22184 -       ttwu_stat(p, smp_processor_id(), 0);
22185 -out:
22186 -       raw_spin_unlock(&p->pi_lock);
22187 -}
22188 -
22189 -/**
22190   * wake_up_process - Wake up a specific process
22191   * @p: The process to be woken up.
22192   *
22193 @@ -2093,6 +2203,18 @@ int wake_up_process(struct task_struct *p)
22194  }
22195  EXPORT_SYMBOL(wake_up_process);
22196  
22197 +/**
22198 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22199 + * @p: The process to be woken up.
22200 + *
22201 + * Same as wake_up_process() above, but passes wake_flags=WF_LOCK_SLEEPER to
22202 + * indicate the nature of the wakeup.
22203 + */
22204 +int wake_up_lock_sleeper(struct task_struct *p)
22205 +{
22206 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
22207 +}
22208 +
22209  int wake_up_state(struct task_struct *p, unsigned int state)
22210  {
22211         return try_to_wake_up(p, state, 0);
22212 @@ -2279,6 +2401,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22213         p->on_cpu = 0;
22214  #endif
22215         init_task_preempt_count(p);
22216 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22217 +       task_thread_info(p)->preempt_lazy_count = 0;
22218 +#endif
22219  #ifdef CONFIG_SMP
22220         plist_node_init(&p->pushable_tasks, MAX_PRIO);
22221         RB_CLEAR_NODE(&p->pushable_dl_tasks);
22222 @@ -2603,8 +2728,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22223         finish_arch_post_lock_switch();
22224  
22225         fire_sched_in_preempt_notifiers(current);
22226 +       /*
22227 +        * We use mmdrop_delayed() here so we don't have to do the
22228 +        * full __mmdrop() when we are the last user.
22229 +        */
22230         if (mm)
22231 -               mmdrop(mm);
22232 +               mmdrop_delayed(mm);
22233         if (unlikely(prev_state == TASK_DEAD)) {
22234                 if (prev->sched_class->task_dead)
22235                         prev->sched_class->task_dead(prev);
22236 @@ -2935,16 +3064,6 @@ u64 scheduler_tick_max_deferment(void)
22237  }
22238  #endif
22239  
22240 -notrace unsigned long get_parent_ip(unsigned long addr)
22241 -{
22242 -       if (in_lock_functions(addr)) {
22243 -               addr = CALLER_ADDR2;
22244 -               if (in_lock_functions(addr))
22245 -                       addr = CALLER_ADDR3;
22246 -       }
22247 -       return addr;
22248 -}
22249 -
22250  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
22251                                 defined(CONFIG_PREEMPT_TRACER))
22252  
22253 @@ -2966,7 +3085,7 @@ void preempt_count_add(int val)
22254                                 PREEMPT_MASK - 10);
22255  #endif
22256         if (preempt_count() == val) {
22257 -               unsigned long ip = get_parent_ip(CALLER_ADDR1);
22258 +               unsigned long ip = get_lock_parent_ip();
22259  #ifdef CONFIG_DEBUG_PREEMPT
22260                 current->preempt_disable_ip = ip;
22261  #endif
22262 @@ -2993,7 +3112,7 @@ void preempt_count_sub(int val)
22263  #endif
22264  
22265         if (preempt_count() == val)
22266 -               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
22267 +               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
22268         __preempt_count_sub(val);
22269  }
22270  EXPORT_SYMBOL(preempt_count_sub);
22271 @@ -3048,6 +3167,77 @@ static inline void schedule_debug(struct task_struct *prev)
22272         schedstat_inc(this_rq(), sched_count);
22273  }
22274  
22275 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
22276 +
22277 +void migrate_disable(void)
22278 +{
22279 +       struct task_struct *p = current;
22280 +
22281 +       if (in_atomic() || irqs_disabled()) {
22282 +#ifdef CONFIG_SCHED_DEBUG
22283 +               p->migrate_disable_atomic++;
22284 +#endif
22285 +               return;
22286 +       }
22287 +
22288 +#ifdef CONFIG_SCHED_DEBUG
22289 +       if (unlikely(p->migrate_disable_atomic)) {
22290 +               tracing_off();
22291 +               WARN_ON_ONCE(1);
22292 +       }
22293 +#endif
22294 +
22295 +       if (p->migrate_disable) {
22296 +               p->migrate_disable++;
22297 +               return;
22298 +       }
22299 +
22300 +       preempt_disable();
22301 +       preempt_lazy_disable();
22302 +       pin_current_cpu();
22303 +       p->migrate_disable = 1;
22304 +       preempt_enable();
22305 +}
22306 +EXPORT_SYMBOL(migrate_disable);
22307 +
22308 +void migrate_enable(void)
22309 +{
22310 +       struct task_struct *p = current;
22311 +
22312 +       if (in_atomic() || irqs_disabled()) {
22313 +#ifdef CONFIG_SCHED_DEBUG
22314 +               p->migrate_disable_atomic--;
22315 +#endif
22316 +               return;
22317 +       }
22318 +
22319 +#ifdef CONFIG_SCHED_DEBUG
22320 +       if (unlikely(p->migrate_disable_atomic)) {
22321 +               tracing_off();
22322 +               WARN_ON_ONCE(1);
22323 +       }
22324 +#endif
22325 +       WARN_ON_ONCE(p->migrate_disable <= 0);
22326 +
22327 +       if (p->migrate_disable > 1) {
22328 +               p->migrate_disable--;
22329 +               return;
22330 +       }
22331 +
22332 +       preempt_disable();
22333 +       /*
22334 +        * Clearing migrate_disable causes tsk_cpus_allowed to
22335 +        * show the task's original CPU affinity.
22336 +        */
22337 +       p->migrate_disable = 0;
22338 +
22339 +       unpin_current_cpu();
22340 +       preempt_enable();
22341 +       preempt_lazy_enable();
22342 +}
22343 +EXPORT_SYMBOL(migrate_enable);
22344 +#endif
22345 +
22346  /*
22347   * Pick up the highest-prio task:
22348   */
22349 @@ -3172,19 +3362,6 @@ static void __sched notrace __schedule(bool preempt)
22350                 } else {
22351                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
22352                         prev->on_rq = 0;
22353 -
22354 -                       /*
22355 -                        * If a worker went to sleep, notify and ask workqueue
22356 -                        * whether it wants to wake up a task to maintain
22357 -                        * concurrency.
22358 -                        */
22359 -                       if (prev->flags & PF_WQ_WORKER) {
22360 -                               struct task_struct *to_wakeup;
22361 -
22362 -                               to_wakeup = wq_worker_sleeping(prev, cpu);
22363 -                               if (to_wakeup)
22364 -                                       try_to_wake_up_local(to_wakeup);
22365 -                       }
22366                 }
22367                 switch_count = &prev->nvcsw;
22368         }
22369 @@ -3194,6 +3371,7 @@ static void __sched notrace __schedule(bool preempt)
22370  
22371         next = pick_next_task(rq, prev);
22372         clear_tsk_need_resched(prev);
22373 +       clear_tsk_need_resched_lazy(prev);
22374         clear_preempt_need_resched();
22375         rq->clock_skip_update = 0;
22376  
22377 @@ -3215,9 +3393,20 @@ static void __sched notrace __schedule(bool preempt)
22378  
22379  static inline void sched_submit_work(struct task_struct *tsk)
22380  {
22381 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
22382 +       if (!tsk->state)
22383                 return;
22384         /*
22385 +        * If a worker went to sleep, notify and ask workqueue whether
22386 +        * it wants to wake up a task to maintain concurrency.
22387 +        */
22388 +       if (tsk->flags & PF_WQ_WORKER)
22389 +               wq_worker_sleeping(tsk);
22390 +
22391 +
22392 +       if (tsk_is_pi_blocked(tsk))
22393 +               return;
22394 +
22395 +       /*
22396          * If we are going to sleep and we have plugged IO queued,
22397          * make sure to submit it to avoid deadlocks.
22398          */
22399 @@ -3225,6 +3414,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
22400                 blk_schedule_flush_plug(tsk);
22401  }
22402  
22403 +static void sched_update_worker(struct task_struct *tsk)
22404 +{
22405 +       if (tsk->flags & PF_WQ_WORKER)
22406 +               wq_worker_running(tsk);
22407 +}
22408 +
22409  asmlinkage __visible void __sched schedule(void)
22410  {
22411         struct task_struct *tsk = current;
22412 @@ -3235,6 +3430,7 @@ asmlinkage __visible void __sched schedule(void)
22413                 __schedule(false);
22414                 sched_preempt_enable_no_resched();
22415         } while (need_resched());
22416 +       sched_update_worker(tsk);
22417  }
22418  EXPORT_SYMBOL(schedule);
22419  
22420 @@ -3283,6 +3479,30 @@ static void __sched notrace preempt_schedule_common(void)
22421         } while (need_resched());
22422  }
22423  
22424 +#ifdef CONFIG_PREEMPT_LAZY
22425 +/*
22426 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
22427 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
22428 + * the preempt_lazy_count counter is > 0.
22429 + */
22430 +static __always_inline int preemptible_lazy(void)
22431 +{
22432 +       if (test_thread_flag(TIF_NEED_RESCHED))
22433 +               return 1;
22434 +       if (current_thread_info()->preempt_lazy_count)
22435 +               return 0;
22436 +       return 1;
22437 +}
22438 +
22439 +#else
22440 +
22441 +static inline int preemptible_lazy(void)
22442 +{
22443 +       return 1;
22444 +}
22445 +
22446 +#endif
22447 +
22448  #ifdef CONFIG_PREEMPT
22449  /*
22450   * this is the entry point to schedule() from in-kernel preemption
22451 @@ -3297,6 +3517,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
22452          */
22453         if (likely(!preemptible()))
22454                 return;
22455 +       if (!preemptible_lazy())
22456 +               return;
22457  
22458         preempt_schedule_common();
22459  }
22460 @@ -3323,6 +3545,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22461  
22462         if (likely(!preemptible()))
22463                 return;
22464 +       if (!preemptible_lazy())
22465 +               return;
22466  
22467         do {
22468                 preempt_disable_notrace();
22469 @@ -3332,7 +3556,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22470                  * an infinite recursion.
22471                  */
22472                 prev_ctx = exception_enter();
22473 +               /*
22474 +                * The add/subtract must not be traced by the function
22475 +                * tracer. But we still want to account for the
22476 +                * preempt off latency tracer. Since the _notrace versions
22477 +                * of add/subtract skip the accounting for latency tracer
22478 +                * we must force it manually.
22479 +                */
22480 +               start_critical_timings();
22481                 __schedule(true);
22482 +               stop_critical_timings();
22483                 exception_exit(prev_ctx);
22484  
22485                 preempt_enable_no_resched_notrace();
22486 @@ -4676,6 +4909,7 @@ int __cond_resched_lock(spinlock_t *lock)
22487  }
22488  EXPORT_SYMBOL(__cond_resched_lock);
22489  
22490 +#ifndef CONFIG_PREEMPT_RT_FULL
22491  int __sched __cond_resched_softirq(void)
22492  {
22493         BUG_ON(!in_softirq());
22494 @@ -4689,6 +4923,7 @@ int __sched __cond_resched_softirq(void)
22495         return 0;
22496  }
22497  EXPORT_SYMBOL(__cond_resched_softirq);
22498 +#endif
22499  
22500  /**
22501   * yield - yield the current processor to other threads.
22502 @@ -5055,7 +5290,9 @@ void init_idle(struct task_struct *idle, int cpu)
22503  
22504         /* Set the preempt count _outside_ the spinlocks! */
22505         init_idle_preempt_count(idle, cpu);
22506 -
22507 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22508 +       task_thread_info(idle)->preempt_lazy_count = 0;
22509 +#endif
22510         /*
22511          * The idle tasks have their own, simple scheduling class:
22512          */
22513 @@ -5196,6 +5433,8 @@ void sched_setnuma(struct task_struct *p, int nid)
22514  #endif /* CONFIG_NUMA_BALANCING */
22515  
22516  #ifdef CONFIG_HOTPLUG_CPU
22517 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22518 +
22519  /*
22520   * Ensures that the idle task is using init_mm right before its cpu goes
22521   * offline.
22522 @@ -5210,7 +5449,11 @@ void idle_task_exit(void)
22523                 switch_mm(mm, &init_mm, current);
22524                 finish_arch_post_lock_switch();
22525         }
22526 -       mmdrop(mm);
22527 +       /*
22528 +        * Defer the cleanup to a live CPU. On RT we can neither
22529 +        * call mmdrop() nor mmdrop_delayed() from here.
22530 +        */
22531 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
22532  }
22533  
22534  /*
22535 @@ -5583,6 +5826,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
22536  
22537         case CPU_DEAD:
22538                 calc_load_migrate(rq);
22539 +               if (per_cpu(idle_last_mm, cpu)) {
22540 +                       mmdrop(per_cpu(idle_last_mm, cpu));
22541 +                       per_cpu(idle_last_mm, cpu) = NULL;
22542 +               }
22543                 break;
22544  #endif
22545         }
22546 @@ -7566,7 +7813,7 @@ void __init sched_init(void)
22547  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22548  static inline int preempt_count_equals(int preempt_offset)
22549  {
22550 -       int nested = preempt_count() + rcu_preempt_depth();
22551 +       int nested = preempt_count() + sched_rcu_preempt_depth();
22552  
22553         return (nested == preempt_offset);
22554  }
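
The migrate_disable()/migrate_enable() pair added to core.c above is the RT replacement for relying on preempt_disable() to keep a task on one CPU: the task stays pinned but remains preemptible and may block on sleeping spinlocks. A minimal usage sketch follows; demo_buf, demo_bufs and demo_touch_cpu_buffer() are made-up names and the include list is approximate, it only illustrates the calling pattern.

    #include <linux/percpu.h>
    #include <linux/preempt.h>

    struct demo_buf {
            unsigned long hits;
    };

    static DEFINE_PER_CPU(struct demo_buf, demo_bufs);

    static void demo_touch_cpu_buffer(void)
    {
            struct demo_buf *buf;

            migrate_disable();              /* stay on this CPU; nestable, still preemptible */
            buf = this_cpu_ptr(&demo_bufs); /* pointer stays valid: no migration can occur */
            buf->hits++;                    /* assumes this path is the only writer */
            migrate_enable();
    }
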
22555 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
22556 index 5a75b08cfd85..5be58820465c 100644
22557 --- a/kernel/sched/cpudeadline.c
22558 +++ b/kernel/sched/cpudeadline.c
22559 @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
22560         const struct sched_dl_entity *dl_se = &p->dl;
22561  
22562         if (later_mask &&
22563 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22564 +           cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
22565                 best_cpu = cpumask_any(later_mask);
22566                 goto out;
22567 -       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
22568 +       } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
22569                         dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22570                 best_cpu = cpudl_maximum(cp);
22571                 if (later_mask)
22572 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
22573 index 981fcd7dc394..11e9705bf937 100644
22574 --- a/kernel/sched/cpupri.c
22575 +++ b/kernel/sched/cpupri.c
22576 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
22577                 if (skip)
22578                         continue;
22579  
22580 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22581 +               if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
22582                         continue;
22583  
22584                 if (lowest_mask) {
22585 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22586 +                       cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
22587  
22588                         /*
22589                          * We have to ensure that we have at least one bit
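
The cpudeadline.c and cpupri.c hunks above (and the deadline.c/rt.c ones below) replace direct uses of p->cpus_allowed and p->nr_cpus_allowed with the tsk_cpus_allowed()/tsk_nr_cpus_allowed() accessors. The point is that a task inside a migrate_disable() section must be treated as runnable only on its current CPU. Roughly, the accessors are expected to behave like the sketch below; the real definitions live in linux/sched.h elsewhere in this patch and may differ in detail.

    static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
    {
            if (p->migrate_disable)                 /* pinned by migrate_disable() */
                    return cpumask_of(task_cpu(p));
            return &p->cpus_allowed;
    }

    static inline int tsk_nr_cpus_allowed(struct task_struct *p)
    {
            return p->migrate_disable ? 1 : p->nr_cpus_allowed;
    }
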
22590 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
22591 index a1aecbedf5b1..558b98af241d 100644
22592 --- a/kernel/sched/cputime.c
22593 +++ b/kernel/sched/cputime.c
22594 @@ -685,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
22595  {
22596         unsigned long long delta = vtime_delta(tsk);
22597  
22598 -       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
22599 +       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
22600         tsk->vtime_snap += delta;
22601  
22602         /* CHECKME: always safe to convert nsecs to cputime? */
22603 @@ -701,37 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
22604  
22605  void vtime_account_system(struct task_struct *tsk)
22606  {
22607 -       write_seqlock(&tsk->vtime_seqlock);
22608 +       write_seqcount_begin(&tsk->vtime_seqcount);
22609         __vtime_account_system(tsk);
22610 -       write_sequnlock(&tsk->vtime_seqlock);
22611 +       write_seqcount_end(&tsk->vtime_seqcount);
22612  }
22613  
22614  void vtime_gen_account_irq_exit(struct task_struct *tsk)
22615  {
22616 -       write_seqlock(&tsk->vtime_seqlock);
22617 +       write_seqcount_begin(&tsk->vtime_seqcount);
22618         __vtime_account_system(tsk);
22619         if (context_tracking_in_user())
22620                 tsk->vtime_snap_whence = VTIME_USER;
22621 -       write_sequnlock(&tsk->vtime_seqlock);
22622 +       write_seqcount_end(&tsk->vtime_seqcount);
22623  }
22624  
22625  void vtime_account_user(struct task_struct *tsk)
22626  {
22627         cputime_t delta_cpu;
22628  
22629 -       write_seqlock(&tsk->vtime_seqlock);
22630 +       write_seqcount_begin(&tsk->vtime_seqcount);
22631         delta_cpu = get_vtime_delta(tsk);
22632         tsk->vtime_snap_whence = VTIME_SYS;
22633         account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
22634 -       write_sequnlock(&tsk->vtime_seqlock);
22635 +       write_seqcount_end(&tsk->vtime_seqcount);
22636  }
22637  
22638  void vtime_user_enter(struct task_struct *tsk)
22639  {
22640 -       write_seqlock(&tsk->vtime_seqlock);
22641 +       write_seqcount_begin(&tsk->vtime_seqcount);
22642         __vtime_account_system(tsk);
22643         tsk->vtime_snap_whence = VTIME_USER;
22644 -       write_sequnlock(&tsk->vtime_seqlock);
22645 +       write_seqcount_end(&tsk->vtime_seqcount);
22646  }
22647  
22648  void vtime_guest_enter(struct task_struct *tsk)
22649 @@ -743,19 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
22650          * synchronization against the reader (task_gtime())
22651          * that can thus safely catch up with a tickless delta.
22652          */
22653 -       write_seqlock(&tsk->vtime_seqlock);
22654 +       write_seqcount_begin(&tsk->vtime_seqcount);
22655         __vtime_account_system(tsk);
22656         current->flags |= PF_VCPU;
22657 -       write_sequnlock(&tsk->vtime_seqlock);
22658 +       write_seqcount_end(&tsk->vtime_seqcount);
22659  }
22660  EXPORT_SYMBOL_GPL(vtime_guest_enter);
22661  
22662  void vtime_guest_exit(struct task_struct *tsk)
22663  {
22664 -       write_seqlock(&tsk->vtime_seqlock);
22665 +       write_seqcount_begin(&tsk->vtime_seqcount);
22666         __vtime_account_system(tsk);
22667         current->flags &= ~PF_VCPU;
22668 -       write_sequnlock(&tsk->vtime_seqlock);
22669 +       write_seqcount_end(&tsk->vtime_seqcount);
22670  }
22671  EXPORT_SYMBOL_GPL(vtime_guest_exit);
22672  
22673 @@ -768,24 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
22674  
22675  void arch_vtime_task_switch(struct task_struct *prev)
22676  {
22677 -       write_seqlock(&prev->vtime_seqlock);
22678 -       prev->vtime_snap_whence = VTIME_SLEEPING;
22679 -       write_sequnlock(&prev->vtime_seqlock);
22680 +       write_seqcount_begin(&prev->vtime_seqcount);
22681 +       prev->vtime_snap_whence = VTIME_INACTIVE;
22682 +       write_seqcount_end(&prev->vtime_seqcount);
22683  
22684 -       write_seqlock(&current->vtime_seqlock);
22685 +       write_seqcount_begin(&current->vtime_seqcount);
22686         current->vtime_snap_whence = VTIME_SYS;
22687         current->vtime_snap = sched_clock_cpu(smp_processor_id());
22688 -       write_sequnlock(&current->vtime_seqlock);
22689 +       write_seqcount_end(&current->vtime_seqcount);
22690  }
22691  
22692  void vtime_init_idle(struct task_struct *t, int cpu)
22693  {
22694         unsigned long flags;
22695  
22696 -       write_seqlock_irqsave(&t->vtime_seqlock, flags);
22697 +       local_irq_save(flags);
22698 +       write_seqcount_begin(&t->vtime_seqcount);
22699         t->vtime_snap_whence = VTIME_SYS;
22700         t->vtime_snap = sched_clock_cpu(cpu);
22701 -       write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
22702 +       write_seqcount_end(&t->vtime_seqcount);
22703 +       local_irq_restore(flags);
22704  }
22705  
22706  cputime_t task_gtime(struct task_struct *t)
22707 @@ -797,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
22708                 return t->gtime;
22709  
22710         do {
22711 -               seq = read_seqbegin(&t->vtime_seqlock);
22712 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22713  
22714                 gtime = t->gtime;
22715                 if (t->flags & PF_VCPU)
22716                         gtime += vtime_delta(t);
22717  
22718 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22719 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22720  
22721         return gtime;
22722  }
22723 @@ -826,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
22724                 *udelta = 0;
22725                 *sdelta = 0;
22726  
22727 -               seq = read_seqbegin(&t->vtime_seqlock);
22728 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22729  
22730                 if (u_dst)
22731                         *u_dst = *u_src;
22732 @@ -834,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
22733                         *s_dst = *s_src;
22734  
22735                 /* Task is sleeping, nothing to add */
22736 -               if (t->vtime_snap_whence == VTIME_SLEEPING ||
22737 +               if (t->vtime_snap_whence == VTIME_INACTIVE ||
22738                     is_idle_task(t))
22739                         continue;
22740  
22741 @@ -850,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
22742                         if (t->vtime_snap_whence == VTIME_SYS)
22743                                 *sdelta = delta;
22744                 }
22745 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22746 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22747  }
22748  
22749  
22750 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
22751 index 8b0a15e285f9..7a72e69fcf65 100644
22752 --- a/kernel/sched/deadline.c
22753 +++ b/kernel/sched/deadline.c
22754 @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22755  {
22756         struct task_struct *p = dl_task_of(dl_se);
22757  
22758 -       if (p->nr_cpus_allowed > 1)
22759 +       if (tsk_nr_cpus_allowed(p) > 1)
22760                 dl_rq->dl_nr_migratory++;
22761  
22762         update_dl_migration(dl_rq);
22763 @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22764  {
22765         struct task_struct *p = dl_task_of(dl_se);
22766  
22767 -       if (p->nr_cpus_allowed > 1)
22768 +       if (tsk_nr_cpus_allowed(p) > 1)
22769                 dl_rq->dl_nr_migratory--;
22770  
22771         update_dl_migration(dl_rq);
22772 @@ -697,6 +697,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
22773  
22774         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22775         timer->function = dl_task_timer;
22776 +       timer->irqsafe = 1;
22777  }
22778  
22779  static
22780 @@ -989,7 +990,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
22781  
22782         enqueue_dl_entity(&p->dl, pi_se, flags);
22783  
22784 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
22785 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
22786                 enqueue_pushable_dl_task(rq, p);
22787  }
22788  
22789 @@ -1067,9 +1068,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
22790          * try to make it stay here, it might be important.
22791          */
22792         if (unlikely(dl_task(curr)) &&
22793 -           (curr->nr_cpus_allowed < 2 ||
22794 +           (tsk_nr_cpus_allowed(curr) < 2 ||
22795              !dl_entity_preempt(&p->dl, &curr->dl)) &&
22796 -           (p->nr_cpus_allowed > 1)) {
22797 +           (tsk_nr_cpus_allowed(p) > 1)) {
22798                 int target = find_later_rq(p);
22799  
22800                 if (target != -1 &&
22801 @@ -1090,7 +1091,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22802          * Current can't be migrated, useless to reschedule,
22803          * let's hope p can move out.
22804          */
22805 -       if (rq->curr->nr_cpus_allowed == 1 ||
22806 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
22807             cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
22808                 return;
22809  
22810 @@ -1098,7 +1099,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22811          * p is migratable, so let's not schedule it and
22812          * see if it is pushed or pulled somewhere else.
22813          */
22814 -       if (p->nr_cpus_allowed != 1 &&
22815 +       if (tsk_nr_cpus_allowed(p) != 1 &&
22816             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
22817                 return;
22818  
22819 @@ -1212,7 +1213,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
22820  {
22821         update_curr_dl(rq);
22822  
22823 -       if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
22824 +       if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
22825                 enqueue_pushable_dl_task(rq, p);
22826  }
22827  
22828 @@ -1335,7 +1336,7 @@ static int find_later_rq(struct task_struct *task)
22829         if (unlikely(!later_mask))
22830                 return -1;
22831  
22832 -       if (task->nr_cpus_allowed == 1)
22833 +       if (tsk_nr_cpus_allowed(task) == 1)
22834                 return -1;
22835  
22836         /*
22837 @@ -1441,7 +1442,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
22838                 if (double_lock_balance(rq, later_rq)) {
22839                         if (unlikely(task_rq(task) != rq ||
22840                                      !cpumask_test_cpu(later_rq->cpu,
22841 -                                                      &task->cpus_allowed) ||
22842 +                                                      tsk_cpus_allowed(task)) ||
22843                                      task_running(rq, task) ||
22844                                      !task_on_rq_queued(task))) {
22845                                 double_unlock_balance(rq, later_rq);
22846 @@ -1480,7 +1481,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
22847  
22848         BUG_ON(rq->cpu != task_cpu(p));
22849         BUG_ON(task_current(rq, p));
22850 -       BUG_ON(p->nr_cpus_allowed <= 1);
22851 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
22852  
22853         BUG_ON(!task_on_rq_queued(p));
22854         BUG_ON(!dl_task(p));
22855 @@ -1519,7 +1520,7 @@ retry:
22856          */
22857         if (dl_task(rq->curr) &&
22858             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
22859 -           rq->curr->nr_cpus_allowed > 1) {
22860 +           tsk_nr_cpus_allowed(rq->curr) > 1) {
22861                 resched_curr(rq);
22862                 return 0;
22863         }
22864 @@ -1666,9 +1667,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
22865  {
22866         if (!task_running(rq, p) &&
22867             !test_tsk_need_resched(rq->curr) &&
22868 -           p->nr_cpus_allowed > 1 &&
22869 +           tsk_nr_cpus_allowed(p) > 1 &&
22870             dl_task(rq->curr) &&
22871 -           (rq->curr->nr_cpus_allowed < 2 ||
22872 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
22873              !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
22874                 push_dl_tasks(rq);
22875         }
22876 @@ -1769,7 +1770,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
22877  {
22878         if (task_on_rq_queued(p) && rq->curr != p) {
22879  #ifdef CONFIG_SMP
22880 -               if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
22881 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
22882                         queue_push_tasks(rq);
22883  #else
22884                 if (dl_task(rq->curr))
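
deadline.c above (and rt.c below) set the RT-specific irqsafe flag on scheduler hrtimers. On PREEMPT_RT_FULL, hrtimer expiry is normally pushed out of hard interrupt context; timers whose handlers are safe to run with interrupts off opt back in via this flag, which this patch series adds to struct hrtimer. The pattern, using a hypothetical my_timer/my_timer_fn:

    hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    my_timer.irqsafe = 1;           /* RT: expire directly from the hard interrupt */
    my_timer.function = my_timer_fn;
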
22885 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
22886 index 641511771ae6..a2d69b883623 100644
22887 --- a/kernel/sched/debug.c
22888 +++ b/kernel/sched/debug.c
22889 @@ -251,6 +251,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
22890         P(rt_throttled);
22891         PN(rt_time);
22892         PN(rt_runtime);
22893 +#ifdef CONFIG_SMP
22894 +       P(rt_nr_migratory);
22895 +#endif
22896  
22897  #undef PN
22898  #undef P
22899 @@ -635,6 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
22900  #endif
22901         P(policy);
22902         P(prio);
22903 +#ifdef CONFIG_PREEMPT_RT_FULL
22904 +       P(migrate_disable);
22905 +#endif
22906 +       P(nr_cpus_allowed);
22907  #undef PN
22908  #undef __PN
22909  #undef P
22910 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
22911 index 8f258f437ac2..cf0a1adba6c6 100644
22912 --- a/kernel/sched/fair.c
22913 +++ b/kernel/sched/fair.c
22914 @@ -3166,7 +3166,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
22915         ideal_runtime = sched_slice(cfs_rq, curr);
22916         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22917         if (delta_exec > ideal_runtime) {
22918 -               resched_curr(rq_of(cfs_rq));
22919 +               resched_curr_lazy(rq_of(cfs_rq));
22920                 /*
22921                  * The current task ran long enough, ensure it doesn't get
22922                  * re-elected due to buddy favours.
22923 @@ -3190,7 +3190,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
22924                 return;
22925  
22926         if (delta > ideal_runtime)
22927 -               resched_curr(rq_of(cfs_rq));
22928 +               resched_curr_lazy(rq_of(cfs_rq));
22929  }
22930  
22931  static void
22932 @@ -3330,7 +3330,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
22933          * validating it and just reschedule.
22934          */
22935         if (queued) {
22936 -               resched_curr(rq_of(cfs_rq));
22937 +               resched_curr_lazy(rq_of(cfs_rq));
22938                 return;
22939         }
22940         /*
22941 @@ -3512,7 +3512,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
22942          * hierarchy can be throttled
22943          */
22944         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
22945 -               resched_curr(rq_of(cfs_rq));
22946 +               resched_curr_lazy(rq_of(cfs_rq));
22947  }
22948  
22949  static __always_inline
22950 @@ -4124,7 +4124,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
22951  
22952                 if (delta < 0) {
22953                         if (rq->curr == p)
22954 -                               resched_curr(rq);
22955 +                               resched_curr_lazy(rq);
22956                         return;
22957                 }
22958                 hrtick_start(rq, delta);
22959 @@ -5213,7 +5213,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
22960         return;
22961  
22962  preempt:
22963 -       resched_curr(rq);
22964 +       resched_curr_lazy(rq);
22965         /*
22966          * Only set the backward buddy when the current task is still
22967          * on the rq. This can happen when a wakeup gets interleaved
22968 @@ -7964,7 +7964,7 @@ static void task_fork_fair(struct task_struct *p)
22969                  * 'current' within the tree based on its new key value.
22970                  */
22971                 swap(curr->vruntime, se->vruntime);
22972 -               resched_curr(rq);
22973 +               resched_curr_lazy(rq);
22974         }
22975  
22976         se->vruntime -= cfs_rq->min_vruntime;
22977 @@ -7989,7 +7989,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
22978          */
22979         if (rq->curr == p) {
22980                 if (p->prio > oldprio)
22981 -                       resched_curr(rq);
22982 +                       resched_curr_lazy(rq);
22983         } else
22984                 check_preempt_curr(rq, p, 0);
22985  }
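
The fair.c conversions above route all fair-class preemption requests through resched_curr_lazy(). With CONFIG_PREEMPT_LAZY this only sets the new TIF_NEED_RESCHED_LAZY flag, so a SCHED_OTHER task inside a preempt_lazy_disable() section (for instance while holding a sleeping spinlock) is not forced off the CPU, while RT tasks keep using resched_curr() and the hard TIF_NEED_RESCHED. A rough sketch of what the helper boils down to; the full version added to core.c by this series also handles the remote-CPU/polling case:

    void resched_curr_lazy(struct rq *rq)
    {
            struct task_struct *curr = rq->curr;

            if (!sched_feat(PREEMPT_LAZY)) {        /* feature off: behave like resched_curr() */
                    resched_curr(rq);
                    return;
            }

            if (test_tsk_need_resched(curr) || test_tsk_need_resched_lazy(curr))
                    return;

            set_tsk_need_resched_lazy(curr);        /* honoured once preempt_lazy_count drops to 0 */
    }
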
22986 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
22987 index 69631fa46c2f..6d28fcd08872 100644
22988 --- a/kernel/sched/features.h
22989 +++ b/kernel/sched/features.h
22990 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
22991   */
22992  SCHED_FEAT(NONTASK_CAPACITY, true)
22993  
22994 +#ifdef CONFIG_PREEMPT_RT_FULL
22995 +SCHED_FEAT(TTWU_QUEUE, false)
22996 +# ifdef CONFIG_PREEMPT_LAZY
22997 +SCHED_FEAT(PREEMPT_LAZY, true)
22998 +# endif
22999 +#else
23000 +
23001  /*
23002   * Queue remote wakeups on the target CPU and process them
23003   * using the scheduler IPI. Reduces rq->lock contention/bounces.
23004   */
23005  SCHED_FEAT(TTWU_QUEUE, true)
23006 +#endif
23007  
23008  #ifdef HAVE_RT_PUSH_IPI
23009  /*
23010 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23011 index 8ec86abe0ea1..8cf360d309ec 100644
23012 --- a/kernel/sched/rt.c
23013 +++ b/kernel/sched/rt.c
23014 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23015  
23016         hrtimer_init(&rt_b->rt_period_timer,
23017                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23018 +       rt_b->rt_period_timer.irqsafe = 1;
23019         rt_b->rt_period_timer.function = sched_rt_period_timer;
23020  }
23021  
23022 @@ -93,6 +94,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
23023         rt_rq->push_cpu = nr_cpu_ids;
23024         raw_spin_lock_init(&rt_rq->push_lock);
23025         init_irq_work(&rt_rq->push_work, push_irq_work_func);
23026 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
23027  #endif
23028  #endif /* CONFIG_SMP */
23029         /* We start is dequeued state, because no RT tasks are queued */
23030 @@ -326,7 +328,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23031         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23032  
23033         rt_rq->rt_nr_total++;
23034 -       if (p->nr_cpus_allowed > 1)
23035 +       if (tsk_nr_cpus_allowed(p) > 1)
23036                 rt_rq->rt_nr_migratory++;
23037  
23038         update_rt_migration(rt_rq);
23039 @@ -343,7 +345,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23040         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23041  
23042         rt_rq->rt_nr_total--;
23043 -       if (p->nr_cpus_allowed > 1)
23044 +       if (tsk_nr_cpus_allowed(p) > 1)
23045                 rt_rq->rt_nr_migratory--;
23046  
23047         update_rt_migration(rt_rq);
23048 @@ -1262,7 +1264,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
23049  
23050         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
23051  
23052 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23053 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23054                 enqueue_pushable_task(rq, p);
23055  }
23056  
23057 @@ -1351,7 +1353,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
23058          * will have to sort it out.
23059          */
23060         if (curr && unlikely(rt_task(curr)) &&
23061 -           (curr->nr_cpus_allowed < 2 ||
23062 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23063              curr->prio <= p->prio)) {
23064                 int target = find_lowest_rq(p);
23065  
23066 @@ -1375,7 +1377,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23067          * Current can't be migrated, useless to reschedule,
23068          * let's hope p can move out.
23069          */
23070 -       if (rq->curr->nr_cpus_allowed == 1 ||
23071 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23072             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
23073                 return;
23074  
23075 @@ -1383,7 +1385,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23076          * p is migratable, so let's not schedule it and
23077          * see if it is pushed or pulled somewhere else.
23078          */
23079 -       if (p->nr_cpus_allowed != 1
23080 +       if (tsk_nr_cpus_allowed(p) != 1
23081             && cpupri_find(&rq->rd->cpupri, p, NULL))
23082                 return;
23083  
23084 @@ -1517,7 +1519,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23085          * The previous task needs to be made eligible for pushing
23086          * if it is still active
23087          */
23088 -       if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
23089 +       if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
23090                 enqueue_pushable_task(rq, p);
23091  }
23092  
23093 @@ -1567,7 +1569,7 @@ static int find_lowest_rq(struct task_struct *task)
23094         if (unlikely(!lowest_mask))
23095                 return -1;
23096  
23097 -       if (task->nr_cpus_allowed == 1)
23098 +       if (tsk_nr_cpus_allowed(task) == 1)
23099                 return -1; /* No other targets possible */
23100  
23101         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
23102 @@ -1699,7 +1701,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
23103  
23104         BUG_ON(rq->cpu != task_cpu(p));
23105         BUG_ON(task_current(rq, p));
23106 -       BUG_ON(p->nr_cpus_allowed <= 1);
23107 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23108  
23109         BUG_ON(!task_on_rq_queued(p));
23110         BUG_ON(!rt_task(p));
23111 @@ -2059,9 +2061,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
23112  {
23113         if (!task_running(rq, p) &&
23114             !test_tsk_need_resched(rq->curr) &&
23115 -           p->nr_cpus_allowed > 1 &&
23116 +           tsk_nr_cpus_allowed(p) > 1 &&
23117             (dl_task(rq->curr) || rt_task(rq->curr)) &&
23118 -           (rq->curr->nr_cpus_allowed < 2 ||
23119 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
23120              rq->curr->prio <= p->prio))
23121                 push_rt_tasks(rq);
23122  }
23123 @@ -2134,7 +2136,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
23124          */
23125         if (task_on_rq_queued(p) && rq->curr != p) {
23126  #ifdef CONFIG_SMP
23127 -               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
23128 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
23129                         queue_push_tasks(rq);
23130  #else
23131                 if (p->prio < rq->curr->prio)
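
Besides the accessor conversions, rt.c above flags the RT push work with IRQ_WORK_HARD_IRQ. On PREEMPT_RT_FULL most irq_work items are deferred to a thread; work that genuinely must run from the hard interrupt opts out with this flag, following the same pattern as the hunk above (demo_work and demo_irq_work_fn are made-up names):

    #include <linux/irq_work.h>

    static void demo_irq_work_fn(struct irq_work *work)
    {
            /* keep this short: it runs with interrupts disabled */
    }

    static struct irq_work demo_work;

    static void demo_setup(void)
    {
            init_irq_work(&demo_work, demo_irq_work_fn);
            demo_work.flags |= IRQ_WORK_HARD_IRQ;   /* RT: do not defer to the irq_work thread */
    }
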
23132 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23133 index 0517abd7dd73..a8a9b156ea15 100644
23134 --- a/kernel/sched/sched.h
23135 +++ b/kernel/sched/sched.h
23136 @@ -1100,6 +1100,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23137  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
23138  #define WF_FORK                0x02            /* child wakeup after fork */
23139  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
23140 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
23141  
23142  /*
23143   * To aid in avoiding the subversion of "niceness" due to uneven distribution
23144 @@ -1299,6 +1300,15 @@ extern void init_sched_fair_class(void);
23145  extern void resched_curr(struct rq *rq);
23146  extern void resched_cpu(int cpu);
23147  
23148 +#ifdef CONFIG_PREEMPT_LAZY
23149 +extern void resched_curr_lazy(struct rq *rq);
23150 +#else
23151 +static inline void resched_curr_lazy(struct rq *rq)
23152 +{
23153 +       resched_curr(rq);
23154 +}
23155 +#endif
23156 +
23157  extern struct rt_bandwidth def_rt_bandwidth;
23158  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23159  
23160 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23161 new file mode 100644
23162 index 000000000000..205fe36868f9
23163 --- /dev/null
23164 +++ b/kernel/sched/swait.c
23165 @@ -0,0 +1,143 @@
23166 +#include <linux/sched.h>
23167 +#include <linux/swait.h>
23168 +#include <linux/suspend.h>
23169 +
23170 +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23171 +                            struct lock_class_key *key)
23172 +{
23173 +       raw_spin_lock_init(&q->lock);
23174 +       lockdep_set_class_and_name(&q->lock, key, name);
23175 +       INIT_LIST_HEAD(&q->task_list);
23176 +}
23177 +EXPORT_SYMBOL(__init_swait_queue_head);
23178 +
23179 +/*
23180 + * The thing about the wake_up_state() return value; I think we can ignore it.
23181 + *
23182 + * If for some reason it would return 0, that means the previously waiting
23183 + * task is already running, so it will observe condition true (or has already).
23184 + */
23185 +void swake_up_locked(struct swait_queue_head *q)
23186 +{
23187 +       struct swait_queue *curr;
23188 +
23189 +       if (list_empty(&q->task_list))
23190 +               return;
23191 +
23192 +       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
23193 +       wake_up_process(curr->task);
23194 +       list_del_init(&curr->task_list);
23195 +}
23196 +EXPORT_SYMBOL(swake_up_locked);
23197 +
23198 +void swake_up_all_locked(struct swait_queue_head *q)
23199 +{
23200 +       struct swait_queue *curr;
23201 +       int wakes = 0;
23202 +
23203 +       while (!list_empty(&q->task_list)) {
23204 +
23205 +               curr = list_first_entry(&q->task_list, typeof(*curr),
23206 +                                       task_list);
23207 +               wake_up_process(curr->task);
23208 +               list_del_init(&curr->task_list);
23209 +               wakes++;
23210 +       }
23211 +       if (pm_in_action)
23212 +               return;
23213 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23214 +}
23215 +EXPORT_SYMBOL(swake_up_all_locked);
23216 +
23217 +void swake_up(struct swait_queue_head *q)
23218 +{
23219 +       unsigned long flags;
23220 +
23221 +       if (!swait_active(q))
23222 +               return;
23223 +
23224 +       raw_spin_lock_irqsave(&q->lock, flags);
23225 +       swake_up_locked(q);
23226 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23227 +}
23228 +EXPORT_SYMBOL(swake_up);
23229 +
23230 +/*
23231 + * Does not allow usage with IRQs disabled, since we must be able to
23232 + * release IRQs to guarantee a bounded hold time.
23233 + */
23234 +void swake_up_all(struct swait_queue_head *q)
23235 +{
23236 +       struct swait_queue *curr;
23237 +       LIST_HEAD(tmp);
23238 +
23239 +       if (!swait_active(q))
23240 +               return;
23241 +
23242 +       raw_spin_lock_irq(&q->lock);
23243 +       list_splice_init(&q->task_list, &tmp);
23244 +       while (!list_empty(&tmp)) {
23245 +               curr = list_first_entry(&tmp, typeof(*curr), task_list);
23246 +
23247 +               wake_up_state(curr->task, TASK_NORMAL);
23248 +               list_del_init(&curr->task_list);
23249 +
23250 +               if (list_empty(&tmp))
23251 +                       break;
23252 +
23253 +               raw_spin_unlock_irq(&q->lock);
23254 +               raw_spin_lock_irq(&q->lock);
23255 +       }
23256 +       raw_spin_unlock_irq(&q->lock);
23257 +}
23258 +EXPORT_SYMBOL(swake_up_all);
23259 +
23260 +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
23261 +{
23262 +       wait->task = current;
23263 +       if (list_empty(&wait->task_list))
23264 +               list_add(&wait->task_list, &q->task_list);
23265 +}
23266 +
23267 +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
23268 +{
23269 +       unsigned long flags;
23270 +
23271 +       raw_spin_lock_irqsave(&q->lock, flags);
23272 +       __prepare_to_swait(q, wait);
23273 +       set_current_state(state);
23274 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23275 +}
23276 +EXPORT_SYMBOL(prepare_to_swait);
23277 +
23278 +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
23279 +{
23280 +       if (signal_pending_state(state, current))
23281 +               return -ERESTARTSYS;
23282 +
23283 +       prepare_to_swait(q, wait, state);
23284 +
23285 +       return 0;
23286 +}
23287 +EXPORT_SYMBOL(prepare_to_swait_event);
23288 +
23289 +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23290 +{
23291 +       __set_current_state(TASK_RUNNING);
23292 +       if (!list_empty(&wait->task_list))
23293 +               list_del_init(&wait->task_list);
23294 +}
23295 +
23296 +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23297 +{
23298 +       unsigned long flags;
23299 +
23300 +       __set_current_state(TASK_RUNNING);
23301 +
23302 +       if (!list_empty_careful(&wait->task_list)) {
23303 +               raw_spin_lock_irqsave(&q->lock, flags);
23304 +               list_del_init(&wait->task_list);
23305 +               raw_spin_unlock_irqrestore(&q->lock, flags);
23306 +       }
23307 +}
23308 +EXPORT_SYMBOL(finish_swait);
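
kernel/sched/swait.c implements the simple wait queue flavour used throughout the RT patch: the head is protected by a raw spinlock, so wake-ups are cheap and legal from hard interrupt context. A hypothetical waiter/waker pair, assuming the usual DECLARE_SWAIT_QUEUE_HEAD() and swait_event_interruptible() helpers from linux/swait.h; demo_wq and demo_ready are made-up names:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_ready;

    static int demo_waiter(void *unused)
    {
            /* sleeps until demo_ready is set or a signal arrives */
            swait_event_interruptible(demo_wq, READ_ONCE(demo_ready));
            return 0;
    }

    static void demo_waker(void)
    {
            WRITE_ONCE(demo_ready, true);
            swake_up(&demo_wq);     /* raw-spinlock based: safe from hard irq context */
    }
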
23309 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23310 new file mode 100644
23311 index 000000000000..1950f40ca725
23312 --- /dev/null
23313 +++ b/kernel/sched/swork.c
23314 @@ -0,0 +1,173 @@
23315 +/*
23316 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
23317 + *
23318 + * Provides a framework for enqueuing callbacks from irq context
23319 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
23320 + */
23321 +
23322 +#include <linux/swait.h>
23323 +#include <linux/swork.h>
23324 +#include <linux/kthread.h>
23325 +#include <linux/slab.h>
23326 +#include <linux/spinlock.h>
23327 +#include <linux/export.h>
23328 +
23329 +#define SWORK_EVENT_PENDING     (1 << 0)
23330 +
23331 +static DEFINE_MUTEX(worker_mutex);
23332 +static struct sworker *glob_worker;
23333 +
23334 +struct sworker {
23335 +       struct list_head events;
23336 +       struct swait_queue_head wq;
23337 +
23338 +       raw_spinlock_t lock;
23339 +
23340 +       struct task_struct *task;
23341 +       int refs;
23342 +};
23343 +
23344 +static bool swork_readable(struct sworker *worker)
23345 +{
23346 +       bool r;
23347 +
23348 +       if (kthread_should_stop())
23349 +               return true;
23350 +
23351 +       raw_spin_lock_irq(&worker->lock);
23352 +       r = !list_empty(&worker->events);
23353 +       raw_spin_unlock_irq(&worker->lock);
23354 +
23355 +       return r;
23356 +}
23357 +
23358 +static int swork_kthread(void *arg)
23359 +{
23360 +       struct sworker *worker = arg;
23361 +
23362 +       for (;;) {
23363 +               swait_event_interruptible(worker->wq,
23364 +                                       swork_readable(worker));
23365 +               if (kthread_should_stop())
23366 +                       break;
23367 +
23368 +               raw_spin_lock_irq(&worker->lock);
23369 +               while (!list_empty(&worker->events)) {
23370 +                       struct swork_event *sev;
23371 +
23372 +                       sev = list_first_entry(&worker->events,
23373 +                                       struct swork_event, item);
23374 +                       list_del(&sev->item);
23375 +                       raw_spin_unlock_irq(&worker->lock);
23376 +
23377 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23378 +                                                        &sev->flags));
23379 +                       sev->func(sev);
23380 +                       raw_spin_lock_irq(&worker->lock);
23381 +               }
23382 +               raw_spin_unlock_irq(&worker->lock);
23383 +       }
23384 +       return 0;
23385 +}
23386 +
23387 +static struct sworker *swork_create(void)
23388 +{
23389 +       struct sworker *worker;
23390 +
23391 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23392 +       if (!worker)
23393 +               return ERR_PTR(-ENOMEM);
23394 +
23395 +       INIT_LIST_HEAD(&worker->events);
23396 +       raw_spin_lock_init(&worker->lock);
23397 +       init_swait_queue_head(&worker->wq);
23398 +
23399 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
23400 +       if (IS_ERR(worker->task)) {
23401 +               kfree(worker);
23402 +               return ERR_PTR(-ENOMEM);
23403 +       }
23404 +
23405 +       return worker;
23406 +}
23407 +
23408 +static void swork_destroy(struct sworker *worker)
23409 +{
23410 +       kthread_stop(worker->task);
23411 +
23412 +       WARN_ON(!list_empty(&worker->events));
23413 +       kfree(worker);
23414 +}
23415 +
23416 +/**
23417 + * swork_queue - queue swork
23418 + *
23419 + * Returns %false if @sev was already queued, %true otherwise.
23420 + *
23421 + * The work is queued and processed on a random CPU
23422 + */
23423 +bool swork_queue(struct swork_event *sev)
23424 +{
23425 +       unsigned long flags;
23426 +
23427 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23428 +               return false;
23429 +
23430 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
23431 +       list_add_tail(&sev->item, &glob_worker->events);
23432 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
23433 +
23434 +       swake_up(&glob_worker->wq);
23435 +       return true;
23436 +}
23437 +EXPORT_SYMBOL_GPL(swork_queue);
23438 +
23439 +/**
23440 + * swork_get - get an instance of the sworker
23441 + *
23442 + * Returns a negative error code if the initialization of the worker failed,
23443 + * %0 otherwise.
23444 + *
23445 + */
23446 +int swork_get(void)
23447 +{
23448 +       struct sworker *worker;
23449 +
23450 +       mutex_lock(&worker_mutex);
23451 +       if (!glob_worker) {
23452 +               worker = swork_create();
23453 +               if (IS_ERR(worker)) {
23454 +                       mutex_unlock(&worker_mutex);
23455 +                       return -ENOMEM;
23456 +               }
23457 +
23458 +               glob_worker = worker;
23459 +       }
23460 +
23461 +       glob_worker->refs++;
23462 +       mutex_unlock(&worker_mutex);
23463 +
23464 +       return 0;
23465 +}
23466 +EXPORT_SYMBOL_GPL(swork_get);
23467 +
23468 +/**
23469 + * swork_put - puts an instance of the sworker
23470 + *
23471 + * Will destroy the sworker thread. This function must not be called until all
23472 + * queued events have been completed.
23473 + */
23474 +void swork_put(void)
23475 +{
23476 +       mutex_lock(&worker_mutex);
23477 +
23478 +       glob_worker->refs--;
23479 +       if (glob_worker->refs > 0)
23480 +               goto out;
23481 +
23482 +       swork_destroy(glob_worker);
23483 +       glob_worker = NULL;
23484 +out:
23485 +       mutex_unlock(&worker_mutex);
23486 +}
23487 +EXPORT_SYMBOL_GPL(swork_put);
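
Typical use of the swork framework added above: take a reference to the kswork thread at init time, queue events from (hard) interrupt context, and drop the reference on teardown. The demo_* names are made up and INIT_SWORK() is assumed to be the initializer provided by linux/swork.h:

    #include <linux/interrupt.h>
    #include <linux/swork.h>

    static struct swork_event demo_event;

    static void demo_handler(struct swork_event *sev)
    {
            /* runs in the kswork kthread: preemptible, may take sleeping locks */
    }

    static int demo_init(void)
    {
            int ret = swork_get();          /* creates kswork on first use */

            if (ret)
                    return ret;
            INIT_SWORK(&demo_event, demo_handler);
            return 0;
    }

    static irqreturn_t demo_irq(int irq, void *dev_id)
    {
            swork_queue(&demo_event);       /* returns false if still pending */
            return IRQ_HANDLED;
    }

    static void demo_exit(void)
    {
            swork_put();                    /* destroys kswork once the last user is gone */
    }
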
23488 diff --git a/kernel/signal.c b/kernel/signal.c
23489 index f3f1f7a972fd..bc2c990f3f63 100644
23490 --- a/kernel/signal.c
23491 +++ b/kernel/signal.c
23492 @@ -14,6 +14,7 @@
23493  #include <linux/export.h>
23494  #include <linux/init.h>
23495  #include <linux/sched.h>
23496 +#include <linux/sched/rt.h>
23497  #include <linux/fs.h>
23498  #include <linux/tty.h>
23499  #include <linux/binfmts.h>
23500 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
23501         return false;
23502  }
23503  
23504 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23505 +{
23506 +       struct sigqueue *q = t->sigqueue_cache;
23507 +
23508 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23509 +               return NULL;
23510 +       return q;
23511 +}
23512 +
23513 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23514 +{
23515 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23516 +               return 0;
23517 +       return 1;
23518 +}
23519 +
23520  /*
23521   * allocate a new signal queue record
23522   * - this may be called without locks if and only if t == current, otherwise an
23523   *   appropriate lock must be held to stop the target task from exiting
23524   */
23525  static struct sigqueue *
23526 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23527 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23528 +                   int override_rlimit, int fromslab)
23529  {
23530         struct sigqueue *q = NULL;
23531         struct user_struct *user;
23532 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23533         if (override_rlimit ||
23534             atomic_read(&user->sigpending) <=
23535                         task_rlimit(t, RLIMIT_SIGPENDING)) {
23536 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
23537 +               if (!fromslab)
23538 +                       q = get_task_cache(t);
23539 +               if (!q)
23540 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
23541         } else {
23542                 print_dropped_signal(sig);
23543         }
23544 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23545         return q;
23546  }
23547  
23548 +static struct sigqueue *
23549 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23550 +                int override_rlimit)
23551 +{
23552 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23553 +}
23554 +
23555  static void __sigqueue_free(struct sigqueue *q)
23556  {
23557         if (q->flags & SIGQUEUE_PREALLOC)
23558 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
23559         kmem_cache_free(sigqueue_cachep, q);
23560  }
23561  
23562 +static void sigqueue_free_current(struct sigqueue *q)
23563 +{
23564 +       struct user_struct *up;
23565 +
23566 +       if (q->flags & SIGQUEUE_PREALLOC)
23567 +               return;
23568 +
23569 +       up = q->user;
23570 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23571 +               atomic_dec(&up->sigpending);
23572 +               free_uid(up);
23573 +       } else
23574 +                 __sigqueue_free(q);
23575 +}
23576 +
23577  void flush_sigqueue(struct sigpending *queue)
23578  {
23579         struct sigqueue *q;
23580 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
23581  }
23582  
23583  /*
23584 + * Called from __exit_signal. Flush tsk->pending and
23585 + * tsk->sigqueue_cache
23586 + */
23587 +void flush_task_sigqueue(struct task_struct *tsk)
23588 +{
23589 +       struct sigqueue *q;
23590 +
23591 +       flush_sigqueue(&tsk->pending);
23592 +
23593 +       q = get_task_cache(tsk);
23594 +       if (q)
23595 +               kmem_cache_free(sigqueue_cachep, q);
23596 +}
23597 +
23598 +/*
23599   * Flush all pending signals for this kthread.
23600   */
23601  void flush_signals(struct task_struct *t)
23602 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
23603  still_pending:
23604                 list_del_init(&first->list);
23605                 copy_siginfo(info, &first->info);
23606 -               __sigqueue_free(first);
23607 +               sigqueue_free_current(first);
23608         } else {
23609                 /*
23610                  * Ok, it wasn't in the queue.  This must be
23611 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
23612  {
23613         int signr;
23614  
23615 +       WARN_ON_ONCE(tsk != current);
23616 +
23617         /* We only dequeue private signals from ourselves, we don't let
23618          * signalfd steal them
23619          */
23620 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
23621   * We don't want to have recursive SIGSEGV's etc, for example,
23622   * that is why we also clear SIGNAL_UNKILLABLE.
23623   */
23624 -int
23625 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23626 +static int
23627 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23628  {
23629         unsigned long int flags;
23630         int ret, blocked, ignored;
23631 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23632         return ret;
23633  }
23634  
23635 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23636 +{
23637 +/*
23638 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23639 + * since it cannot enable preemption, and the signal code's spin_locks
23640 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23641 + * send the signal on exit of the trap.
23642 + */
23643 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23644 +       if (in_atomic()) {
23645 +               if (WARN_ON_ONCE(t != current))
23646 +                       return 0;
23647 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
23648 +                       return 0;
23649 +
23650 +               if (is_si_special(info)) {
23651 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
23652 +                       t->forced_info.si_signo = sig;
23653 +                       t->forced_info.si_errno = 0;
23654 +                       t->forced_info.si_code = SI_KERNEL;
23655 +                       t->forced_info.si_pid = 0;
23656 +                       t->forced_info.si_uid = 0;
23657 +               } else {
23658 +                       t->forced_info = *info;
23659 +               }
23660 +
23661 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23662 +               return 0;
23663 +       }
23664 +#endif
23665 +       return do_force_sig_info(sig, info, t);
23666 +}
23667 +
23668  /*
23669   * Nuke all other threads in the group.
23670   */
23671 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23672                  * Disable interrupts early to avoid deadlocks.
23673                  * See rcu_read_unlock() comment header for details.
23674                  */
23675 -               local_irq_save(*flags);
23676 +               local_irq_save_nort(*flags);
23677                 rcu_read_lock();
23678                 sighand = rcu_dereference(tsk->sighand);
23679                 if (unlikely(sighand == NULL)) {
23680                         rcu_read_unlock();
23681 -                       local_irq_restore(*flags);
23682 +                       local_irq_restore_nort(*flags);
23683                         break;
23684                 }
23685                 /*
23686 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23687                 }
23688                 spin_unlock(&sighand->siglock);
23689                 rcu_read_unlock();
23690 -               local_irq_restore(*flags);
23691 +               local_irq_restore_nort(*flags);
23692         }
23693  
23694         return sighand;
23695 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
23696   */
23697  struct sigqueue *sigqueue_alloc(void)
23698  {
23699 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23700 +       /* Preallocated sigqueue objects always come from the slab cache! */
23701 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23702  
23703         if (q)
23704                 q->flags |= SIGQUEUE_PREALLOC;
23705 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
23706                 if (gstop_done && ptrace_reparented(current))
23707                         do_notify_parent_cldstop(current, false, why);
23708  
23709 -               /*
23710 -                * Don't want to allow preemption here, because
23711 -                * sys_ptrace() needs this task to be inactive.
23712 -                *
23713 -                * XXX: implement read_unlock_no_resched().
23714 -                */
23715 -               preempt_disable();
23716                 read_unlock(&tasklist_lock);
23717 -               preempt_enable_no_resched();
23718                 freezable_schedule();
23719         } else {
23720                 /*
23721 diff --git a/kernel/softirq.c b/kernel/softirq.c
23722 index 479e4436f787..cb9c1d5dee10 100644
23723 --- a/kernel/softirq.c
23724 +++ b/kernel/softirq.c
23725 @@ -21,10 +21,12 @@
23726  #include <linux/freezer.h>
23727  #include <linux/kthread.h>
23728  #include <linux/rcupdate.h>
23729 +#include <linux/delay.h>
23730  #include <linux/ftrace.h>
23731  #include <linux/smp.h>
23732  #include <linux/smpboot.h>
23733  #include <linux/tick.h>
23734 +#include <linux/locallock.h>
23735  #include <linux/irq.h>
23736  
23737  #define CREATE_TRACE_POINTS
23738 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
23739  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23740  
23741  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23742 +#ifdef CONFIG_PREEMPT_RT_FULL
23743 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23744 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23745 +#endif
23746  
23747  const char * const softirq_to_name[NR_SOFTIRQS] = {
23748         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
23749         "TASKLET", "SCHED", "HRTIMER", "RCU"
23750  };
23751  
23752 +#ifdef CONFIG_NO_HZ_COMMON
23753 +# ifdef CONFIG_PREEMPT_RT_FULL
23754 +
23755 +struct softirq_runner {
23756 +       struct task_struct *runner[NR_SOFTIRQS];
23757 +};
23758 +
23759 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23760 +
23761 +static inline void softirq_set_runner(unsigned int sirq)
23762 +{
23763 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23764 +
23765 +       sr->runner[sirq] = current;
23766 +}
23767 +
23768 +static inline void softirq_clr_runner(unsigned int sirq)
23769 +{
23770 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23771 +
23772 +       sr->runner[sirq] = NULL;
23773 +}
23774 +
23775 +/*
23776 + * On preempt-rt a softirq running context might be blocked on a
23777 + * lock. There might be no other runnable task on this CPU because the
23778 + * lock owner runs on some other CPU. So we have to go into idle with
23779 + * the pending bit set. Therefore we need to check this, otherwise we
23780 + * warn about false positives which confuses users and defeats the
23781 + * whole purpose of this test.
23782 + *
23783 + * This code is called with interrupts disabled.
23784 + */
23785 +void softirq_check_pending_idle(void)
23786 +{
23787 +       static int rate_limit;
23788 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23789 +       u32 warnpending;
23790 +       int i;
23791 +
23792 +       if (rate_limit >= 10)
23793 +               return;
23794 +
23795 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23796 +       for (i = 0; i < NR_SOFTIRQS; i++) {
23797 +               struct task_struct *tsk = sr->runner[i];
23798 +
23799 +               /*
23800 +                * The wakeup code in rtmutex.c wakes up the task
23801 +                * _before_ it sets pi_blocked_on to NULL under
23802 +                * tsk->pi_lock. So we need to check for both: state
23803 +                * and pi_blocked_on.
23804 +                */
23805 +               if (tsk) {
23806 +                       raw_spin_lock(&tsk->pi_lock);
23807 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23808 +                               /* Clear all bits pending in that task */
23809 +                               warnpending &= ~(tsk->softirqs_raised);
23810 +                               warnpending &= ~(1 << i);
23811 +                       }
23812 +                       raw_spin_unlock(&tsk->pi_lock);
23813 +               }
23814 +       }
23815 +
23816 +       if (warnpending) {
23817 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23818 +                      warnpending);
23819 +               rate_limit++;
23820 +       }
23821 +}
23822 +# else
23823 +/*
23824 + * On !PREEMPT_RT we just printk rate limited:
23825 + */
23826 +void softirq_check_pending_idle(void)
23827 +{
23828 +       static int rate_limit;
23829 +
23830 +       if (rate_limit < 10 &&
23831 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23832 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23833 +                      local_softirq_pending());
23834 +               rate_limit++;
23835 +       }
23836 +}
23837 +# endif
23838 +
23839 +#else /* !CONFIG_NO_HZ_COMMON */
23840 +static inline void softirq_set_runner(unsigned int sirq) { }
23841 +static inline void softirq_clr_runner(unsigned int sirq) { }
23842 +#endif
23843 +
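The intended consumer of softirq_check_pending_idle() is the NOHZ idle path. A sketch of how the tick-stop check would call it (assumed call site in can_stop_idle_tick() in kernel/time/tick-sched.c; not part of this hunk):

	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
		softirq_check_pending_idle();
		return false;	/* keep the tick, softirqs are still pending */
	}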
23844  /*
23845   * we cannot loop indefinitely here to avoid userspace starvation,
23846   * but we also don't want to introduce a worst case 1/HZ latency
23847 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
23848                 wake_up_process(tsk);
23849  }
23850  
23851 +#ifdef CONFIG_PREEMPT_RT_FULL
23852 +static void wakeup_timer_softirqd(void)
23853 +{
23854 +       /* Interrupts are disabled: no need to stop preemption */
23855 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23856 +
23857 +       if (tsk && tsk->state != TASK_RUNNING)
23858 +               wake_up_process(tsk);
23859 +}
23860 +#endif
23861 +
23862 +static void handle_softirq(unsigned int vec_nr)
23863 +{
23864 +       struct softirq_action *h = softirq_vec + vec_nr;
23865 +       int prev_count;
23866 +
23867 +       prev_count = preempt_count();
23868 +
23869 +       kstat_incr_softirqs_this_cpu(vec_nr);
23870 +
23871 +       trace_softirq_entry(vec_nr);
23872 +       h->action(h);
23873 +       trace_softirq_exit(vec_nr);
23874 +       if (unlikely(prev_count != preempt_count())) {
23875 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23876 +                      vec_nr, softirq_to_name[vec_nr], h->action,
23877 +                      prev_count, preempt_count());
23878 +               preempt_count_set(prev_count);
23879 +       }
23880 +}
23881 +
23882 +#ifndef CONFIG_PREEMPT_RT_FULL
23883 +static inline int ksoftirqd_softirq_pending(void)
23884 +{
23885 +       return local_softirq_pending();
23886 +}
23887 +
23888 +static void handle_pending_softirqs(u32 pending)
23889 +{
23890 +       struct softirq_action *h = softirq_vec;
23891 +       int softirq_bit;
23892 +
23893 +       local_irq_enable();
23894 +
23895 +       h = softirq_vec;
23896 +
23897 +       while ((softirq_bit = ffs(pending))) {
23898 +               unsigned int vec_nr;
23899 +
23900 +               h += softirq_bit - 1;
23901 +               vec_nr = h - softirq_vec;
23902 +               handle_softirq(vec_nr);
23903 +
23904 +               h++;
23905 +               pending >>= softirq_bit;
23906 +       }
23907 +
23908 +       rcu_bh_qs();
23909 +       local_irq_disable();
23910 +}
23911 +
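The ffs() walk above services each pending vector exactly once, lowest bit first, shifting the mask as it goes. A self-contained user-space analogue of the same bit walk (plain C, illustrative only):

#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned int pending = 0x29;		/* vectors 0, 3 and 5 raised */
	int softirq_bit;
	int vec_nr = 0;

	while ((softirq_bit = ffs(pending))) {
		vec_nr += softirq_bit - 1;	/* index of the next set bit */
		printf("service vector %d\n", vec_nr);
		vec_nr++;			/* continue past this bit */
		pending >>= softirq_bit;	/* drop the serviced bit */
	}
	return 0;
}

Running it prints vectors 0, 3 and 5, mirroring how h and pending advance in handle_pending_softirqs().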
23912 +static void run_ksoftirqd(unsigned int cpu)
23913 +{
23914 +       local_irq_disable();
23915 +       if (ksoftirqd_softirq_pending()) {
23916 +               __do_softirq();
23917 +               local_irq_enable();
23918 +               cond_resched_rcu_qs();
23919 +               return;
23920 +       }
23921 +       local_irq_enable();
23922 +}
23923 +
23924  /*
23925   * preempt_count and SOFTIRQ_OFFSET usage:
23926   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
23927 @@ -116,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
23928  
23929         if (preempt_count() == cnt) {
23930  #ifdef CONFIG_DEBUG_PREEMPT
23931 -               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
23932 +               current->preempt_disable_ip = get_lock_parent_ip();
23933  #endif
23934 -               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
23935 +               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
23936         }
23937  }
23938  EXPORT_SYMBOL(__local_bh_disable_ip);
23939 @@ -232,10 +403,8 @@ asmlinkage __visible void __do_softirq(void)
23940         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
23941         unsigned long old_flags = current->flags;
23942         int max_restart = MAX_SOFTIRQ_RESTART;
23943 -       struct softirq_action *h;
23944         bool in_hardirq;
23945         __u32 pending;
23946 -       int softirq_bit;
23947  
23948         /*
23949          * Mask out PF_MEMALLOC as current task context is borrowed for the
23950 @@ -254,36 +423,7 @@ restart:
23951         /* Reset the pending bitmask before enabling irqs */
23952         set_softirq_pending(0);
23953  
23954 -       local_irq_enable();
23955 -
23956 -       h = softirq_vec;
23957 -
23958 -       while ((softirq_bit = ffs(pending))) {
23959 -               unsigned int vec_nr;
23960 -               int prev_count;
23961 -
23962 -               h += softirq_bit - 1;
23963 -
23964 -               vec_nr = h - softirq_vec;
23965 -               prev_count = preempt_count();
23966 -
23967 -               kstat_incr_softirqs_this_cpu(vec_nr);
23968 -
23969 -               trace_softirq_entry(vec_nr);
23970 -               h->action(h);
23971 -               trace_softirq_exit(vec_nr);
23972 -               if (unlikely(prev_count != preempt_count())) {
23973 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23974 -                              vec_nr, softirq_to_name[vec_nr], h->action,
23975 -                              prev_count, preempt_count());
23976 -                       preempt_count_set(prev_count);
23977 -               }
23978 -               h++;
23979 -               pending >>= softirq_bit;
23980 -       }
23981 -
23982 -       rcu_bh_qs();
23983 -       local_irq_disable();
23984 +       handle_pending_softirqs(pending);
23985  
23986         pending = local_softirq_pending();
23987         if (pending) {
23988 @@ -320,6 +460,310 @@ asmlinkage __visible void do_softirq(void)
23989  }
23990  
23991  /*
23992 + * This function must run with irqs disabled!
23993 + */
23994 +void raise_softirq_irqoff(unsigned int nr)
23995 +{
23996 +       __raise_softirq_irqoff(nr);
23997 +
23998 +       /*
23999 +        * If we're in an interrupt or softirq, we're done
24000 +        * (this also catches softirq-disabled code). We will
24001 +        * actually run the softirq once we return from
24002 +        * the irq or softirq.
24003 +        *
24004 +        * Otherwise we wake up ksoftirqd to make sure we
24005 +        * schedule the softirq soon.
24006 +        */
24007 +       if (!in_interrupt())
24008 +               wakeup_softirqd();
24009 +}
24010 +
24011 +void __raise_softirq_irqoff(unsigned int nr)
24012 +{
24013 +       trace_softirq_raise(nr);
24014 +       or_softirq_pending(1UL << nr);
24015 +}
24016 +
24017 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24018 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24019 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24020 +
24021 +#else /* !PREEMPT_RT_FULL */
24022 +
24023 +/*
24024 + * On RT we serialize softirq execution with a cpu local lock per softirq
24025 + */
24026 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24027 +
24028 +void __init softirq_early_init(void)
24029 +{
24030 +       int i;
24031 +
24032 +       for (i = 0; i < NR_SOFTIRQS; i++)
24033 +               local_irq_lock_init(local_softirq_locks[i]);
24034 +}
24035 +
24036 +static void lock_softirq(int which)
24037 +{
24038 +       local_lock(local_softirq_locks[which]);
24039 +}
24040 +
24041 +static void unlock_softirq(int which)
24042 +{
24043 +       local_unlock(local_softirq_locks[which]);
24044 +}
24045 +
24046 +static void do_single_softirq(int which)
24047 +{
24048 +       unsigned long old_flags = current->flags;
24049 +
24050 +       current->flags &= ~PF_MEMALLOC;
24051 +       vtime_account_irq_enter(current);
24052 +       current->flags |= PF_IN_SOFTIRQ;
24053 +       lockdep_softirq_enter();
24054 +       local_irq_enable();
24055 +       handle_softirq(which);
24056 +       local_irq_disable();
24057 +       lockdep_softirq_exit();
24058 +       current->flags &= ~PF_IN_SOFTIRQ;
24059 +       vtime_account_irq_enter(current);
24060 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
24061 +}
24062 +
24063 +/*
24064 + * Called with interrupts disabled. Process softirqs which were raised
24065 + * in current context (or on behalf of ksoftirqd).
24066 + */
24067 +static void do_current_softirqs(void)
24068 +{
24069 +       while (current->softirqs_raised) {
24070 +               int i = __ffs(current->softirqs_raised);
24071 +               unsigned int pending, mask = (1U << i);
24072 +
24073 +               current->softirqs_raised &= ~mask;
24074 +               local_irq_enable();
24075 +
24076 +               /*
24077 +                * If the lock is contended, we boost the owner to
24078 +                * process the softirq or leave the critical section
24079 +                * now.
24080 +                */
24081 +               lock_softirq(i);
24082 +               local_irq_disable();
24083 +               softirq_set_runner(i);
24084 +               /*
24085 +                * Check the local_softirq_pending() bits to see whether
24086 +                * we still need to process this or whether someone else
24087 +                * already took care of it.
24088 +                */
24089 +               pending = local_softirq_pending();
24090 +               if (pending & mask) {
24091 +                       set_softirq_pending(pending & ~mask);
24092 +                       do_single_softirq(i);
24093 +               }
24094 +               softirq_clr_runner(i);
24095 +               WARN_ON(current->softirq_nestcnt != 1);
24096 +               local_irq_enable();
24097 +               unlock_softirq(i);
24098 +               local_irq_disable();
24099 +       }
24100 +}
24101 +
24102 +void __local_bh_disable(void)
24103 +{
24104 +       if (++current->softirq_nestcnt == 1)
24105 +               migrate_disable();
24106 +}
24107 +EXPORT_SYMBOL(__local_bh_disable);
24108 +
24109 +void __local_bh_enable(void)
24110 +{
24111 +       if (WARN_ON(current->softirq_nestcnt == 0))
24112 +               return;
24113 +
24114 +       local_irq_disable();
24115 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24116 +               do_current_softirqs();
24117 +       local_irq_enable();
24118 +
24119 +       if (--current->softirq_nestcnt == 0)
24120 +               migrate_enable();
24121 +}
24122 +EXPORT_SYMBOL(__local_bh_enable);
24123 +
24124 +void _local_bh_enable(void)
24125 +{
24126 +       if (WARN_ON(current->softirq_nestcnt == 0))
24127 +               return;
24128 +       if (--current->softirq_nestcnt == 0)
24129 +               migrate_enable();
24130 +}
24131 +EXPORT_SYMBOL(_local_bh_enable);
24132 +
24133 +int in_serving_softirq(void)
24134 +{
24135 +       return current->flags & PF_IN_SOFTIRQ;
24136 +}
24137 +EXPORT_SYMBOL(in_serving_softirq);
24138 +
24139 +/* Called with preemption disabled */
24140 +static void run_ksoftirqd(unsigned int cpu)
24141 +{
24142 +       local_irq_disable();
24143 +       current->softirq_nestcnt++;
24144 +
24145 +       do_current_softirqs();
24146 +       current->softirq_nestcnt--;
24147 +       local_irq_enable();
24148 +       cond_resched_rcu_qs();
24149 +}
24150 +
24151 +/*
24152 + * Called from netif_rx_ni(). Preemption enabled, but migration
24153 + * disabled. So the cpu can't go away under us.
24154 + */
24155 +void thread_do_softirq(void)
24156 +{
24157 +       if (!in_serving_softirq() && current->softirqs_raised) {
24158 +               current->softirq_nestcnt++;
24159 +               do_current_softirqs();
24160 +               current->softirq_nestcnt--;
24161 +       }
24162 +}
24163 +
24164 +static void do_raise_softirq_irqoff(unsigned int nr)
24165 +{
24166 +       unsigned int mask;
24167 +
24168 +       mask = 1UL << nr;
24169 +
24170 +       trace_softirq_raise(nr);
24171 +       or_softirq_pending(mask);
24172 +
24173 +       /*
24174 +        * If we are not in a hard interrupt and inside a bh disabled
24175 +        * region, we simply raise the flag on current. local_bh_enable()
24176 +        * will make sure that the softirq is executed. Otherwise we
24177 +        * delegate it to ksoftirqd.
24178 +        */
24179 +       if (!in_irq() && current->softirq_nestcnt)
24180 +               current->softirqs_raised |= mask;
24181 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24182 +               return;
24183 +
24184 +       if (mask & TIMER_SOFTIRQS)
24185 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24186 +       else
24187 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24188 +}
24189 +
24190 +static void wakeup_proper_softirq(unsigned int nr)
24191 +{
24192 +       if ((1UL << nr) & TIMER_SOFTIRQS)
24193 +               wakeup_timer_softirqd();
24194 +       else
24195 +               wakeup_softirqd();
24196 +}
24197 +
24198 +
24199 +void __raise_softirq_irqoff(unsigned int nr)
24200 +{
24201 +       do_raise_softirq_irqoff(nr);
24202 +       if (!in_irq() && !current->softirq_nestcnt)
24203 +               wakeup_proper_softirq(nr);
24204 +}
24205 +
24206 +/*
24207 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24208 + */
24209 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
24210 +{
24211 +       unsigned int mask;
24212 +
24213 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24214 +                        !__this_cpu_read(ktimer_softirqd)))
24215 +               return;
24216 +       mask = 1UL << nr;
24217 +
24218 +       trace_softirq_raise(nr);
24219 +       or_softirq_pending(mask);
24220 +       if (mask & TIMER_SOFTIRQS)
24221 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24222 +       else
24223 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24224 +       wakeup_proper_softirq(nr);
24225 +}
24226 +
24227 +/*
24228 + * This function must run with irqs disabled!
24229 + */
24230 +void raise_softirq_irqoff(unsigned int nr)
24231 +{
24232 +       do_raise_softirq_irqoff(nr);
24233 +
24234 +       /*
24235 +        * If we're in a hard interrupt we let the irq return code deal
24236 +        * with the wakeup of ksoftirqd.
24237 +        */
24238 +       if (in_irq())
24239 +               return;
24240 +       /*
24241 +        * If we are in thread context but outside of a bh disabled
24242 +        * region, we need to wake ksoftirqd as well.
24243 +        *
24244 +        * CHECKME: Some of the places which do that could be wrapped
24245 +        * into local_bh_disable/enable pairs. Though it's unclear
24246 +        * whether this is worth the effort. To find those places just
24247 +        * raise a WARN() if the condition is met.
24248 +        */
24249 +       if (!current->softirq_nestcnt)
24250 +               wakeup_proper_softirq(nr);
24251 +}
24252 +
24253 +static inline int ksoftirqd_softirq_pending(void)
24254 +{
24255 +       return current->softirqs_raised;
24256 +}
24257 +
24258 +static inline void local_bh_disable_nort(void) { }
24259 +static inline void _local_bh_enable_nort(void) { }
24260 +
24261 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24262 +{
24263 +       /* Take over all but timer pending softirqs when starting */
24264 +       local_irq_disable();
24265 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24266 +       local_irq_enable();
24267 +}
24268 +
24269 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
24270 +{
24271 +       struct sched_param param = { .sched_priority = 1 };
24272 +
24273 +       sched_setscheduler(current, SCHED_FIFO, &param);
24274 +
24275 +       /* Take over timer pending softirqs when starting */
24276 +       local_irq_disable();
24277 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24278 +       local_irq_enable();
24279 +}
24280 +
24281 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24282 +                                                   bool online)
24283 +{
24284 +       struct sched_param param = { .sched_priority = 0 };
24285 +
24286 +       sched_setscheduler(current, SCHED_NORMAL, &param);
24287 +}
24288 +
24289 +static int ktimer_softirqd_should_run(unsigned int cpu)
24290 +{
24291 +       return current->softirqs_raised;
24292 +}
24293 +
24294 +#endif /* PREEMPT_RT_FULL */
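The two setup helpers above split the pending mask along TIMER_SOFTIRQS: TIMER and HRTIMER work moves to ktimersoftd, everything else stays with ksoftirqd. A stand-alone illustration of that split (user-space C; the pending value is made up for the example, the vector numbers follow the softirq_to_name[] order shown earlier in this file):

#include <stdio.h>

#define TIMER_SOFTIRQ	1
#define HRTIMER_SOFTIRQ	8
#define TIMER_SOFTIRQS	((1u << TIMER_SOFTIRQ) | (1u << HRTIMER_SOFTIRQ))

int main(void)
{
	unsigned int pending = 0x10b;	/* HI, TIMER, NET_RX and HRTIMER raised */

	printf("ktimersoftd takes %#x, ksoftirqd takes %#x\n",
	       pending & TIMER_SOFTIRQS, pending & ~TIMER_SOFTIRQS);
	return 0;
}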
24295 +/*
24296   * Enter an interrupt context.
24297   */
24298  void irq_enter(void)
24299 @@ -330,9 +774,9 @@ void irq_enter(void)
24300                  * Prevent raise_softirq from needlessly waking up ksoftirqd
24301                  * here, as softirq will be serviced on return from interrupt.
24302                  */
24303 -               local_bh_disable();
24304 +               local_bh_disable_nort();
24305                 tick_irq_enter();
24306 -               _local_bh_enable();
24307 +               _local_bh_enable_nort();
24308         }
24309  
24310         __irq_enter();
24311 @@ -340,6 +784,7 @@ void irq_enter(void)
24312  
24313  static inline void invoke_softirq(void)
24314  {
24315 +#ifndef CONFIG_PREEMPT_RT_FULL
24316         if (!force_irqthreads) {
24317  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
24318                 /*
24319 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
24320         } else {
24321                 wakeup_softirqd();
24322         }
24323 +#else /* PREEMPT_RT_FULL */
24324 +       unsigned long flags;
24325 +
24326 +       local_irq_save(flags);
24327 +       if (__this_cpu_read(ksoftirqd) &&
24328 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
24329 +               wakeup_softirqd();
24330 +       if (__this_cpu_read(ktimer_softirqd) &&
24331 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24332 +               wakeup_timer_softirqd();
24333 +       local_irq_restore(flags);
24334 +#endif
24335  }
24336  
24337  static inline void tick_irq_exit(void)
24338 @@ -395,26 +852,6 @@ void irq_exit(void)
24339         trace_hardirq_exit(); /* must be last! */
24340  }
24341  
24342 -/*
24343 - * This function must run with irqs disabled!
24344 - */
24345 -inline void raise_softirq_irqoff(unsigned int nr)
24346 -{
24347 -       __raise_softirq_irqoff(nr);
24348 -
24349 -       /*
24350 -        * If we're in an interrupt or softirq, we're done
24351 -        * (this also catches softirq-disabled code). We will
24352 -        * actually run the softirq once we return from
24353 -        * the irq or softirq.
24354 -        *
24355 -        * Otherwise we wake up ksoftirqd to make sure we
24356 -        * schedule the softirq soon.
24357 -        */
24358 -       if (!in_interrupt())
24359 -               wakeup_softirqd();
24360 -}
24361 -
24362  void raise_softirq(unsigned int nr)
24363  {
24364         unsigned long flags;
24365 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
24366         local_irq_restore(flags);
24367  }
24368  
24369 -void __raise_softirq_irqoff(unsigned int nr)
24370 -{
24371 -       trace_softirq_raise(nr);
24372 -       or_softirq_pending(1UL << nr);
24373 -}
24374 -
24375  void open_softirq(int nr, void (*action)(struct softirq_action *))
24376  {
24377         softirq_vec[nr].action = action;
24378 @@ -446,15 +877,45 @@ struct tasklet_head {
24379  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24380  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24381  
24382 +static inline void
24383 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
24384 +{
24385 +       if (tasklet_trylock(t)) {
24386 +again:
24387 +               /* We may have been preempted before tasklet_trylock
24388 +                * and __tasklet_action may have already run.
24389 +                * So double-check the sched bit while the tasklet
24390 +                * is locked before adding it to the list.
24391 +                */
24392 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24393 +                       t->next = NULL;
24394 +                       *head->tail = t;
24395 +                       head->tail = &(t->next);
24396 +                       raise_softirq_irqoff(nr);
24397 +                       tasklet_unlock(t);
24398 +               } else {
24399 +                       /* This is subtle. If we hit the corner case above,
24400 +                        * it is possible that we get preempted right here,
24401 +                        * and another task has successfully called
24402 +                        * tasklet_schedule(), then this function, and
24403 +                        * failed on the trylock. Thus we must be sure
24404 +                        * before releasing the tasklet lock, that the
24405 +                        * SCHED_BIT is clear. Otherwise the tasklet
24406 +                        * may get its SCHED_BIT set, but not added to the
24407 +                        * list
24408 +                        * list.
24409 +                       if (!tasklet_tryunlock(t))
24410 +                               goto again;
24411 +               }
24412 +       }
24413 +}
24414 +
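tasklet_trylock() and tasklet_tryunlock() used above are atomic transitions on the RUN bit of t->state: trylock sets RUN only if it was clear, tryunlock only allows the pure RUN -> 0 transition via cmpxchg. A rough user-space analogue (C11 atomics, illustrative only; not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { TASKLET_STATE_SCHED = 0, TASKLET_STATE_RUN = 1 };

/* Take the RUN bit; succeeds only if it was clear. */
static bool demo_tasklet_trylock(atomic_ulong *state)
{
	unsigned long old = atomic_fetch_or(state, 1UL << TASKLET_STATE_RUN);
	return !(old & (1UL << TASKLET_STATE_RUN));
}

/* Drop the lock only via the pure RUN -> 0 transition. */
static bool demo_tasklet_tryunlock(atomic_ulong *state)
{
	unsigned long expected = 1UL << TASKLET_STATE_RUN;
	return atomic_compare_exchange_strong(state, &expected, 0UL);
}

int main(void)
{
	atomic_ulong state = ATOMIC_VAR_INIT(0);

	printf("trylock:   %d\n", demo_tasklet_trylock(&state));	/* 1: got RUN */
	printf("trylock:   %d\n", demo_tasklet_trylock(&state));	/* 0: already RUN */
	printf("tryunlock: %d\n", demo_tasklet_tryunlock(&state));	/* 1: RUN -> 0 */
	return 0;
}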
24415  void __tasklet_schedule(struct tasklet_struct *t)
24416  {
24417         unsigned long flags;
24418  
24419         local_irq_save(flags);
24420 -       t->next = NULL;
24421 -       *__this_cpu_read(tasklet_vec.tail) = t;
24422 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
24423 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
24424 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24425         local_irq_restore(flags);
24426  }
24427  EXPORT_SYMBOL(__tasklet_schedule);
24428 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
24429         unsigned long flags;
24430  
24431         local_irq_save(flags);
24432 -       t->next = NULL;
24433 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
24434 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
24435 -       raise_softirq_irqoff(HI_SOFTIRQ);
24436 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24437         local_irq_restore(flags);
24438  }
24439  EXPORT_SYMBOL(__tasklet_hi_schedule);
24440 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
24441  {
24442         BUG_ON(!irqs_disabled());
24443  
24444 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
24445 -       __this_cpu_write(tasklet_hi_vec.head, t);
24446 -       __raise_softirq_irqoff(HI_SOFTIRQ);
24447 +       __tasklet_hi_schedule(t);
24448  }
24449  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
24450  
24451 -static void tasklet_action(struct softirq_action *a)
24452 +void tasklet_enable(struct tasklet_struct *t)
24453  {
24454 -       struct tasklet_struct *list;
24455 +       if (!atomic_dec_and_test(&t->count))
24456 +               return;
24457 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24458 +               tasklet_schedule(t);
24459 +}
24460 +EXPORT_SYMBOL(tasklet_enable);
24461  
24462 -       local_irq_disable();
24463 -       list = __this_cpu_read(tasklet_vec.head);
24464 -       __this_cpu_write(tasklet_vec.head, NULL);
24465 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24466 -       local_irq_enable();
24467 +static void __tasklet_action(struct softirq_action *a,
24468 +                            struct tasklet_struct *list)
24469 +{
24470 +       int loops = 1000000;
24471  
24472         while (list) {
24473                 struct tasklet_struct *t = list;
24474  
24475                 list = list->next;
24476  
24477 -               if (tasklet_trylock(t)) {
24478 -                       if (!atomic_read(&t->count)) {
24479 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24480 -                                                       &t->state))
24481 -                                       BUG();
24482 -                               t->func(t->data);
24483 -                               tasklet_unlock(t);
24484 -                               continue;
24485 -                       }
24486 -                       tasklet_unlock(t);
24487 +               /*
24488 +                * Should always succeed - after a tasklet got on the
24489 +                * list (after getting the SCHED bit set from 0 to 1),
24490 +                * nothing but the tasklet softirq it got queued to can
24491 +                * lock it:
24492 +                */
24493 +               if (!tasklet_trylock(t)) {
24494 +                       WARN_ON(1);
24495 +                       continue;
24496                 }
24497  
24498 -               local_irq_disable();
24499                 t->next = NULL;
24500 -               *__this_cpu_read(tasklet_vec.tail) = t;
24501 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
24502 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24503 -               local_irq_enable();
24504 +
24505 +               /*
24506 +                * If we cannot handle the tasklet because it's disabled,
24507 +                * mark it as pending. tasklet_enable() will later
24508 +                * re-schedule the tasklet.
24509 +                */
24510 +               if (unlikely(atomic_read(&t->count))) {
24511 +out_disabled:
24512 +                       /* implicit unlock: */
24513 +                       wmb();
24514 +                       t->state = TASKLET_STATEF_PENDING;
24515 +                       continue;
24516 +               }
24517 +
24518 +               /*
24519 +                * After this point on the tasklet might be rescheduled
24520 +                * on another CPU, but it can only be added to another
24521 +                * CPU's tasklet list if we unlock the tasklet (which we
24522 +                * don't do yet).
24523 +                */
24524 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24525 +                       WARN_ON(1);
24526 +
24527 +again:
24528 +               t->func(t->data);
24529 +
24530 +               /*
24531 +                * Try to unlock the tasklet. We must use cmpxchg, because
24532 +                * another CPU might have scheduled or disabled the tasklet.
24533 +                * We only allow the STATE_RUN -> 0 transition here.
24534 +                */
24535 +               while (!tasklet_tryunlock(t)) {
24536 +                       /*
24537 +                        * If it got disabled meanwhile, bail out:
24538 +                        */
24539 +                       if (atomic_read(&t->count))
24540 +                               goto out_disabled;
24541 +                       /*
24542 +                        * If it got scheduled meanwhile, re-execute
24543 +                        * the tasklet function:
24544 +                        */
24545 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24546 +                               goto again;
24547 +                       if (!--loops) {
24548 +                               printk("hm, tasklet state: %08lx\n", t->state);
24549 +                               WARN_ON(1);
24550 +                               tasklet_unlock(t);
24551 +                               break;
24552 +                       }
24553 +               }
24554         }
24555  }
24556  
24557 +static void tasklet_action(struct softirq_action *a)
24558 +{
24559 +       struct tasklet_struct *list;
24560 +
24561 +       local_irq_disable();
24562 +
24563 +       list = __this_cpu_read(tasklet_vec.head);
24564 +       __this_cpu_write(tasklet_vec.head, NULL);
24565 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24566 +
24567 +       local_irq_enable();
24568 +
24569 +       __tasklet_action(a, list);
24570 +}
24571 +
24572  static void tasklet_hi_action(struct softirq_action *a)
24573  {
24574         struct tasklet_struct *list;
24575  
24576         local_irq_disable();
24577 +
24578         list = __this_cpu_read(tasklet_hi_vec.head);
24579         __this_cpu_write(tasklet_hi_vec.head, NULL);
24580         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24581 -       local_irq_enable();
24582  
24583 -       while (list) {
24584 -               struct tasklet_struct *t = list;
24585 -
24586 -               list = list->next;
24587 -
24588 -               if (tasklet_trylock(t)) {
24589 -                       if (!atomic_read(&t->count)) {
24590 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24591 -                                                       &t->state))
24592 -                                       BUG();
24593 -                               t->func(t->data);
24594 -                               tasklet_unlock(t);
24595 -                               continue;
24596 -                       }
24597 -                       tasklet_unlock(t);
24598 -               }
24599 +       local_irq_enable();
24600  
24601 -               local_irq_disable();
24602 -               t->next = NULL;
24603 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
24604 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24605 -               __raise_softirq_irqoff(HI_SOFTIRQ);
24606 -               local_irq_enable();
24607 -       }
24608 +       __tasklet_action(a, list);
24609  }
24610  
24611  void tasklet_init(struct tasklet_struct *t,
24612 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
24613  
24614         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24615                 do {
24616 -                       yield();
24617 +                       msleep(1);
24618                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24619         }
24620         tasklet_unlock_wait(t);
24621 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
24622         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24623  }
24624  
24625 -static int ksoftirqd_should_run(unsigned int cpu)
24626 -{
24627 -       return local_softirq_pending();
24628 -}
24629 -
24630 -static void run_ksoftirqd(unsigned int cpu)
24631 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24632 +void tasklet_unlock_wait(struct tasklet_struct *t)
24633  {
24634 -       local_irq_disable();
24635 -       if (local_softirq_pending()) {
24636 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24637                 /*
24638 -                * We can safely run softirq on inline stack, as we are not deep
24639 -                * in the task stack here.
24640 +                * Hack for now to avoid this busy-loop:
24641                  */
24642 -               __do_softirq();
24643 -               local_irq_enable();
24644 -               cond_resched_rcu_qs();
24645 -               return;
24646 +#ifdef CONFIG_PREEMPT_RT_FULL
24647 +               msleep(1);
24648 +#else
24649 +               barrier();
24650 +#endif
24651         }
24652 -       local_irq_enable();
24653 +}
24654 +EXPORT_SYMBOL(tasklet_unlock_wait);
24655 +#endif
24656 +
24657 +static int ksoftirqd_should_run(unsigned int cpu)
24658 +{
24659 +       return ksoftirqd_softirq_pending();
24660  }
24661  
24662  #ifdef CONFIG_HOTPLUG_CPU
24663 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
24664  
24665  static struct smp_hotplug_thread softirq_threads = {
24666         .store                  = &ksoftirqd,
24667 +       .setup                  = ksoftirqd_set_sched_params,
24668         .thread_should_run      = ksoftirqd_should_run,
24669         .thread_fn              = run_ksoftirqd,
24670         .thread_comm            = "ksoftirqd/%u",
24671  };
24672  
24673 +#ifdef CONFIG_PREEMPT_RT_FULL
24674 +static struct smp_hotplug_thread softirq_timer_threads = {
24675 +       .store                  = &ktimer_softirqd,
24676 +       .setup                  = ktimer_softirqd_set_sched_params,
24677 +       .cleanup                = ktimer_softirqd_clr_sched_params,
24678 +       .thread_should_run      = ktimer_softirqd_should_run,
24679 +       .thread_fn              = run_ksoftirqd,
24680 +       .thread_comm            = "ktimersoftd/%u",
24681 +};
24682 +#endif
24683 +
24684  static __init int spawn_ksoftirqd(void)
24685  {
24686         register_cpu_notifier(&cpu_nfb);
24687  
24688         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24689 +#ifdef CONFIG_PREEMPT_RT_FULL
24690 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24691 +#endif
24692  
24693         return 0;
24694  }
24695 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24696 index a3bbaee77c58..f84d3b45cda7 100644
24697 --- a/kernel/stop_machine.c
24698 +++ b/kernel/stop_machine.c
24699 @@ -37,7 +37,7 @@ struct cpu_stop_done {
24700  struct cpu_stopper {
24701         struct task_struct      *thread;
24702  
24703 -       spinlock_t              lock;
24704 +       raw_spinlock_t          lock;
24705         bool                    enabled;        /* is this stopper enabled? */
24706         struct list_head        works;          /* list of pending works */
24707  
24708 @@ -86,12 +86,12 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
24709         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24710         unsigned long flags;
24711  
24712 -       spin_lock_irqsave(&stopper->lock, flags);
24713 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24714         if (stopper->enabled)
24715                 __cpu_stop_queue_work(stopper, work);
24716         else
24717                 cpu_stop_signal_done(work->done, false);
24718 -       spin_unlock_irqrestore(&stopper->lock, flags);
24719 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24720  }
24721  
24722  /**
24723 @@ -224,8 +224,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24724         int err;
24725  
24726         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
24727 -       spin_lock_irq(&stopper1->lock);
24728 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24729 +       raw_spin_lock_irq(&stopper1->lock);
24730 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24731  
24732         err = -ENOENT;
24733         if (!stopper1->enabled || !stopper2->enabled)
24734 @@ -235,8 +235,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24735         __cpu_stop_queue_work(stopper1, work1);
24736         __cpu_stop_queue_work(stopper2, work2);
24737  unlock:
24738 -       spin_unlock(&stopper2->lock);
24739 -       spin_unlock_irq(&stopper1->lock);
24740 +       raw_spin_unlock(&stopper2->lock);
24741 +       raw_spin_unlock_irq(&stopper1->lock);
24742         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
24743  
24744         return err;
24745 @@ -258,7 +258,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24746         struct cpu_stop_work work1, work2;
24747         struct multi_stop_data msdata;
24748  
24749 -       preempt_disable();
24750 +       preempt_disable_nort();
24751         msdata = (struct multi_stop_data){
24752                 .fn = fn,
24753                 .data = arg,
24754 @@ -278,11 +278,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24755         if (cpu1 > cpu2)
24756                 swap(cpu1, cpu2);
24757         if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
24758 -               preempt_enable();
24759 +               preempt_enable_nort();
24760                 return -ENOENT;
24761         }
24762  
24763 -       preempt_enable();
24764 +       preempt_enable_nort();
24765  
24766         wait_for_completion(&done.completion);
24767  
24768 @@ -315,17 +315,20 @@ static DEFINE_MUTEX(stop_cpus_mutex);
24769  
24770  static void queue_stop_cpus_work(const struct cpumask *cpumask,
24771                                  cpu_stop_fn_t fn, void *arg,
24772 -                                struct cpu_stop_done *done)
24773 +                                struct cpu_stop_done *done, bool inactive)
24774  {
24775         struct cpu_stop_work *work;
24776         unsigned int cpu;
24777  
24778         /*
24779 -        * Disable preemption while queueing to avoid getting
24780 -        * preempted by a stopper which might wait for other stoppers
24781 -        * to enter @fn which can lead to deadlock.
24782 +        * Make sure that all work is queued on all cpus before
24783 +        * any of the cpus can execute it.
24784          */
24785 -       lg_global_lock(&stop_cpus_lock);
24786 +       if (!inactive)
24787 +               lg_global_lock(&stop_cpus_lock);
24788 +       else
24789 +               lg_global_trylock_relax(&stop_cpus_lock);
24790 +
24791         for_each_cpu(cpu, cpumask) {
24792                 work = &per_cpu(cpu_stopper.stop_work, cpu);
24793                 work->fn = fn;
24794 @@ -342,7 +345,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
24795         struct cpu_stop_done done;
24796  
24797         cpu_stop_init_done(&done, cpumask_weight(cpumask));
24798 -       queue_stop_cpus_work(cpumask, fn, arg, &done);
24799 +       queue_stop_cpus_work(cpumask, fn, arg, &done, false);
24800         wait_for_completion(&done.completion);
24801         return done.executed ? done.ret : -ENOENT;
24802  }
24803 @@ -422,9 +425,9 @@ static int cpu_stop_should_run(unsigned int cpu)
24804         unsigned long flags;
24805         int run;
24806  
24807 -       spin_lock_irqsave(&stopper->lock, flags);
24808 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24809         run = !list_empty(&stopper->works);
24810 -       spin_unlock_irqrestore(&stopper->lock, flags);
24811 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24812         return run;
24813  }
24814  
24815 @@ -436,13 +439,13 @@ static void cpu_stopper_thread(unsigned int cpu)
24816  
24817  repeat:
24818         work = NULL;
24819 -       spin_lock_irq(&stopper->lock);
24820 +       raw_spin_lock_irq(&stopper->lock);
24821         if (!list_empty(&stopper->works)) {
24822                 work = list_first_entry(&stopper->works,
24823                                         struct cpu_stop_work, list);
24824                 list_del_init(&work->list);
24825         }
24826 -       spin_unlock_irq(&stopper->lock);
24827 +       raw_spin_unlock_irq(&stopper->lock);
24828  
24829         if (work) {
24830                 cpu_stop_fn_t fn = work->fn;
24831 @@ -450,6 +453,16 @@ repeat:
24832                 struct cpu_stop_done *done = work->done;
24833                 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
24834  
24835 +               /*
24836 +                * Wait until the stopper has finished scheduling on all
24837 +                * cpus.
24838 +                */
24839 +               lg_global_lock(&stop_cpus_lock);
24840 +               /*
24841 +                * Let other cpu threads continue as well
24842 +                */
24843 +               lg_global_unlock(&stop_cpus_lock);
24844 +
24845                 /* cpu stop callbacks are not allowed to sleep */
24846                 preempt_disable();
24847  
24848 @@ -520,10 +533,12 @@ static int __init cpu_stop_init(void)
24849         for_each_possible_cpu(cpu) {
24850                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24851  
24852 -               spin_lock_init(&stopper->lock);
24853 +               raw_spin_lock_init(&stopper->lock);
24854                 INIT_LIST_HEAD(&stopper->works);
24855         }
24856  
24857 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
24858 +
24859         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
24860         stop_machine_unpark(raw_smp_processor_id());
24861         stop_machine_initialized = true;
24862 @@ -620,7 +635,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
24863         set_state(&msdata, MULTI_STOP_PREPARE);
24864         cpu_stop_init_done(&done, num_active_cpus());
24865         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
24866 -                            &done);
24867 +                            &done, true);
24868         ret = multi_cpu_stop(&msdata);
24869  
24870         /* Busy wait for completion. */
24871 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
24872 index 17f7bcff1e02..ba3d60144838 100644
24873 --- a/kernel/time/hrtimer.c
24874 +++ b/kernel/time/hrtimer.c
24875 @@ -48,11 +48,13 @@
24876  #include <linux/sched/rt.h>
24877  #include <linux/sched/deadline.h>
24878  #include <linux/timer.h>
24879 +#include <linux/kthread.h>
24880  #include <linux/freezer.h>
24881  
24882  #include <asm/uaccess.h>
24883  
24884  #include <trace/events/timer.h>
24885 +#include <trace/events/hist.h>
24886  
24887  #include "tick-internal.h"
24888  
24889 @@ -717,6 +719,44 @@ static void clock_was_set_work(struct work_struct *work)
24890  
24891  static DECLARE_WORK(hrtimer_work, clock_was_set_work);
24892  
24893 +#ifdef CONFIG_PREEMPT_RT_FULL
24894 +/*
24895 + * RT cannot call schedule_work() from real interrupt context.
24896 + * Need to make a thread to do the real work.
24897 + */
24898 +static struct task_struct *clock_set_delay_thread;
24899 +static bool do_clock_set_delay;
24900 +
24901 +static int run_clock_set_delay(void *ignore)
24902 +{
24903 +       while (!kthread_should_stop()) {
24904 +               set_current_state(TASK_INTERRUPTIBLE);
24905 +               if (do_clock_set_delay) {
24906 +                       do_clock_set_delay = false;
24907 +                       schedule_work(&hrtimer_work);
24908 +               }
24909 +               schedule();
24910 +       }
24911 +       __set_current_state(TASK_RUNNING);
24912 +       return 0;
24913 +}
24914 +
24915 +void clock_was_set_delayed(void)
24916 +{
24917 +       do_clock_set_delay = true;
24918 +       /* Make visible before waking up process */
24919 +       smp_wmb();
24920 +       wake_up_process(clock_set_delay_thread);
24921 +}
24922 +
24923 +static __init int create_clock_set_delay_thread(void)
24924 +{
24925 +       clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
24926 +       BUG_ON(IS_ERR(clock_set_delay_thread));
24927 +       return 0;
24928 +}
24929 +early_initcall(create_clock_set_delay_thread);
24930 +#else /* PREEMPT_RT_FULL */
24931  /*
24932   * Called from timekeeping and resume code to reprogramm the hrtimer
24933   * Called from timekeeping and resume code to reprogram the hrtimer
24934 @@ -725,6 +765,7 @@ void clock_was_set_delayed(void)
24935  {
24936         schedule_work(&hrtimer_work);
24937  }
24938 +#endif
24939  
24940  #else
24941  
24942 @@ -734,11 +775,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
24943  static inline void hrtimer_switch_to_hres(void) { }
24944  static inline void
24945  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
24946 -static inline int hrtimer_reprogram(struct hrtimer *timer,
24947 -                                   struct hrtimer_clock_base *base)
24948 -{
24949 -       return 0;
24950 -}
24951 +static inline void hrtimer_reprogram(struct hrtimer *timer,
24952 +                                    struct hrtimer_clock_base *base) { }
24953  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
24954  static inline void retrigger_next_event(void *arg) { }
24955  
24956 @@ -870,6 +908,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
24957  }
24958  EXPORT_SYMBOL_GPL(hrtimer_forward);
24959  
24960 +#ifdef CONFIG_PREEMPT_RT_BASE
24961 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
24962 +
24963 +/**
24964 + * hrtimer_wait_for_timer - Wait for a running timer
24965 + *
24966 + * @timer:     timer to wait for
24967 + *
24968 + * The function waits on the timer base's waitqueue in case the
24969 + * timer's callback function is currently executing. The waitqueue
24970 + * is woken up after the timer callback function has finished
24971 + * execution.
24972 + */
24973 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
24974 +{
24975 +       struct hrtimer_clock_base *base = timer->base;
24976 +
24977 +       if (base && base->cpu_base && !timer->irqsafe)
24978 +               wait_event(base->cpu_base->wait,
24979 +                               !(hrtimer_callback_running(timer)));
24980 +}
24981 +
24982 +#else
24983 +# define wake_up_timer_waiters(b)      do { } while (0)
24984 +#endif
24985 +
24986  /*
24987   * enqueue_hrtimer - internal function to (re)start a timer
24988   *
24989 @@ -911,6 +975,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
24990         if (!(state & HRTIMER_STATE_ENQUEUED))
24991                 return;
24992  
24993 +       if (unlikely(!list_empty(&timer->cb_entry))) {
24994 +               list_del_init(&timer->cb_entry);
24995 +               return;
24996 +       }
24997 +
24998         if (!timerqueue_del(&base->active, &timer->node))
24999                 cpu_base->active_bases &= ~(1 << base->index);
25000  
25001 @@ -1006,7 +1075,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25002         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25003  
25004         timer_stats_hrtimer_set_start_info(timer);
25005 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
25006 +       {
25007 +               ktime_t now = new_base->get_time();
25008  
25009 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
25010 +                       timer->praecox = now;
25011 +               else
25012 +                       timer->praecox = ktime_set(0, 0);
25013 +       }
25014 +#endif
25015         leftmost = enqueue_hrtimer(timer, new_base);
25016         if (!leftmost)
25017                 goto unlock;
25018 @@ -1078,7 +1156,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25019  
25020                 if (ret >= 0)
25021                         return ret;
25022 -               cpu_relax();
25023 +               hrtimer_wait_for_timer(timer);
25024         }
25025  }
25026  EXPORT_SYMBOL_GPL(hrtimer_cancel);
25027 @@ -1142,6 +1220,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25028  
25029         base = hrtimer_clockid_to_base(clock_id);
25030         timer->base = &cpu_base->clock_base[base];
25031 +       INIT_LIST_HEAD(&timer->cb_entry);
25032         timerqueue_init(&timer->node);
25033  
25034  #ifdef CONFIG_TIMER_STATS
25035 @@ -1182,6 +1261,7 @@ bool hrtimer_active(const struct hrtimer *timer)
25036                 seq = raw_read_seqcount_begin(&cpu_base->seq);
25037  
25038                 if (timer->state != HRTIMER_STATE_INACTIVE ||
25039 +                   cpu_base->running_soft == timer ||
25040                     cpu_base->running == timer)
25041                         return true;
25042  
25043 @@ -1280,10 +1360,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25044         cpu_base->running = NULL;
25045  }
25046  
25047 +#ifdef CONFIG_PREEMPT_RT_BASE
25048 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
25049 +                                struct hrtimer_clock_base *base)
25050 +{
25051 +       int leftmost;
25052 +
25053 +       if (restart != HRTIMER_NORESTART &&
25054 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
25055 +
25056 +               leftmost = enqueue_hrtimer(timer, base);
25057 +               if (!leftmost)
25058 +                       return;
25059 +#ifdef CONFIG_HIGH_RES_TIMERS
25060 +               if (!hrtimer_is_hres_active(timer)) {
25061 +                       /*
25062 +                        * Kick to reschedule the next tick to handle the new timer
25063 +                        * on dynticks target.
25064 +                        */
25065 +                       if (base->cpu_base->nohz_active)
25066 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
25067 +               } else {
25068 +
25069 +                       hrtimer_reprogram(timer, base);
25070 +               }
25071 +#endif
25072 +       }
25073 +}
25074 +
25075 +/*
25076 + * The changes in mainline which removed the callback modes from
25077 + * hrtimer are not yet working with -rt. The non wakeup_process()
25078 + * based callbacks which involve sleeping locks need to be treated
25079 + * separately.
25080 + */
25081 +static void hrtimer_rt_run_pending(void)
25082 +{
25083 +       enum hrtimer_restart (*fn)(struct hrtimer *);
25084 +       struct hrtimer_cpu_base *cpu_base;
25085 +       struct hrtimer_clock_base *base;
25086 +       struct hrtimer *timer;
25087 +       int index, restart;
25088 +
25089 +       local_irq_disable();
25090 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
25091 +
25092 +       raw_spin_lock(&cpu_base->lock);
25093 +
25094 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
25095 +               base = &cpu_base->clock_base[index];
25096 +
25097 +               while (!list_empty(&base->expired)) {
25098 +                       timer = list_first_entry(&base->expired,
25099 +                                                struct hrtimer, cb_entry);
25100 +
25101 +                       /*
25102 +                        * Same as the above __run_hrtimer function
25103 +                        * except that we run with interrupts enabled.
25104 +                        */
25105 +                       debug_deactivate(timer);
25106 +                       cpu_base->running_soft = timer;
25107 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25108 +
25109 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25110 +                       timer_stats_account_hrtimer(timer);
25111 +                       fn = timer->function;
25112 +
25113 +                       raw_spin_unlock_irq(&cpu_base->lock);
25114 +                       restart = fn(timer);
25115 +                       raw_spin_lock_irq(&cpu_base->lock);
25116 +
25117 +                       hrtimer_rt_reprogram(restart, timer, base);
25118 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25119 +
25120 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
25121 +                       cpu_base->running_soft = NULL;
25122 +               }
25123 +       }
25124 +
25125 +       raw_spin_unlock_irq(&cpu_base->lock);
25126 +
25127 +       wake_up_timer_waiters(cpu_base);
25128 +}
25129 +
25130 +static int hrtimer_rt_defer(struct hrtimer *timer)
25131 +{
25132 +       if (timer->irqsafe)
25133 +               return 0;
25134 +
25135 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
25136 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
25137 +       return 1;
25138 +}
25139 +
25140 +#else
25141 +
25142 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
25143 +
25144 +#endif
25145 +
25146 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
25147 +
25148  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25149  {
25150         struct hrtimer_clock_base *base = cpu_base->clock_base;
25151         unsigned int active = cpu_base->active_bases;
25152 +       int raise = 0;
25153  
25154         for (; active; base++, active >>= 1) {
25155                 struct timerqueue_node *node;
25156 @@ -1299,6 +1481,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25157  
25158                         timer = container_of(node, struct hrtimer, node);
25159  
25160 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
25161 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
25162 +                               timer->praecox : hrtimer_get_expires(timer),
25163 +                               basenow)),
25164 +                           current,
25165 +                           timer->function == hrtimer_wakeup ?
25166 +                           container_of(timer, struct hrtimer_sleeper,
25167 +                               timer)->task : NULL);
25168 +
25169                         /*
25170                          * The immediate goal for using the softexpires is
25171                          * minimizing wakeups, not running timers at the
25172 @@ -1314,9 +1505,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25173                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
25174                                 break;
25175  
25176 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
25177 +                       if (!hrtimer_rt_defer(timer))
25178 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
25179 +                       else
25180 +                               raise = 1;
25181                 }
25182         }
25183 +       if (raise)
25184 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25185  }
25186  
25187  #ifdef CONFIG_HIGH_RES_TIMERS
25188 @@ -1479,16 +1675,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
25189  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25190  {
25191         sl->timer.function = hrtimer_wakeup;
25192 +       sl->timer.irqsafe = 1;
25193         sl->task = task;
25194  }
25195  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25196  
25197 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
25198 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
25199 +                               unsigned long state)
25200  {
25201         hrtimer_init_sleeper(t, current);
25202  
25203         do {
25204 -               set_current_state(TASK_INTERRUPTIBLE);
25205 +               set_current_state(state);
25206                 hrtimer_start_expires(&t->timer, mode);
25207  
25208                 if (likely(t->task))
25209 @@ -1530,7 +1728,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
25210                                 HRTIMER_MODE_ABS);
25211         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25212  
25213 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
25214 +       /* cpu_chill() does not care about restart state. */
25215 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
25216                 goto out;
25217  
25218         rmtp = restart->nanosleep.rmtp;
25219 @@ -1547,8 +1746,10 @@ out:
25220         return ret;
25221  }
25222  
25223 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25224 -                      const enum hrtimer_mode mode, const clockid_t clockid)
25225 +static long
25226 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25227 +                   const enum hrtimer_mode mode, const clockid_t clockid,
25228 +                   unsigned long state)
25229  {
25230         struct restart_block *restart;
25231         struct hrtimer_sleeper t;
25232 @@ -1561,7 +1762,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25233  
25234         hrtimer_init_on_stack(&t.timer, clockid, mode);
25235         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
25236 -       if (do_nanosleep(&t, mode))
25237 +       if (do_nanosleep(&t, mode, state))
25238                 goto out;
25239  
25240         /* Absolute timers do not update the rmtp value and restart: */
25241 @@ -1588,6 +1789,12 @@ out:
25242         return ret;
25243  }
25244  
25245 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25246 +                      const enum hrtimer_mode mode, const clockid_t clockid)
25247 +{
25248 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
25249 +}
25250 +
25251  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25252                 struct timespec __user *, rmtp)
25253  {
25254 @@ -1602,6 +1809,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25255         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
25256  }
25257  
25258 +#ifdef CONFIG_PREEMPT_RT_FULL
25259 +/*
25260 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25261 + */
25262 +void cpu_chill(void)
25263 +{
25264 +       struct timespec tu = {
25265 +               .tv_nsec = NSEC_PER_MSEC,
25266 +       };
25267 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25268 +
25269 +       current->flags |= PF_NOFREEZE;
25270 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
25271 +                           TASK_UNINTERRUPTIBLE);
25272 +       if (!freeze_flag)
25273 +               current->flags &= ~PF_NOFREEZE;
25274 +}
25275 +EXPORT_SYMBOL(cpu_chill);
25276 +#endif
25277 +
25278  /*
25279   * Functions related to boot-time initialization:
25280   */
25281 @@ -1613,10 +1840,14 @@ static void init_hrtimers_cpu(int cpu)
25282         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
25283                 cpu_base->clock_base[i].cpu_base = cpu_base;
25284                 timerqueue_init_head(&cpu_base->clock_base[i].active);
25285 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
25286         }
25287  
25288         cpu_base->cpu = cpu;
25289         hrtimer_init_hres(cpu_base);
25290 +#ifdef CONFIG_PREEMPT_RT_BASE
25291 +       init_waitqueue_head(&cpu_base->wait);
25292 +#endif
25293  }
25294  
25295  #ifdef CONFIG_HOTPLUG_CPU
25296 @@ -1714,11 +1945,21 @@ static struct notifier_block hrtimers_nb = {
25297         .notifier_call = hrtimer_cpu_notify,
25298  };
25299  
25300 +#ifdef CONFIG_PREEMPT_RT_BASE
25301 +static void run_hrtimer_softirq(struct softirq_action *h)
25302 +{
25303 +       hrtimer_rt_run_pending();
25304 +}
25305 +#endif
25306 +
25307  void __init hrtimers_init(void)
25308  {
25309         hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
25310                           (void *)(long)smp_processor_id());
25311         register_cpu_notifier(&hrtimers_nb);
25312 +#ifdef CONFIG_PREEMPT_RT_BASE
25313 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
25314 +#endif
25315  }
25316  
25317  /**
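
cpu_chill() above is the RT replacement for busy-wait retry loops: on PREEMPT_RT_FULL, spinning with cpu_relax() can livelock if the task holding the contended resource has been preempted, so the loop instead sleeps for about a millisecond, uninterruptibly and with PF_NOFREEZE set. A minimal sketch of a caller; try_grab_resource() is a hypothetical helper, and the cpu_relax() fallback used on non-RT kernels is assumed rather than shown in this hunk:

    /* Illustrative only: try_grab_resource() is hypothetical. */
    static void wait_for_resource(void)
    {
            while (!try_grab_resource()) {
                    /*
                     * Busy-waiting here could spin forever against a
                     * preempted owner on PREEMPT_RT_FULL, so back off by
                     * sleeping roughly 1 ms instead.
                     */
                    cpu_chill();
            }
    }
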
25318 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
25319 index 1d5c7204ddc9..184de6751180 100644
25320 --- a/kernel/time/itimer.c
25321 +++ b/kernel/time/itimer.c
25322 @@ -213,6 +213,7 @@ again:
25323                 /* We are sharing ->siglock with it_real_fn() */
25324                 if (hrtimer_try_to_cancel(timer) < 0) {
25325                         spin_unlock_irq(&tsk->sighand->siglock);
25326 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
25327                         goto again;
25328                 }
25329                 expires = timeval_to_ktime(value->it_value);
25330 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
25331 index 347fecf86a3f..2ede47408a3e 100644
25332 --- a/kernel/time/jiffies.c
25333 +++ b/kernel/time/jiffies.c
25334 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
25335         .max_cycles     = 10,
25336  };
25337  
25338 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25339 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25340 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25341  
25342  #if (BITS_PER_LONG < 64)
25343  u64 get_jiffies_64(void)
25344 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
25345         u64 ret;
25346  
25347         do {
25348 -               seq = read_seqbegin(&jiffies_lock);
25349 +               seq = read_seqcount_begin(&jiffies_seq);
25350                 ret = jiffies_64;
25351 -       } while (read_seqretry(&jiffies_lock, seq));
25352 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25353         return ret;
25354  }
25355  EXPORT_SYMBOL(get_jiffies_64);
25356 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
25357 index ab861771e37f..0f6868fd2de6 100644
25358 --- a/kernel/time/ntp.c
25359 +++ b/kernel/time/ntp.c
25360 @@ -10,6 +10,7 @@
25361  #include <linux/workqueue.h>
25362  #include <linux/hrtimer.h>
25363  #include <linux/jiffies.h>
25364 +#include <linux/kthread.h>
25365  #include <linux/math64.h>
25366  #include <linux/timex.h>
25367  #include <linux/time.h>
25368 @@ -562,10 +563,52 @@ static void sync_cmos_clock(struct work_struct *work)
25369                            &sync_cmos_work, timespec64_to_jiffies(&next));
25370  }
25371  
25372 +#ifdef CONFIG_PREEMPT_RT_FULL
25373 +/*
25374 + * RT cannot call schedule_delayed_work() from hard interrupt context,
25375 + * so a kthread is needed to do the real work.
25376 + */
25377 +static struct task_struct *cmos_delay_thread;
25378 +static bool do_cmos_delay;
25379 +
25380 +static int run_cmos_delay(void *ignore)
25381 +{
25382 +       while (!kthread_should_stop()) {
25383 +               set_current_state(TASK_INTERRUPTIBLE);
25384 +               if (do_cmos_delay) {
25385 +                       do_cmos_delay = false;
25386 +                       queue_delayed_work(system_power_efficient_wq,
25387 +                                          &sync_cmos_work, 0);
25388 +               }
25389 +               schedule();
25390 +       }
25391 +       __set_current_state(TASK_RUNNING);
25392 +       return 0;
25393 +}
25394 +
25395 +void ntp_notify_cmos_timer(void)
25396 +{
25397 +       do_cmos_delay = true;
25398 +       /* Make visible before waking up process */
25399 +       smp_wmb();
25400 +       wake_up_process(cmos_delay_thread);
25401 +}
25402 +
25403 +static __init int create_cmos_delay_thread(void)
25404 +{
25405 +       cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
25406 +       BUG_ON(!cmos_delay_thread);
25407 +       return 0;
25408 +}
25409 +early_initcall(create_cmos_delay_thread);
25410 +
25411 +#else
25412 +
25413  void ntp_notify_cmos_timer(void)
25414  {
25415         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
25416  }
25417 +#endif /* CONFIG_PREEMPT_RT_FULL */
25418  
25419  #else
25420  void ntp_notify_cmos_timer(void) { }
25421 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
25422 index 80016b329d94..b7342b6e6a5a 100644
25423 --- a/kernel/time/posix-cpu-timers.c
25424 +++ b/kernel/time/posix-cpu-timers.c
25425 @@ -3,6 +3,7 @@
25426   */
25427  
25428  #include <linux/sched.h>
25429 +#include <linux/sched/rt.h>
25430  #include <linux/posix-timers.h>
25431  #include <linux/errno.h>
25432  #include <linux/math64.h>
25433 @@ -650,7 +651,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
25434         /*
25435          * Disarm any old timer after extracting its expiry time.
25436          */
25437 -       WARN_ON_ONCE(!irqs_disabled());
25438 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25439  
25440         ret = 0;
25441         old_incr = timer->it.cpu.incr;
25442 @@ -1092,7 +1093,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
25443         /*
25444          * Now re-arm for the new expiry time.
25445          */
25446 -       WARN_ON_ONCE(!irqs_disabled());
25447 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25448         arm_timer(timer);
25449         unlock_task_sighand(p, &flags);
25450  
25451 @@ -1183,13 +1184,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
25452   * already updated our counts.  We need to check if any timers fire now.
25453   * Interrupts are disabled.
25454   */
25455 -void run_posix_cpu_timers(struct task_struct *tsk)
25456 +static void __run_posix_cpu_timers(struct task_struct *tsk)
25457  {
25458         LIST_HEAD(firing);
25459         struct k_itimer *timer, *next;
25460         unsigned long flags;
25461  
25462 -       WARN_ON_ONCE(!irqs_disabled());
25463 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25464  
25465         /*
25466          * The fast path checks that there are no expired thread or thread
25467 @@ -1243,6 +1244,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
25468         }
25469  }
25470  
25471 +#ifdef CONFIG_PREEMPT_RT_BASE
25472 +#include <linux/kthread.h>
25473 +#include <linux/cpu.h>
25474 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25475 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25476 +
25477 +static int posix_cpu_timers_thread(void *data)
25478 +{
25479 +       int cpu = (long)data;
25480 +
25481 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
25482 +
25483 +       while (!kthread_should_stop()) {
25484 +               struct task_struct *tsk = NULL;
25485 +               struct task_struct *next = NULL;
25486 +
25487 +               if (cpu_is_offline(cpu))
25488 +                       goto wait_to_die;
25489 +
25490 +               /* grab task list */
25491 +               raw_local_irq_disable();
25492 +               tsk = per_cpu(posix_timer_tasklist, cpu);
25493 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25494 +               raw_local_irq_enable();
25495 +
25496 +               /* the list may be empty; just sleep until woken again */
25497 +               if (!tsk) {
25498 +                       set_current_state(TASK_INTERRUPTIBLE);
25499 +                       schedule();
25500 +                       __set_current_state(TASK_RUNNING);
25501 +                       continue;
25502 +               }
25503 +
25504 +               /* Process task list */
25505 +               while (1) {
25506 +                       /* save next */
25507 +                       next = tsk->posix_timer_list;
25508 +
25509 +                       /* run the task timers, clear its ptr and
25510 +                        * unreference it
25511 +                        */
25512 +                       __run_posix_cpu_timers(tsk);
25513 +                       tsk->posix_timer_list = NULL;
25514 +                       put_task_struct(tsk);
25515 +
25516 +                       /* check if this is the last on the list */
25517 +                       if (next == tsk)
25518 +                               break;
25519 +                       tsk = next;
25520 +               }
25521 +       }
25522 +       return 0;
25523 +
25524 +wait_to_die:
25525 +       /* Wait for kthread_stop */
25526 +       set_current_state(TASK_INTERRUPTIBLE);
25527 +       while (!kthread_should_stop()) {
25528 +               schedule();
25529 +               set_current_state(TASK_INTERRUPTIBLE);
25530 +       }
25531 +       __set_current_state(TASK_RUNNING);
25532 +       return 0;
25533 +}
25534 +
25535 +static inline int __fastpath_timer_check(struct task_struct *tsk)
25536 +{
25537 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
25538 +       if (unlikely(tsk->exit_state))
25539 +               return 0;
25540 +
25541 +       if (!task_cputime_zero(&tsk->cputime_expires))
25542 +                       return 1;
25543 +
25544 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
25545 +                       return 1;
25546 +
25547 +       return 0;
25548 +}
25549 +
25550 +void run_posix_cpu_timers(struct task_struct *tsk)
25551 +{
25552 +       unsigned long cpu = smp_processor_id();
25553 +       struct task_struct *tasklist;
25554 +
25555 +       BUG_ON(!irqs_disabled());
25556 +       if (!per_cpu(posix_timer_task, cpu))
25557 +               return;
25558 +       /* get per-cpu references */
25559 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
25560 +
25561 +       /* check to see if we're already queued */
25562 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25563 +               get_task_struct(tsk);
25564 +               if (tasklist) {
25565 +                       tsk->posix_timer_list = tasklist;
25566 +               } else {
25567 +                       /*
25568 +                        * The list is terminated by a self-pointing
25569 +                        * task_struct
25570 +                        */
25571 +                       tsk->posix_timer_list = tsk;
25572 +               }
25573 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
25574 +
25575 +               wake_up_process(per_cpu(posix_timer_task, cpu));
25576 +       }
25577 +}
25578 +
25579 +/*
25580 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
25581 + * Here we can start up the necessary posix timer thread for the new CPU.
25582 + */
25583 +static int posix_cpu_thread_call(struct notifier_block *nfb,
25584 +                                unsigned long action, void *hcpu)
25585 +{
25586 +       int cpu = (long)hcpu;
25587 +       struct task_struct *p;
25588 +       struct sched_param param;
25589 +
25590 +       switch (action) {
25591 +       case CPU_UP_PREPARE:
25592 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
25593 +                                       "posixcputmr/%d",cpu);
25594 +               if (IS_ERR(p))
25595 +                       return NOTIFY_BAD;
25596 +               p->flags |= PF_NOFREEZE;
25597 +               kthread_bind(p, cpu);
25598 +               /* Must be high prio to avoid getting starved */
25599 +               param.sched_priority = MAX_RT_PRIO-1;
25600 +               sched_setscheduler(p, SCHED_FIFO, &param);
25601 +               per_cpu(posix_timer_task,cpu) = p;
25602 +               break;
25603 +       case CPU_ONLINE:
25604 +               /* Strictly unnecessary, as the first user will wake it. */
25605 +               wake_up_process(per_cpu(posix_timer_task,cpu));
25606 +               break;
25607 +#ifdef CONFIG_HOTPLUG_CPU
25608 +       case CPU_UP_CANCELED:
25609 +               /* Unbind it from the offline CPU so it can run.  Fall through. */
25610 +               kthread_bind(per_cpu(posix_timer_task, cpu),
25611 +                            cpumask_any(cpu_online_mask));
25612 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25613 +               per_cpu(posix_timer_task,cpu) = NULL;
25614 +               break;
25615 +       case CPU_DEAD:
25616 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25617 +               per_cpu(posix_timer_task,cpu) = NULL;
25618 +               break;
25619 +#endif
25620 +       }
25621 +       return NOTIFY_OK;
25622 +}
25623 +
25624 +/* Register at highest priority so that task migration (migrate_all_tasks)
25625 + * happens before everything else.
25626 + */
25627 +static struct notifier_block posix_cpu_thread_notifier = {
25628 +       .notifier_call = posix_cpu_thread_call,
25629 +       .priority = 10
25630 +};
25631 +
25632 +static int __init posix_cpu_thread_init(void)
25633 +{
25634 +       void *hcpu = (void *)(long)smp_processor_id();
25635 +       /* Start one for boot CPU. */
25636 +       unsigned long cpu;
25637 +
25638 +       /* init the per-cpu posix_timer_tasklist pointers */
25639 +       for_each_possible_cpu(cpu)
25640 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25641 +
25642 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
25643 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
25644 +       register_cpu_notifier(&posix_cpu_thread_notifier);
25645 +       return 0;
25646 +}
25647 +early_initcall(posix_cpu_thread_init);
25648 +#else /* CONFIG_PREEMPT_RT_BASE */
25649 +void run_posix_cpu_timers(struct task_struct *tsk)
25650 +{
25651 +       __run_posix_cpu_timers(tsk);
25652 +}
25653 +#endif /* CONFIG_PREEMPT_RT_BASE */
25654 +
25655  /*
25656   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25657   * The tsk->sighand->siglock must be held by the caller.
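
The per-CPU queue used above is a deliberately simple data structure: a singly linked chain through tsk->posix_timer_list with the newest task at the head, terminated by a task that points to itself. NULL cannot serve as the terminator, because run_posix_cpu_timers() uses a NULL posix_timer_list to mean "not queued yet". A worked illustration, with hypothetical tasks A and B queued on one CPU:

    /*
     * Illustration only (tasks A and B are hypothetical):
     *
     *   nothing queued:  posix_timer_tasklist == NULL
     *   A queued first:  posix_timer_tasklist == A,
     *                    A->posix_timer_list == A   (self-pointer marks the end)
     *   B queued next:   posix_timer_tasklist == B,
     *                    B->posix_timer_list == A,
     *                    A->posix_timer_list == A
     *
     * posix_cpu_timers_thread() detaches the whole chain with interrupts
     * disabled and walks it until next == tsk, dropping for each entry the
     * reference that run_posix_cpu_timers() took with get_task_struct().
     */
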
25658 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
25659 index f2826c35e918..464a98155a0e 100644
25660 --- a/kernel/time/posix-timers.c
25661 +++ b/kernel/time/posix-timers.c
25662 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
25663  static struct pid *good_sigevent(sigevent_t * event)
25664  {
25665         struct task_struct *rtn = current->group_leader;
25666 +       int sig = event->sigev_signo;
25667  
25668         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
25669                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
25670 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
25671                 return NULL;
25672  
25673         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
25674 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
25675 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
25676 +            sig_kernel_coredump(sig)))
25677                 return NULL;
25678  
25679         return task_pid(rtn);
25680 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
25681         return overrun;
25682  }
25683  
25684 +/*
25685 + * Protected by RCU!
25686 + */
25687 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
25688 +{
25689 +#ifdef CONFIG_PREEMPT_RT_FULL
25690 +       if (kc->timer_set == common_timer_set)
25691 +               hrtimer_wait_for_timer(&timr->it.real.timer);
25692 +       else
25693 +               /* FIXME: Whacky hack for posix-cpu-timers */
25694 +               schedule_timeout(1);
25695 +#endif
25696 +}
25697 +
25698  /* Set a POSIX.1b interval timer. */
25699  /* timr->it_lock is taken. */
25700  static int
25701 @@ -903,6 +919,7 @@ retry:
25702         if (!timr)
25703                 return -EINVAL;
25704  
25705 +       rcu_read_lock();
25706         kc = clockid_to_kclock(timr->it_clock);
25707         if (WARN_ON_ONCE(!kc || !kc->timer_set))
25708                 error = -EINVAL;
25709 @@ -911,9 +928,12 @@ retry:
25710  
25711         unlock_timer(timr, flag);
25712         if (error == TIMER_RETRY) {
25713 +               timer_wait_for_callback(kc, timr);
25714                 rtn = NULL;     // We already got the old time...
25715 +               rcu_read_unlock();
25716                 goto retry;
25717         }
25718 +       rcu_read_unlock();
25719  
25720         if (old_setting && !error &&
25721             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
25722 @@ -951,10 +971,15 @@ retry_delete:
25723         if (!timer)
25724                 return -EINVAL;
25725  
25726 +       rcu_read_lock();
25727         if (timer_delete_hook(timer) == TIMER_RETRY) {
25728                 unlock_timer(timer, flags);
25729 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25730 +                                       timer);
25731 +               rcu_read_unlock();
25732                 goto retry_delete;
25733         }
25734 +       rcu_read_unlock();
25735  
25736         spin_lock(&current->sighand->siglock);
25737         list_del(&timer->list);
25738 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
25739  retry_delete:
25740         spin_lock_irqsave(&timer->it_lock, flags);
25741  
25742 +       /* On RT we can race with a deletion */
25743 +       if (!timer->it_signal) {
25744 +               unlock_timer(timer, flags);
25745 +               return;
25746 +       }
25747 +
25748         if (timer_delete_hook(timer) == TIMER_RETRY) {
25749 +               rcu_read_lock();
25750                 unlock_timer(timer, flags);
25751 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25752 +                                       timer);
25753 +               rcu_read_unlock();
25754                 goto retry_delete;
25755         }
25756         list_del(&timer->list);
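
The good_sigevent() hunk above tightens validation: besides the old range check, a timer can no longer be created with a signal the process could never handle, such as SIGKILL/SIGSTOP or a core-dumping signal, so timer_create() now fails with EINVAL for those. A small userspace sketch of the still-accepted case; this is the plain POSIX timer API, not code from the patch, and older glibc needs -lrt:

    /* Illustrative: create a one-shot timer delivering SIGRTMIN. */
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <unistd.h>

    static void handler(int sig) { (void)sig; }

    int main(void)
    {
            struct sigevent sev = { 0 };
            struct itimerspec its = { 0 };
            timer_t tid;

            signal(SIGRTMIN, handler);

            sev.sigev_notify = SIGEV_SIGNAL;
            sev.sigev_signo  = SIGRTMIN;    /* SIGKILL here would now get EINVAL */
            if (timer_create(CLOCK_MONOTONIC, &sev, &tid)) {
                    perror("timer_create");
                    return EXIT_FAILURE;
            }

            its.it_value.tv_sec = 1;        /* fire once, 1 s from now */
            if (timer_settime(tid, 0, &its, NULL)) {
                    perror("timer_settime");
                    return EXIT_FAILURE;
            }

            pause();                        /* wait for the signal */
            puts("timer fired");
            return EXIT_SUCCESS;
    }
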
25757 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
25758 index 53d7184da0be..1b4ac3361c3f 100644
25759 --- a/kernel/time/tick-broadcast-hrtimer.c
25760 +++ b/kernel/time/tick-broadcast-hrtimer.c
25761 @@ -106,5 +106,6 @@ void tick_setup_hrtimer_broadcast(void)
25762  {
25763         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25764         bctimer.function = bc_handler;
25765 +       bctimer.irqsafe = true;
25766         clockevents_register_device(&ce_broadcast_hrtimer);
25767  }
25768 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
25769 index 4fcd99e12aa0..5a47f2e98faf 100644
25770 --- a/kernel/time/tick-common.c
25771 +++ b/kernel/time/tick-common.c
25772 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
25773  static void tick_periodic(int cpu)
25774  {
25775         if (tick_do_timer_cpu == cpu) {
25776 -               write_seqlock(&jiffies_lock);
25777 +               raw_spin_lock(&jiffies_lock);
25778 +               write_seqcount_begin(&jiffies_seq);
25779  
25780                 /* Keep track of the next tick event */
25781                 tick_next_period = ktime_add(tick_next_period, tick_period);
25782  
25783                 do_timer(1);
25784 -               write_sequnlock(&jiffies_lock);
25785 +               write_seqcount_end(&jiffies_seq);
25786 +               raw_spin_unlock(&jiffies_lock);
25787                 update_wall_time();
25788         }
25789  
25790 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
25791                 ktime_t next;
25792  
25793                 do {
25794 -                       seq = read_seqbegin(&jiffies_lock);
25795 +                       seq = read_seqcount_begin(&jiffies_seq);
25796                         next = tick_next_period;
25797 -               } while (read_seqretry(&jiffies_lock, seq));
25798 +               } while (read_seqcount_retry(&jiffies_seq, seq));
25799  
25800                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
25801  
25802 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
25803 index 22c57e191a23..d536824cbd36 100644
25804 --- a/kernel/time/tick-sched.c
25805 +++ b/kernel/time/tick-sched.c
25806 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
25807                 return;
25808  
25809         /* Reevalute with jiffies_lock held */
25810 -       write_seqlock(&jiffies_lock);
25811 +       raw_spin_lock(&jiffies_lock);
25812 +       write_seqcount_begin(&jiffies_seq);
25813  
25814         delta = ktime_sub(now, last_jiffies_update);
25815         if (delta.tv64 >= tick_period.tv64) {
25816 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
25817                 /* Keep the tick_next_period variable up to date */
25818                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
25819         } else {
25820 -               write_sequnlock(&jiffies_lock);
25821 +               write_seqcount_end(&jiffies_seq);
25822 +               raw_spin_unlock(&jiffies_lock);
25823                 return;
25824         }
25825 -       write_sequnlock(&jiffies_lock);
25826 +       write_seqcount_end(&jiffies_seq);
25827 +       raw_spin_unlock(&jiffies_lock);
25828         update_wall_time();
25829  }
25830  
25831 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
25832  {
25833         ktime_t period;
25834  
25835 -       write_seqlock(&jiffies_lock);
25836 +       raw_spin_lock(&jiffies_lock);
25837 +       write_seqcount_begin(&jiffies_seq);
25838         /* Did we start the jiffies update yet ? */
25839         if (last_jiffies_update.tv64 == 0)
25840                 last_jiffies_update = tick_next_period;
25841         period = last_jiffies_update;
25842 -       write_sequnlock(&jiffies_lock);
25843 +       write_seqcount_end(&jiffies_seq);
25844 +       raw_spin_unlock(&jiffies_lock);
25845         return period;
25846  }
25847  
25848 @@ -176,6 +181,11 @@ static bool can_stop_full_tick(void)
25849                 return false;
25850         }
25851  
25852 +       if (!arch_irq_work_has_interrupt()) {
25853 +               trace_tick_stop(0, "missing irq work interrupt\n");
25854 +               return false;
25855 +       }
25856 +
25857         /* sched_clock_tick() needs us? */
25858  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
25859         /*
25860 @@ -204,6 +214,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
25861  
25862  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
25863         .func = nohz_full_kick_work_func,
25864 +       .flags = IRQ_WORK_HARD_IRQ,
25865  };
25866  
25867  /*
25868 @@ -578,10 +589,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
25869  
25870         /* Read jiffies and the time when jiffies were updated last */
25871         do {
25872 -               seq = read_seqbegin(&jiffies_lock);
25873 +               seq = read_seqcount_begin(&jiffies_seq);
25874                 basemono = last_jiffies_update.tv64;
25875                 basejiff = jiffies;
25876 -       } while (read_seqretry(&jiffies_lock, seq));
25877 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25878         ts->last_jiffies = basejiff;
25879  
25880         if (rcu_needs_cpu(basemono, &next_rcu) ||
25881 @@ -753,14 +764,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
25882                 return false;
25883  
25884         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
25885 -               static int ratelimit;
25886 -
25887 -               if (ratelimit < 10 &&
25888 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
25889 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
25890 -                               (unsigned int) local_softirq_pending());
25891 -                       ratelimit++;
25892 -               }
25893 +               softirq_check_pending_idle();
25894                 return false;
25895         }
25896  
25897 @@ -1100,6 +1104,7 @@ void tick_setup_sched_timer(void)
25898          * Emulate tick processing via per-CPU hrtimers:
25899          */
25900         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25901 +       ts->sched_timer.irqsafe = 1;
25902         ts->sched_timer.function = tick_sched_timer;
25903  
25904         /* Get the next period (per cpu) */
25905 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
25906 index 4ff237dbc006..ecdc1d8c1ed9 100644
25907 --- a/kernel/time/timekeeping.c
25908 +++ b/kernel/time/timekeeping.c
25909 @@ -2050,8 +2050,10 @@ EXPORT_SYMBOL(hardpps);
25910   */
25911  void xtime_update(unsigned long ticks)
25912  {
25913 -       write_seqlock(&jiffies_lock);
25914 +       raw_spin_lock(&jiffies_lock);
25915 +       write_seqcount_begin(&jiffies_seq);
25916         do_timer(ticks);
25917 -       write_sequnlock(&jiffies_lock);
25918 +       write_seqcount_end(&jiffies_seq);
25919 +       raw_spin_unlock(&jiffies_lock);
25920         update_wall_time();
25921  }
25922 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
25923 index 704f595ce83f..763a3e5121ff 100644
25924 --- a/kernel/time/timekeeping.h
25925 +++ b/kernel/time/timekeeping.h
25926 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
25927  extern void do_timer(unsigned long ticks);
25928  extern void update_wall_time(void);
25929  
25930 -extern seqlock_t jiffies_lock;
25931 +extern raw_spinlock_t jiffies_lock;
25932 +extern seqcount_t jiffies_seq;
25933  
25934  #define CS_NAME_LEN    32
25935  
25936 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
25937 index bbc5d1114583..603699ff9411 100644
25938 --- a/kernel/time/timer.c
25939 +++ b/kernel/time/timer.c
25940 @@ -80,6 +80,9 @@ struct tvec_root {
25941  struct tvec_base {
25942         spinlock_t lock;
25943         struct timer_list *running_timer;
25944 +#ifdef CONFIG_PREEMPT_RT_FULL
25945 +       wait_queue_head_t wait_for_running_timer;
25946 +#endif
25947         unsigned long timer_jiffies;
25948         unsigned long next_timer;
25949         unsigned long active_timers;
25950 @@ -777,6 +780,39 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
25951                 cpu_relax();
25952         }
25953  }
25954 +#ifdef CONFIG_PREEMPT_RT_FULL
25955 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
25956 +                                                 struct tvec_base *old,
25957 +                                                 struct tvec_base *new)
25958 +{
25959 +       /*
25960 +        * We cannot mark the timer TIMER_MIGRATING as the !RT variant below
25961 +        * does: if we were preempted, lock_timer_base() could then spin forever.
25962 +        */
25963 +       if (spin_trylock(&new->lock)) {
25964 +               WRITE_ONCE(timer->flags,
25965 +                          (timer->flags & ~TIMER_BASEMASK) | new->cpu);
25966 +               spin_unlock(&old->lock);
25967 +               return new;
25968 +       }
25969 +       return old;
25970 +}
25971 +
25972 +#else
25973 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
25974 +                                                 struct tvec_base *old,
25975 +                                                 struct tvec_base *new)
25976 +{
25977 +       /* See the comment in lock_timer_base() */
25978 +       timer->flags |= TIMER_MIGRATING;
25979 +
25980 +       spin_unlock(&old->lock);
25981 +       spin_lock(&new->lock);
25982 +       WRITE_ONCE(timer->flags,
25983 +                  (timer->flags & ~TIMER_BASEMASK) | new->cpu);
25984 +       return new;
25985 +}
25986 +#endif
25987  
25988  static inline int
25989  __mod_timer(struct timer_list *timer, unsigned long expires,
25990 @@ -807,16 +843,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
25991                  * handler yet has not finished. This also guarantees that
25992                  * the timer is serialized wrt itself.
25993                  */
25994 -               if (likely(base->running_timer != timer)) {
25995 -                       /* See the comment in lock_timer_base() */
25996 -                       timer->flags |= TIMER_MIGRATING;
25997 -
25998 -                       spin_unlock(&base->lock);
25999 -                       base = new_base;
26000 -                       spin_lock(&base->lock);
26001 -                       WRITE_ONCE(timer->flags,
26002 -                                  (timer->flags & ~TIMER_BASEMASK) | base->cpu);
26003 -               }
26004 +               if (likely(base->running_timer != timer))
26005 +                       base = switch_timer_base(timer, base, new_base);
26006         }
26007  
26008         timer->expires = expires;
26009 @@ -1006,6 +1034,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
26010  }
26011  EXPORT_SYMBOL_GPL(add_timer_on);
26012  
26013 +#ifdef CONFIG_PREEMPT_RT_FULL
26014 +/*
26015 + * Wait for a running timer
26016 + */
26017 +static void wait_for_running_timer(struct timer_list *timer)
26018 +{
26019 +       struct tvec_base *base;
26020 +       u32 tf = timer->flags;
26021 +
26022 +       if (tf & TIMER_MIGRATING)
26023 +               return;
26024 +
26025 +       base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
26026 +       wait_event(base->wait_for_running_timer,
26027 +                  base->running_timer != timer);
26028 +}
26029 +
26030 +# define wakeup_timer_waiters(b)       wake_up_all(&(b)->wait_for_running_timer)
26031 +#else
26032 +static inline void wait_for_running_timer(struct timer_list *timer)
26033 +{
26034 +       cpu_relax();
26035 +}
26036 +
26037 +# define wakeup_timer_waiters(b)       do { } while (0)
26038 +#endif
26039 +
26040  /**
26041   * del_timer - deactive a timer.
26042   * @timer: the timer to be deactivated
26043 @@ -1063,7 +1118,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
26044  }
26045  EXPORT_SYMBOL(try_to_del_timer_sync);
26046  
26047 -#ifdef CONFIG_SMP
26048 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26049  /**
26050   * del_timer_sync - deactivate a timer and wait for the handler to finish.
26051   * @timer: the timer to be deactivated
26052 @@ -1123,7 +1178,7 @@ int del_timer_sync(struct timer_list *timer)
26053                 int ret = try_to_del_timer_sync(timer);
26054                 if (ret >= 0)
26055                         return ret;
26056 -               cpu_relax();
26057 +               wait_for_running_timer(timer);
26058         }
26059  }
26060  EXPORT_SYMBOL(del_timer_sync);
26061 @@ -1248,16 +1303,18 @@ static inline void __run_timers(struct tvec_base *base)
26062                         if (irqsafe) {
26063                                 spin_unlock(&base->lock);
26064                                 call_timer_fn(timer, fn, data);
26065 +                               base->running_timer = NULL;
26066                                 spin_lock(&base->lock);
26067                         } else {
26068                                 spin_unlock_irq(&base->lock);
26069                                 call_timer_fn(timer, fn, data);
26070 +                               base->running_timer = NULL;
26071                                 spin_lock_irq(&base->lock);
26072                         }
26073                 }
26074         }
26075 -       base->running_timer = NULL;
26076         spin_unlock_irq(&base->lock);
26077 +       wakeup_timer_waiters(base);
26078  }
26079  
26080  #ifdef CONFIG_NO_HZ_COMMON
26081 @@ -1390,6 +1447,14 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
26082         if (cpu_is_offline(smp_processor_id()))
26083                 return expires;
26084  
26085 +#ifdef CONFIG_PREEMPT_RT_FULL
26086 +       /*
26087 +        * On PREEMPT_RT we cannot sleep here. As a result we can't take
26088 +        * the base lock to check when the next timer is pending and so
26089 +        * we assume the next jiffy.
26090 +        */
26091 +       return basem + TICK_NSEC;
26092 +#endif
26093         spin_lock(&base->lock);
26094         if (base->active_timers) {
26095                 if (time_before_eq(base->next_timer, base->timer_jiffies))
26096 @@ -1416,13 +1481,13 @@ void update_process_times(int user_tick)
26097  
26098         /* Note: this timer irq context must be accounted for as well. */
26099         account_process_tick(p, user_tick);
26100 +       scheduler_tick();
26101         run_local_timers();
26102         rcu_check_callbacks(user_tick);
26103 -#ifdef CONFIG_IRQ_WORK
26104 +#if defined(CONFIG_IRQ_WORK)
26105         if (in_irq())
26106                 irq_work_tick();
26107  #endif
26108 -       scheduler_tick();
26109         run_posix_cpu_timers(p);
26110  }
26111  
26112 @@ -1433,6 +1498,8 @@ static void run_timer_softirq(struct softirq_action *h)
26113  {
26114         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
26115  
26116 +       irq_work_tick_soft();
26117 +
26118         if (time_after_eq(jiffies, base->timer_jiffies))
26119                 __run_timers(base);
26120  }
26121 @@ -1589,7 +1656,7 @@ static void migrate_timers(int cpu)
26122  
26123         BUG_ON(cpu_online(cpu));
26124         old_base = per_cpu_ptr(&tvec_bases, cpu);
26125 -       new_base = get_cpu_ptr(&tvec_bases);
26126 +       new_base = get_local_ptr(&tvec_bases);
26127         /*
26128          * The caller is globally serialized and nobody else
26129          * takes two locks at once, deadlock is not possible.
26130 @@ -1613,7 +1680,7 @@ static void migrate_timers(int cpu)
26131  
26132         spin_unlock(&old_base->lock);
26133         spin_unlock_irq(&new_base->lock);
26134 -       put_cpu_ptr(&tvec_bases);
26135 +       put_local_ptr(&tvec_bases);
26136  }
26137  
26138  static int timer_cpu_notify(struct notifier_block *self,
26139 @@ -1645,6 +1712,9 @@ static void __init init_timer_cpu(int cpu)
26140  
26141         base->cpu = cpu;
26142         spin_lock_init(&base->lock);
26143 +#ifdef CONFIG_PREEMPT_RT_FULL
26144 +       init_waitqueue_head(&base->wait_for_running_timer);
26145 +#endif
26146  
26147         base->timer_jiffies = jiffies;
26148         base->next_timer = base->timer_jiffies;
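
A practical consequence of the timer.c changes: on PREEMPT_RT_FULL, del_timer_sync() no longer spins with cpu_relax() while a callback is running; it sleeps on the base's wait_for_running_timer waitqueue until __run_timers() clears running_timer and wakes the waiters, so the caller must be in a context that may sleep. A minimal sketch of a well-behaved caller; my_timer and my_driver_shutdown() are hypothetical:

    /* Illustrative only; my_timer is assumed to be set up elsewhere. */
    #include <linux/timer.h>

    static struct timer_list my_timer;

    static void my_driver_shutdown(void)
    {
            /*
             * Process context, and no locks held that my_timer's callback
             * could also take: on PREEMPT_RT_FULL this call may sleep on
             * base->wait_for_running_timer instead of busy-waiting.
             */
            del_timer_sync(&my_timer);
    }
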
26149 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
26150 index e45db6b0d878..364ccd0eb57b 100644
26151 --- a/kernel/trace/Kconfig
26152 +++ b/kernel/trace/Kconfig
26153 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
26154           enabled. This option and the preempt-off timing option can be
26155           used together or separately.)
26156  
26157 +config INTERRUPT_OFF_HIST
26158 +       bool "Interrupts-off Latency Histogram"
26159 +       depends on IRQSOFF_TRACER
26160 +       help
26161 +         This option generates continuously updated histograms (one per cpu)
26162 +         of the duration of time periods with interrupts disabled. The
26163 +         histograms are disabled by default. To enable them, write a non-zero
26164 +         number to
26165 +
26166 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26167 +
26168 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
26169 +         per cpu) are generated that accumulate the duration of time periods
26170 +         when both interrupts and preemption are disabled. The histogram data
26171 +         will be located in the debug file system at
26172 +
26173 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
26174 +
26175  config PREEMPT_TRACER
26176         bool "Preemption-off Latency Tracer"
26177         default n
26178 @@ -211,6 +229,24 @@ config PREEMPT_TRACER
26179           enabled. This option and the irqs-off timing option can be
26180           used together or separately.)
26181  
26182 +config PREEMPT_OFF_HIST
26183 +       bool "Preemption-off Latency Histogram"
26184 +       depends on PREEMPT_TRACER
26185 +       help
26186 +         This option generates continuously updated histograms (one per cpu)
26187 +         of the duration of time periods with preemption disabled. The
26188 +         histograms are disabled by default. To enable them, write a non-zero
26189 +         number to
26190 +
26191 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26192 +
26193 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
26194 +         per cpu) are generated that accumulate the duration of time periods
26195 +         when both interrupts and preemption are disabled. The histogram data
26196 +         will be located in the debug file system at
26197 +
26198 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
26199 +
26200  config SCHED_TRACER
26201         bool "Scheduling Latency Tracer"
26202         select GENERIC_TRACER
26203 @@ -221,6 +257,74 @@ config SCHED_TRACER
26204           This tracer tracks the latency of the highest priority task
26205           to be scheduled in, starting from the point it has woken up.
26206  
26207 +config WAKEUP_LATENCY_HIST
26208 +       bool "Scheduling Latency Histogram"
26209 +       depends on SCHED_TRACER
26210 +       help
26211 +         This option generates continuously updated histograms (one per cpu)
26212 +         of the scheduling latency of the highest priority task.
26213 +         The histograms are disabled by default. To enable them, write a
26214 +         non-zero number to
26215 +
26216 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
26217 +
26218 +         Two different algorithms are used, one to determine the latency of
26219 +         processes that exclusively use the highest priority of the system and
26220 +         another one to determine the latency of processes that share the
26221 +         highest system priority with other processes. The former is used to
26222 +         improve hardware and system software, the latter to optimize the
26223 +         priority design of a given system. The histogram data will be
26224 +         located in the debug file system at
26225 +
26226 +             /sys/kernel/debug/tracing/latency_hist/wakeup
26227 +
26228 +         and
26229 +
26230 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
26231 +
26232 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26233 +         Histogram are selected, additional histogram data will be collected
26234 +         that contains, in addition to the wakeup latency, the timer latency,
26235 +         when the wakeup was triggered by an expired timer. These histograms
26236 +         are available in the
26237 +
26238 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26239 +
26240 +         directory. They reflect the apparent interrupt and scheduling latency
26241 +         and are best suited to determining the worst-case latency of a given
26242 +         system. To enable these histograms, write a non-zero number to
26243 +
26244 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26245 +
26246 +config MISSED_TIMER_OFFSETS_HIST
26247 +       depends on HIGH_RES_TIMERS
26248 +       select GENERIC_TRACER
26249 +       bool "Missed Timer Offsets Histogram"
26250 +       help
26251 +         Generate a histogram of missed timer offsets in microseconds. The
26252 +         histograms are disabled by default. To enable them, write a non-zero
26253 +         number to
26254 +
26255 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
26256 +
26257 +         The histogram data will be located in the debug file system at
26258 +
26259 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
26260 +
26261 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26262 +         Histogram are selected, additional histogram data will be collected
26263 +         that contains, in addition to the wakeup latency, the timer latency,
26264 +         when the wakeup was triggered by an expired timer. These histograms
26265 +         are available in the
26266 +
26267 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26268 +
26269 +         directory. They reflect the apparent interrupt and scheduling latency
26270 +         and are best suited to determining the worst-case latency of a given
26271 +         system. To enable these histograms, write a non-zero number to
26272 +
26273 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26274 +
26275  config ENABLE_DEFAULT_TRACERS
26276         bool "Trace process context switches and events"
26277         depends on !GENERIC_TRACER
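
All four help texts above share the same workflow: the histograms are compiled in but disabled, get switched on by writing a non-zero number to the matching file under latency_hist/enable/, and are then read back from the per-CPU files. A tiny userspace sketch of that round trip; it assumes debugfs is mounted at /sys/kernel/debug and that the per-CPU file is named CPU0, which the help texts do not spell out:

    /* Enable the wakeup latency histogram and dump one per-CPU file. */
    #include <stdio.h>
    #include <stdlib.h>

    #define ENABLE "/sys/kernel/debug/tracing/latency_hist/enable/wakeup"
    #define HIST   "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0" /* assumed name */

    int main(void)
    {
            FILE *f = fopen(ENABLE, "w");
            char line[256];

            if (!f) { perror(ENABLE); return EXIT_FAILURE; }
            fputs("1\n", f);                /* any non-zero number enables it */
            fclose(f);

            f = fopen(HIST, "r");
            if (!f) { perror(HIST); return EXIT_FAILURE; }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);    /* "#usecs  samples" style output */
            fclose(f);
            return EXIT_SUCCESS;
    }
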
26278 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
26279 index 05ea5167e6bb..bc08c67301ae 100644
26280 --- a/kernel/trace/Makefile
26281 +++ b/kernel/trace/Makefile
26282 @@ -40,6 +40,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
26283  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
26284  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
26285  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
26286 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
26287 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
26288 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
26289 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
26290  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
26291  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
26292  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26293 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
26294 new file mode 100644
26295 index 000000000000..7f6ee70dea41
26296 --- /dev/null
26297 +++ b/kernel/trace/latency_hist.c
26298 @@ -0,0 +1,1178 @@
26299 +/*
26300 + * kernel/trace/latency_hist.c
26301 + *
26302 + * Add support for histograms of preemption-off latency,
26303 + * interrupt-off latency and wakeup latency; it depends on
26304 + * Real-Time Preemption Support.
26305 + *
26306 + *  Copyright (C) 2005 MontaVista Software, Inc.
26307 + *  Yi Yang <yyang@ch.mvista.com>
26308 + *
26309 + *  Converted to work with the new latency tracer.
26310 + *  Copyright (C) 2008 Red Hat, Inc.
26311 + *    Steven Rostedt <srostedt@redhat.com>
26312 + *
26313 + */
26314 +#include <linux/module.h>
26315 +#include <linux/debugfs.h>
26316 +#include <linux/seq_file.h>
26317 +#include <linux/percpu.h>
26318 +#include <linux/kallsyms.h>
26319 +#include <linux/uaccess.h>
26320 +#include <linux/sched.h>
26321 +#include <linux/sched/rt.h>
26322 +#include <linux/slab.h>
26323 +#include <linux/atomic.h>
26324 +#include <asm/div64.h>
26325 +
26326 +#include "trace.h"
26327 +#include <trace/events/sched.h>
26328 +
26329 +#define NSECS_PER_USECS 1000L
26330 +
26331 +#define CREATE_TRACE_POINTS
26332 +#include <trace/events/hist.h>
26333 +
26334 +enum {
26335 +       IRQSOFF_LATENCY = 0,
26336 +       PREEMPTOFF_LATENCY,
26337 +       PREEMPTIRQSOFF_LATENCY,
26338 +       WAKEUP_LATENCY,
26339 +       WAKEUP_LATENCY_SHAREDPRIO,
26340 +       MISSED_TIMER_OFFSETS,
26341 +       TIMERANDWAKEUP_LATENCY,
26342 +       MAX_LATENCY_TYPE,
26343 +};
26344 +
26345 +#define MAX_ENTRY_NUM 10240
26346 +
26347 +struct hist_data {
26348 +       atomic_t hist_mode; /* 0 don't log, 1 log */
26349 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
26350 +       long min_lat;
26351 +       long max_lat;
26352 +       unsigned long long below_hist_bound_samples;
26353 +       unsigned long long above_hist_bound_samples;
26354 +       long long accumulate_lat;
26355 +       unsigned long long total_samples;
26356 +       unsigned long long hist_array[MAX_ENTRY_NUM];
26357 +};
26358 +
26359 +struct enable_data {
26360 +       int latency_type;
26361 +       int enabled;
26362 +};
26363 +
26364 +static char *latency_hist_dir_root = "latency_hist";
26365 +
26366 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26367 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
26368 +static char *irqsoff_hist_dir = "irqsoff";
26369 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
26370 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
26371 +#endif
26372 +
26373 +#ifdef CONFIG_PREEMPT_OFF_HIST
26374 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
26375 +static char *preemptoff_hist_dir = "preemptoff";
26376 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
26377 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
26378 +#endif
26379 +
26380 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26381 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
26382 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
26383 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
26384 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
26385 +#endif
26386 +
26387 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
26388 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
26389 +static struct enable_data preemptirqsoff_enabled_data = {
26390 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
26391 +       .enabled = 0,
26392 +};
26393 +#endif
26394 +
26395 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26396 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26397 +struct maxlatproc_data {
26398 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
26399 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
26400 +       int pid;
26401 +       int current_pid;
26402 +       int prio;
26403 +       int current_prio;
26404 +       long latency;
26405 +       long timeroffset;
26406 +       cycle_t timestamp;
26407 +};
26408 +#endif
26409 +
26410 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26411 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
26412 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
26413 +static char *wakeup_latency_hist_dir = "wakeup";
26414 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
26415 +static notrace void probe_wakeup_latency_hist_start(void *v,
26416 +       struct task_struct *p);
26417 +static notrace void probe_wakeup_latency_hist_stop(void *v,
26418 +       bool preempt, struct task_struct *prev, struct task_struct *next);
26419 +static notrace void probe_sched_migrate_task(void *,
26420 +       struct task_struct *task, int cpu);
26421 +static struct enable_data wakeup_latency_enabled_data = {
26422 +       .latency_type = WAKEUP_LATENCY,
26423 +       .enabled = 0,
26424 +};
26425 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
26426 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
26427 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
26428 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
26429 +static unsigned long wakeup_pid;
26430 +#endif
26431 +
26432 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26433 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
26434 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
26435 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
26436 +       long long offset, struct task_struct *curr, struct task_struct *task);
26437 +static struct enable_data missed_timer_offsets_enabled_data = {
26438 +       .latency_type = MISSED_TIMER_OFFSETS,
26439 +       .enabled = 0,
26440 +};
26441 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
26442 +static unsigned long missed_timer_offsets_pid;
26443 +#endif
26444 +
26445 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26446 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26447 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
26448 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
26449 +static struct enable_data timerandwakeup_enabled_data = {
26450 +       .latency_type = TIMERANDWAKEUP_LATENCY,
26451 +       .enabled = 0,
26452 +};
26453 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
26454 +#endif
26455 +
26456 +void notrace latency_hist(int latency_type, int cpu, long latency,
26457 +                         long timeroffset, cycle_t stop,
26458 +                         struct task_struct *p)
26459 +{
26460 +       struct hist_data *my_hist;
26461 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26462 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26463 +       struct maxlatproc_data *mp = NULL;
26464 +#endif
26465 +
26466 +       if (!cpu_possible(cpu) || latency_type < 0 ||
26467 +           latency_type >= MAX_LATENCY_TYPE)
26468 +               return;
26469 +
26470 +       switch (latency_type) {
26471 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26472 +       case IRQSOFF_LATENCY:
26473 +               my_hist = &per_cpu(irqsoff_hist, cpu);
26474 +               break;
26475 +#endif
26476 +#ifdef CONFIG_PREEMPT_OFF_HIST
26477 +       case PREEMPTOFF_LATENCY:
26478 +               my_hist = &per_cpu(preemptoff_hist, cpu);
26479 +               break;
26480 +#endif
26481 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26482 +       case PREEMPTIRQSOFF_LATENCY:
26483 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
26484 +               break;
26485 +#endif
26486 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26487 +       case WAKEUP_LATENCY:
26488 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
26489 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
26490 +               break;
26491 +       case WAKEUP_LATENCY_SHAREDPRIO:
26492 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26493 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26494 +               break;
26495 +#endif
26496 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26497 +       case MISSED_TIMER_OFFSETS:
26498 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
26499 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26500 +               break;
26501 +#endif
26502 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26503 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26504 +       case TIMERANDWAKEUP_LATENCY:
26505 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26506 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26507 +               break;
26508 +#endif
26509 +
26510 +       default:
26511 +               return;
26512 +       }
26513 +
26514 +       latency += my_hist->offset;
26515 +
26516 +       if (atomic_read(&my_hist->hist_mode) == 0)
26517 +               return;
26518 +
26519 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
26520 +               if (latency < 0)
26521 +                       my_hist->below_hist_bound_samples++;
26522 +               else
26523 +                       my_hist->above_hist_bound_samples++;
26524 +       } else
26525 +               my_hist->hist_array[latency]++;
26526 +
26527 +       if (unlikely(latency > my_hist->max_lat ||
26528 +           my_hist->min_lat == LONG_MAX)) {
26529 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26530 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26531 +               if (latency_type == WAKEUP_LATENCY ||
26532 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26533 +                   latency_type == MISSED_TIMER_OFFSETS ||
26534 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
26535 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
26536 +                       strncpy(mp->current_comm, current->comm,
26537 +                           sizeof(mp->current_comm));
26538 +                       mp->pid = task_pid_nr(p);
26539 +                       mp->current_pid = task_pid_nr(current);
26540 +                       mp->prio = p->prio;
26541 +                       mp->current_prio = current->prio;
26542 +                       mp->latency = latency;
26543 +                       mp->timeroffset = timeroffset;
26544 +                       mp->timestamp = stop;
26545 +               }
26546 +#endif
26547 +               my_hist->max_lat = latency;
26548 +       }
26549 +       if (unlikely(latency < my_hist->min_lat))
26550 +               my_hist->min_lat = latency;
26551 +       my_hist->total_samples++;
26552 +       my_hist->accumulate_lat += latency;
26553 +}
26554 +
26555 +static void *l_start(struct seq_file *m, loff_t *pos)
26556 +{
26557 +       loff_t *index_ptr = NULL;
26558 +       loff_t index = *pos;
26559 +       struct hist_data *my_hist = m->private;
26560 +
26561 +       if (index == 0) {
26562 +               char minstr[32], avgstr[32], maxstr[32];
26563 +
26564 +               atomic_dec(&my_hist->hist_mode);
26565 +
26566 +               if (likely(my_hist->total_samples)) {
26567 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
26568 +                           my_hist->total_samples);
26569 +                       snprintf(minstr, sizeof(minstr), "%ld",
26570 +                           my_hist->min_lat - my_hist->offset);
26571 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
26572 +                           avg - my_hist->offset);
26573 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
26574 +                           my_hist->max_lat - my_hist->offset);
26575 +               } else {
26576 +                       strcpy(minstr, "<undef>");
26577 +                       strcpy(avgstr, minstr);
26578 +                       strcpy(maxstr, minstr);
26579 +               }
26580 +
26581 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
26582 +                          "#Average latency: %s microseconds\n"
26583 +                          "#Maximum latency: %s microseconds\n"
26584 +                          "#Total samples: %llu\n"
26585 +                          "#There are %llu samples lower than %ld"
26586 +                          " microseconds.\n"
26587 +                          "#There are %llu samples greater than or"
26588 +                          " equal to %ld microseconds.\n"
26589 +                          "#usecs\t%16s\n",
26590 +                          minstr, avgstr, maxstr,
26591 +                          my_hist->total_samples,
26592 +                          my_hist->below_hist_bound_samples,
26593 +                          -my_hist->offset,
26594 +                          my_hist->above_hist_bound_samples,
26595 +                          MAX_ENTRY_NUM - my_hist->offset,
26596 +                          "samples");
26597 +       }
26598 +       if (index < MAX_ENTRY_NUM) {
26599 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
26600 +               if (index_ptr)
26601 +                       *index_ptr = index;
26602 +       }
26603 +
26604 +       return index_ptr;
26605 +}
26606 +
26607 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
26608 +{
26609 +       loff_t *index_ptr = p;
26610 +       struct hist_data *my_hist = m->private;
26611 +
26612 +       if (++*pos >= MAX_ENTRY_NUM) {
26613 +               atomic_inc(&my_hist->hist_mode);
26614 +               return NULL;
26615 +       }
26616 +       *index_ptr = *pos;
26617 +       return index_ptr;
26618 +}
26619 +
26620 +static void l_stop(struct seq_file *m, void *p)
26621 +{
26622 +       kfree(p);
26623 +}
26624 +
26625 +static int l_show(struct seq_file *m, void *p)
26626 +{
26627 +       int index = *(loff_t *) p;
26628 +       struct hist_data *my_hist = m->private;
26629 +
26630 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
26631 +           my_hist->hist_array[index]);
26632 +       return 0;
26633 +}
26634 +
26635 +static const struct seq_operations latency_hist_seq_op = {
26636 +       .start = l_start,
26637 +       .next  = l_next,
26638 +       .stop  = l_stop,
26639 +       .show  = l_show
26640 +};
26641 +
26642 +static int latency_hist_open(struct inode *inode, struct file *file)
26643 +{
26644 +       int ret;
26645 +
26646 +       ret = seq_open(file, &latency_hist_seq_op);
26647 +       if (!ret) {
26648 +               struct seq_file *seq = file->private_data;
26649 +               seq->private = inode->i_private;
26650 +       }
26651 +       return ret;
26652 +}
26653 +
26654 +static const struct file_operations latency_hist_fops = {
26655 +       .open = latency_hist_open,
26656 +       .read = seq_read,
26657 +       .llseek = seq_lseek,
26658 +       .release = seq_release,
26659 +};
26660 +
26661 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26662 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26663 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
26664 +{
26665 +       mp->comm[0] = mp->current_comm[0] = '\0';
26666 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
26667 +           mp->latency = mp->timeroffset = -1;
26668 +       mp->timestamp = 0;
26669 +}
26670 +#endif
26671 +
26672 +static void hist_reset(struct hist_data *hist)
26673 +{
26674 +       atomic_dec(&hist->hist_mode);
26675 +
26676 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
26677 +       hist->below_hist_bound_samples = 0ULL;
26678 +       hist->above_hist_bound_samples = 0ULL;
26679 +       hist->min_lat = LONG_MAX;
26680 +       hist->max_lat = LONG_MIN;
26681 +       hist->total_samples = 0ULL;
26682 +       hist->accumulate_lat = 0LL;
26683 +
26684 +       atomic_inc(&hist->hist_mode);
26685 +}
26686 +
26687 +static ssize_t
26688 +latency_hist_reset(struct file *file, const char __user *a,
26689 +                  size_t size, loff_t *off)
26690 +{
26691 +       int cpu;
26692 +       struct hist_data *hist = NULL;
26693 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26694 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26695 +       struct maxlatproc_data *mp = NULL;
26696 +#endif
26697 +       off_t latency_type = (off_t) file->private_data;
26698 +
26699 +       for_each_online_cpu(cpu) {
26700 +
26701 +               switch (latency_type) {
26702 +#ifdef CONFIG_PREEMPT_OFF_HIST
26703 +               case PREEMPTOFF_LATENCY:
26704 +                       hist = &per_cpu(preemptoff_hist, cpu);
26705 +                       break;
26706 +#endif
26707 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26708 +               case IRQSOFF_LATENCY:
26709 +                       hist = &per_cpu(irqsoff_hist, cpu);
26710 +                       break;
26711 +#endif
26712 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
26713 +               case PREEMPTIRQSOFF_LATENCY:
26714 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
26715 +                       break;
26716 +#endif
26717 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26718 +               case WAKEUP_LATENCY:
26719 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
26720 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
26721 +                       break;
26722 +               case WAKEUP_LATENCY_SHAREDPRIO:
26723 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26724 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26725 +                       break;
26726 +#endif
26727 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26728 +               case MISSED_TIMER_OFFSETS:
26729 +                       hist = &per_cpu(missed_timer_offsets, cpu);
26730 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26731 +                       break;
26732 +#endif
26733 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26734 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26735 +               case TIMERANDWAKEUP_LATENCY:
26736 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26737 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26738 +                       break;
26739 +#endif
26740 +               }
26741 +
26742 +               hist_reset(hist);
26743 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26744 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26745 +               if (latency_type == WAKEUP_LATENCY ||
26746 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26747 +                   latency_type == MISSED_TIMER_OFFSETS ||
26748 +                   latency_type == TIMERANDWAKEUP_LATENCY)
26749 +                       clear_maxlatprocdata(mp);
26750 +#endif
26751 +       }
26752 +
26753 +       return size;
26754 +}
26755 +
26756 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26757 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26758 +static ssize_t
26759 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26760 +{
26761 +       char buf[64];
26762 +       int r;
26763 +       unsigned long *this_pid = file->private_data;
26764 +
26765 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
26766 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26767 +}
26768 +
26769 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
26770 +                     size_t cnt, loff_t *ppos)
26771 +{
26772 +       char buf[64];
26773 +       unsigned long pid;
26774 +       unsigned long *this_pid = file->private_data;
26775 +
26776 +       if (cnt >= sizeof(buf))
26777 +               return -EINVAL;
26778 +
26779 +       if (copy_from_user(&buf, ubuf, cnt))
26780 +               return -EFAULT;
26781 +
26782 +       buf[cnt] = '\0';
26783 +
26784 +       if (kstrtoul(buf, 10, &pid))
26785 +               return -EINVAL;
26786 +
26787 +       *this_pid = pid;
26788 +
26789 +       return cnt;
26790 +}
26791 +#endif
26792 +
26793 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26794 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26795 +static ssize_t
26796 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26797 +{
26798 +       int r;
26799 +       struct maxlatproc_data *mp = file->private_data;
26800 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
26801 +       unsigned long long t;
26802 +       unsigned long usecs, secs;
26803 +       char *buf;
26804 +
26805 +       if (mp->pid == -1 || mp->current_pid == -1) {
26806 +               buf = "(none)\n";
26807 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
26808 +                   strlen(buf));
26809 +       }
26810 +
26811 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
26812 +       if (buf == NULL)
26813 +               return -ENOMEM;
26814 +
26815 +       t = ns2usecs(mp->timestamp);
26816 +       usecs = do_div(t, USEC_PER_SEC);
26817 +       secs = (unsigned long) t;
26818 +       r = snprintf(buf, strmaxlen,
26819 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
26820 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
26821 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
26822 +           secs, usecs);
26823 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26824 +       kfree(buf);
26825 +       return r;
26826 +}
26827 +#endif
26828 +
26829 +static ssize_t
26830 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26831 +{
26832 +       char buf[64];
26833 +       struct enable_data *ed = file->private_data;
26834 +       int r;
26835 +
26836 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
26837 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26838 +}
26839 +
26840 +static ssize_t
26841 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
26842 +{
26843 +       char buf[64];
26844 +       long enable;
26845 +       struct enable_data *ed = file->private_data;
26846 +
26847 +       if (cnt >= sizeof(buf))
26848 +               return -EINVAL;
26849 +
26850 +       if (copy_from_user(&buf, ubuf, cnt))
26851 +               return -EFAULT;
26852 +
26853 +       buf[cnt] = 0;
26854 +
26855 +       if (kstrtoul(buf, 10, &enable))
26856 +               return -EINVAL;
26857 +
26858 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
26859 +               return cnt;
26860 +
26861 +       if (enable) {
26862 +               int ret;
26863 +
26864 +               switch (ed->latency_type) {
26865 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
26866 +               case PREEMPTIRQSOFF_LATENCY:
26867 +                       ret = register_trace_preemptirqsoff_hist(
26868 +                           probe_preemptirqsoff_hist, NULL);
26869 +                       if (ret) {
26870 +                               pr_info("preemptirqsoff trace: Couldn't assign "
26871 +                                   "probe_preemptirqsoff_hist "
26872 +                                   "to trace_preemptirqsoff_hist\n");
26873 +                               return ret;
26874 +                       }
26875 +                       break;
26876 +#endif
26877 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26878 +               case WAKEUP_LATENCY:
26879 +                       ret = register_trace_sched_wakeup(
26880 +                           probe_wakeup_latency_hist_start, NULL);
26881 +                       if (ret) {
26882 +                               pr_info("wakeup trace: Couldn't assign "
26883 +                                   "probe_wakeup_latency_hist_start "
26884 +                                   "to trace_sched_wakeup\n");
26885 +                               return ret;
26886 +                       }
26887 +                       ret = register_trace_sched_wakeup_new(
26888 +                           probe_wakeup_latency_hist_start, NULL);
26889 +                       if (ret) {
26890 +                               pr_info("wakeup trace: Couldn't assign "
26891 +                                   "probe_wakeup_latency_hist_start "
26892 +                                   "to trace_sched_wakeup_new\n");
26893 +                               unregister_trace_sched_wakeup(
26894 +                                   probe_wakeup_latency_hist_start, NULL);
26895 +                               return ret;
26896 +                       }
26897 +                       ret = register_trace_sched_switch(
26898 +                           probe_wakeup_latency_hist_stop, NULL);
26899 +                       if (ret) {
26900 +                               pr_info("wakeup trace: Couldn't assign "
26901 +                                   "probe_wakeup_latency_hist_stop "
26902 +                                   "to trace_sched_switch\n");
26903 +                               unregister_trace_sched_wakeup(
26904 +                                   probe_wakeup_latency_hist_start, NULL);
26905 +                               unregister_trace_sched_wakeup_new(
26906 +                                   probe_wakeup_latency_hist_start, NULL);
26907 +                               return ret;
26908 +                       }
26909 +                       ret = register_trace_sched_migrate_task(
26910 +                           probe_sched_migrate_task, NULL);
26911 +                       if (ret) {
26912 +                               pr_info("wakeup trace: Couldn't assign "
26913 +                                   "probe_sched_migrate_task "
26914 +                                   "to trace_sched_migrate_task\n");
26915 +                               unregister_trace_sched_wakeup(
26916 +                                   probe_wakeup_latency_hist_start, NULL);
26917 +                               unregister_trace_sched_wakeup_new(
26918 +                                   probe_wakeup_latency_hist_start, NULL);
26919 +                               unregister_trace_sched_switch(
26920 +                                   probe_wakeup_latency_hist_stop, NULL);
26921 +                               return ret;
26922 +                       }
26923 +                       break;
26924 +#endif
26925 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26926 +               case MISSED_TIMER_OFFSETS:
26927 +                       ret = register_trace_hrtimer_interrupt(
26928 +                           probe_hrtimer_interrupt, NULL);
26929 +                       if (ret) {
26930 +                               pr_info("missed_timer_offsets trace: Couldn't assign "
26931 +                                   "probe_hrtimer_interrupt "
26932 +                                   "to trace_hrtimer_interrupt\n");
26933 +                               return ret;
26934 +                       }
26935 +                       break;
26936 +#endif
26937 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26938 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26939 +               case TIMERANDWAKEUP_LATENCY:
26940 +                       if (!wakeup_latency_enabled_data.enabled ||
26941 +                           !missed_timer_offsets_enabled_data.enabled)
26942 +                               return -EINVAL;
26943 +                       break;
26944 +#endif
26945 +               default:
26946 +                       break;
26947 +               }
26948 +       } else {
26949 +               switch (ed->latency_type) {
26950 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
26951 +               case PREEMPTIRQSOFF_LATENCY:
26952 +                       {
26953 +                               int cpu;
26954 +
26955 +                               unregister_trace_preemptirqsoff_hist(
26956 +                                   probe_preemptirqsoff_hist, NULL);
26957 +                               for_each_online_cpu(cpu) {
26958 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26959 +                                       per_cpu(hist_irqsoff_counting,
26960 +                                           cpu) = 0;
26961 +#endif
26962 +#ifdef CONFIG_PREEMPT_OFF_HIST
26963 +                                       per_cpu(hist_preemptoff_counting,
26964 +                                           cpu) = 0;
26965 +#endif
26966 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
26967 +                                       per_cpu(hist_preemptirqsoff_counting,
26968 +                                           cpu) = 0;
26969 +#endif
26970 +                               }
26971 +                       }
26972 +                       break;
26973 +#endif
26974 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26975 +               case WAKEUP_LATENCY:
26976 +                       {
26977 +                               int cpu;
26978 +
26979 +                               unregister_trace_sched_wakeup(
26980 +                                   probe_wakeup_latency_hist_start, NULL);
26981 +                               unregister_trace_sched_wakeup_new(
26982 +                                   probe_wakeup_latency_hist_start, NULL);
26983 +                               unregister_trace_sched_switch(
26984 +                                   probe_wakeup_latency_hist_stop, NULL);
26985 +                               unregister_trace_sched_migrate_task(
26986 +                                   probe_sched_migrate_task, NULL);
26987 +
26988 +                               for_each_online_cpu(cpu) {
26989 +                                       per_cpu(wakeup_task, cpu) = NULL;
26990 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
26991 +                               }
26992 +                       }
26993 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26994 +                       timerandwakeup_enabled_data.enabled = 0;
26995 +#endif
26996 +                       break;
26997 +#endif
26998 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26999 +               case MISSED_TIMER_OFFSETS:
27000 +                       unregister_trace_hrtimer_interrupt(
27001 +                           probe_hrtimer_interrupt, NULL);
27002 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27003 +                       timerandwakeup_enabled_data.enabled = 0;
27004 +#endif
27005 +                       break;
27006 +#endif
27007 +               default:
27008 +                       break;
27009 +               }
27010 +       }
27011 +       ed->enabled = enable;
27012 +       return cnt;
27013 +}
27014 +
27015 +static const struct file_operations latency_hist_reset_fops = {
27016 +       .open = tracing_open_generic,
27017 +       .write = latency_hist_reset,
27018 +};
27019 +
27020 +static const struct file_operations enable_fops = {
27021 +       .open = tracing_open_generic,
27022 +       .read = show_enable,
27023 +       .write = do_enable,
27024 +};
27025 +
27026 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27027 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27028 +static const struct file_operations pid_fops = {
27029 +       .open = tracing_open_generic,
27030 +       .read = show_pid,
27031 +       .write = do_pid,
27032 +};
27033 +
27034 +static const struct file_operations maxlatproc_fops = {
27035 +       .open = tracing_open_generic,
27036 +       .read = show_maxlatproc,
27037 +};
27038 +#endif
27039 +
27040 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27041 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
27042 +       int starthist)
27043 +{
27044 +       int cpu = raw_smp_processor_id();
27045 +       int time_set = 0;
27046 +
27047 +       if (starthist) {
27048 +               cycle_t uninitialized_var(start);
27049 +
27050 +               if (!preempt_count() && !irqs_disabled())
27051 +                       return;
27052 +
27053 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27054 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
27055 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
27056 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
27057 +                       start = ftrace_now(cpu);
27058 +                       time_set++;
27059 +                       per_cpu(hist_irqsoff_start, cpu) = start;
27060 +               }
27061 +#endif
27062 +
27063 +#ifdef CONFIG_PREEMPT_OFF_HIST
27064 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
27065 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
27066 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
27067 +                       if (!(time_set++))
27068 +                               start = ftrace_now(cpu);
27069 +                       per_cpu(hist_preemptoff_start, cpu) = start;
27070 +               }
27071 +#endif
27072 +
27073 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27074 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
27075 +                   per_cpu(hist_preemptoff_counting, cpu) &&
27076 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
27077 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
27078 +                       if (!time_set)
27079 +                               start = ftrace_now(cpu);
27080 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
27081 +               }
27082 +#endif
27083 +       } else {
27084 +               cycle_t uninitialized_var(stop);
27085 +
27086 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27087 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
27088 +                   per_cpu(hist_irqsoff_counting, cpu)) {
27089 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
27090 +
27091 +                       stop = ftrace_now(cpu);
27092 +                       time_set++;
27093 +                       if (start) {
27094 +                               long latency = ((long) (stop - start)) /
27095 +                                   NSECS_PER_USECS;
27096 +
27097 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
27098 +                                   stop, NULL);
27099 +                       }
27100 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
27101 +               }
27102 +#endif
27103 +
27104 +#ifdef CONFIG_PREEMPT_OFF_HIST
27105 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
27106 +                   per_cpu(hist_preemptoff_counting, cpu)) {
27107 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
27108 +
27109 +                       if (!(time_set++))
27110 +                               stop = ftrace_now(cpu);
27111 +                       if (start) {
27112 +                               long latency = ((long) (stop - start)) /
27113 +                                   NSECS_PER_USECS;
27114 +
27115 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
27116 +                                   0, stop, NULL);
27117 +                       }
27118 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
27119 +               }
27120 +#endif
27121 +
27122 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27123 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
27124 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
27125 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
27126 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
27127 +
27128 +                       if (!time_set)
27129 +                               stop = ftrace_now(cpu);
27130 +                       if (start) {
27131 +                               long latency = ((long) (stop - start)) /
27132 +                                   NSECS_PER_USECS;
27133 +
27134 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
27135 +                                   latency, 0, stop, NULL);
27136 +                       }
27137 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
27138 +               }
27139 +#endif
27140 +       }
27141 +}
27142 +#endif
27143 +
27144 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27145 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
27146 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
27147 +       int cpu)
27148 +{
27149 +       int old_cpu = task_cpu(task);
27150 +
27151 +       if (cpu != old_cpu) {
27152 +               unsigned long flags;
27153 +               struct task_struct *cpu_wakeup_task;
27154 +
27155 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
27156 +
27157 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
27158 +               if (task == cpu_wakeup_task) {
27159 +                       put_task_struct(cpu_wakeup_task);
27160 +                       per_cpu(wakeup_task, old_cpu) = NULL;
27161 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
27162 +                       get_task_struct(cpu_wakeup_task);
27163 +               }
27164 +
27165 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27166 +       }
27167 +}
27168 +
27169 +static notrace void probe_wakeup_latency_hist_start(void *v,
27170 +       struct task_struct *p)
27171 +{
27172 +       unsigned long flags;
27173 +       struct task_struct *curr = current;
27174 +       int cpu = task_cpu(p);
27175 +       struct task_struct *cpu_wakeup_task;
27176 +
27177 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27178 +
27179 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27180 +
27181 +       if (wakeup_pid) {
27182 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27183 +                   p->prio == curr->prio)
27184 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27185 +               if (likely(wakeup_pid != task_pid_nr(p)))
27186 +                       goto out;
27187 +       } else {
27188 +               if (likely(!rt_task(p)) ||
27189 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
27190 +                   p->prio > curr->prio)
27191 +                       goto out;
27192 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27193 +                   p->prio == curr->prio)
27194 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27195 +       }
27196 +
27197 +       if (cpu_wakeup_task)
27198 +               put_task_struct(cpu_wakeup_task);
27199 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
27200 +       get_task_struct(cpu_wakeup_task);
27201 +       cpu_wakeup_task->preempt_timestamp_hist =
27202 +               ftrace_now(raw_smp_processor_id());
27203 +out:
27204 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27205 +}
27206 +
27207 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27208 +       bool preempt, struct task_struct *prev, struct task_struct *next)
27209 +{
27210 +       unsigned long flags;
27211 +       int cpu = task_cpu(next);
27212 +       long latency;
27213 +       cycle_t stop;
27214 +       struct task_struct *cpu_wakeup_task;
27215 +
27216 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27217 +
27218 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27219 +
27220 +       if (cpu_wakeup_task == NULL)
27221 +               goto out;
27222 +
27223 +       /* Already running? */
27224 +       if (unlikely(current == cpu_wakeup_task))
27225 +               goto out_reset;
27226 +
27227 +       if (next != cpu_wakeup_task) {
27228 +               if (next->prio < cpu_wakeup_task->prio)
27229 +                       goto out_reset;
27230 +
27231 +               if (next->prio == cpu_wakeup_task->prio)
27232 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27233 +
27234 +               goto out;
27235 +       }
27236 +
27237 +       if (current->prio == cpu_wakeup_task->prio)
27238 +               per_cpu(wakeup_sharedprio, cpu) = 1;
27239 +
27240 +       /*
27241 +        * The task we are waiting for is about to be switched to.
27242 +        * Calculate latency and store it in histogram.
27243 +        */
27244 +       stop = ftrace_now(raw_smp_processor_id());
27245 +
27246 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
27247 +           NSECS_PER_USECS;
27248 +
27249 +       if (per_cpu(wakeup_sharedprio, cpu)) {
27250 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
27251 +                   next);
27252 +               per_cpu(wakeup_sharedprio, cpu) = 0;
27253 +       } else {
27254 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
27255 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27256 +               if (timerandwakeup_enabled_data.enabled) {
27257 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
27258 +                           next->timer_offset + latency, next->timer_offset,
27259 +                           stop, next);
27260 +               }
27261 +#endif
27262 +       }
27263 +
27264 +out_reset:
27265 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27266 +       next->timer_offset = 0;
27267 +#endif
27268 +       put_task_struct(cpu_wakeup_task);
27269 +       per_cpu(wakeup_task, cpu) = NULL;
27270 +out:
27271 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27272 +}
27273 +#endif
27274 +
27275 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27276 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
27277 +       long long latency_ns, struct task_struct *curr,
27278 +       struct task_struct *task)
27279 +{
27280 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
27281 +           (task->prio < curr->prio ||
27282 +           (task->prio == curr->prio &&
27283 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
27284 +               long latency;
27285 +               cycle_t now;
27286 +
27287 +               if (missed_timer_offsets_pid) {
27288 +                       if (likely(missed_timer_offsets_pid !=
27289 +                           task_pid_nr(task)))
27290 +                               return;
27291 +               }
27292 +
27293 +               now = ftrace_now(cpu);
27294 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
27295 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
27296 +                   task);
27297 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27298 +               task->timer_offset = latency;
27299 +#endif
27300 +       }
27301 +}
27302 +#endif
27303 +
27304 +static __init int latency_hist_init(void)
27305 +{
27306 +       struct dentry *latency_hist_root = NULL;
27307 +       struct dentry *dentry;
27308 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27309 +       struct dentry *dentry_sharedprio;
27310 +#endif
27311 +       struct dentry *entry;
27312 +       struct dentry *enable_root;
27313 +       int i = 0;
27314 +       struct hist_data *my_hist;
27315 +       char name[64];
27316 +       char *cpufmt = "CPU%d";
27317 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27318 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27319 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
27320 +       struct maxlatproc_data *mp = NULL;
27321 +#endif
27322 +
27323 +       dentry = tracing_init_dentry();
27324 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
27325 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
27326 +
27327 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27328 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
27329 +       for_each_possible_cpu(i) {
27330 +               sprintf(name, cpufmt, i);
27331 +               entry = debugfs_create_file(name, 0444, dentry,
27332 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
27333 +               my_hist = &per_cpu(irqsoff_hist, i);
27334 +               atomic_set(&my_hist->hist_mode, 1);
27335 +               my_hist->min_lat = LONG_MAX;
27336 +       }
27337 +       entry = debugfs_create_file("reset", 0644, dentry,
27338 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
27339 +#endif
27340 +
27341 +#ifdef CONFIG_PREEMPT_OFF_HIST
27342 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
27343 +           latency_hist_root);
27344 +       for_each_possible_cpu(i) {
27345 +               sprintf(name, cpufmt, i);
27346 +               entry = debugfs_create_file(name, 0444, dentry,
27347 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
27348 +               my_hist = &per_cpu(preemptoff_hist, i);
27349 +               atomic_set(&my_hist->hist_mode, 1);
27350 +               my_hist->min_lat = LONG_MAX;
27351 +       }
27352 +       entry = debugfs_create_file("reset", 0644, dentry,
27353 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
27354 +#endif
27355 +
27356 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27357 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
27358 +           latency_hist_root);
27359 +       for_each_possible_cpu(i) {
27360 +               sprintf(name, cpufmt, i);
27361 +               entry = debugfs_create_file(name, 0444, dentry,
27362 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
27363 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
27364 +               atomic_set(&my_hist->hist_mode, 1);
27365 +               my_hist->min_lat = LONG_MAX;
27366 +       }
27367 +       entry = debugfs_create_file("reset", 0644, dentry,
27368 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
27369 +#endif
27370 +
27371 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27372 +       entry = debugfs_create_file("preemptirqsoff", 0644,
27373 +           enable_root, (void *)&preemptirqsoff_enabled_data,
27374 +           &enable_fops);
27375 +#endif
27376 +
27377 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27378 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
27379 +           latency_hist_root);
27380 +       dentry_sharedprio = debugfs_create_dir(
27381 +           wakeup_latency_hist_dir_sharedprio, dentry);
27382 +       for_each_possible_cpu(i) {
27383 +               sprintf(name, cpufmt, i);
27384 +
27385 +               entry = debugfs_create_file(name, 0444, dentry,
27386 +                   &per_cpu(wakeup_latency_hist, i),
27387 +                   &latency_hist_fops);
27388 +               my_hist = &per_cpu(wakeup_latency_hist, i);
27389 +               atomic_set(&my_hist->hist_mode, 1);
27390 +               my_hist->min_lat = LONG_MAX;
27391 +
27392 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
27393 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
27394 +                   &latency_hist_fops);
27395 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
27396 +               atomic_set(&my_hist->hist_mode, 1);
27397 +               my_hist->min_lat = LONG_MAX;
27398 +
27399 +               sprintf(name, cpufmt_maxlatproc, i);
27400 +
27401 +               mp = &per_cpu(wakeup_maxlatproc, i);
27402 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27403 +                   &maxlatproc_fops);
27404 +               clear_maxlatprocdata(mp);
27405 +
27406 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
27407 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
27408 +                   &maxlatproc_fops);
27409 +               clear_maxlatprocdata(mp);
27410 +       }
27411 +       entry = debugfs_create_file("pid", 0644, dentry,
27412 +           (void *)&wakeup_pid, &pid_fops);
27413 +       entry = debugfs_create_file("reset", 0644, dentry,
27414 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
27415 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
27416 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
27417 +       entry = debugfs_create_file("wakeup", 0644,
27418 +           enable_root, (void *)&wakeup_latency_enabled_data,
27419 +           &enable_fops);
27420 +#endif
27421 +
27422 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27423 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
27424 +           latency_hist_root);
27425 +       for_each_possible_cpu(i) {
27426 +               sprintf(name, cpufmt, i);
27427 +               entry = debugfs_create_file(name, 0444, dentry,
27428 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
27429 +               my_hist = &per_cpu(missed_timer_offsets, i);
27430 +               atomic_set(&my_hist->hist_mode, 1);
27431 +               my_hist->min_lat = LONG_MAX;
27432 +
27433 +               sprintf(name, cpufmt_maxlatproc, i);
27434 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
27435 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27436 +                   &maxlatproc_fops);
27437 +               clear_maxlatprocdata(mp);
27438 +       }
27439 +       entry = debugfs_create_file("pid", 0644, dentry,
27440 +           (void *)&missed_timer_offsets_pid, &pid_fops);
27441 +       entry = debugfs_create_file("reset", 0644, dentry,
27442 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
27443 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
27444 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
27445 +           &enable_fops);
27446 +#endif
27447 +
27448 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27449 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27450 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
27451 +           latency_hist_root);
27452 +       for_each_possible_cpu(i) {
27453 +               sprintf(name, cpufmt, i);
27454 +               entry = debugfs_create_file(name, 0444, dentry,
27455 +                   &per_cpu(timerandwakeup_latency_hist, i),
27456 +                   &latency_hist_fops);
27457 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
27458 +               atomic_set(&my_hist->hist_mode, 1);
27459 +               my_hist->min_lat = LONG_MAX;
27460 +
27461 +               sprintf(name, cpufmt_maxlatproc, i);
27462 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
27463 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27464 +                   &maxlatproc_fops);
27465 +               clear_maxlatprocdata(mp);
27466 +       }
27467 +       entry = debugfs_create_file("reset", 0644, dentry,
27468 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
27469 +       entry = debugfs_create_file("timerandwakeup", 0644,
27470 +           enable_root, (void *)&timerandwakeup_enabled_data,
27471 +           &enable_fops);
27472 +#endif
27473 +       return 0;
27474 +}
27475 +
27476 +device_initcall(latency_hist_init);
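For orientation, the sketch below shows one way the debugfs interface created by latency_hist_init() above might be driven from userspace. It is illustrative only and not part of the patch: the paths assume debugfs is mounted at /sys/kernel/debug and that latency_hist_dir_root and wakeup_latency_hist_dir (string constants defined earlier in this file, outside this excerpt) expand to "latency_hist" and "wakeup". Writing 1 to enable/wakeup registers the tracepoint probes via do_enable(); each wakeup/CPU<n> file is rendered by the seq_file operations above, with '#'-prefixed summary lines from l_start() followed by "<usecs>\t<samples>" rows from l_show().

/*
 * Userspace sketch only, not part of the kernel patch.
 * Paths are assumptions, see the note above.
 */
#include <stdio.h>

int main(void)
{
	FILE *f;
	char line[256];

	/* Assumed path: enable the wakeup latency histogram. */
	f = fopen("/sys/kernel/debug/latency_hist/enable/wakeup", "w");
	if (!f) {
		perror("enable/wakeup");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* Assumed path: dump the histogram collected on CPU0. */
	f = fopen("/sys/kernel/debug/latency_hist/wakeup/CPU0", "r");
	if (!f) {
		perror("wakeup/CPU0");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		long usecs;
		unsigned long long samples;

		/* '#' lines carry the min/avg/max summary from l_start(). */
		if (line[0] == '#') {
			fputs(line, stdout);
			continue;
		}
		/* Remaining lines are "<usecs>\t<samples>" rows from l_show(). */
		if (sscanf(line, "%ld %llu", &usecs, &samples) == 2 && samples)
			printf("%ld us: %llu samples\n", usecs, samples);
	}
	fclose(f);
	return 0;
}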
27477 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27478 index 059233abcfcf..aebdbff7d425 100644
27479 --- a/kernel/trace/trace.c
27480 +++ b/kernel/trace/trace.c
27481 @@ -1652,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27482         struct task_struct *tsk = current;
27483  
27484         entry->preempt_count            = pc & 0xff;
27485 +       entry->preempt_lazy_count       = preempt_lazy_count();
27486         entry->pid                      = (tsk) ? tsk->pid : 0;
27487         entry->flags =
27488  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
27489 @@ -1661,8 +1662,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27490  #endif
27491                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27492                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
27493 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27494 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27495 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27496                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27497 +
27498 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
27499  }
27500  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
27501  
27502 @@ -2555,14 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
27503  
27504  static void print_lat_help_header(struct seq_file *m)
27505  {
27506 -       seq_puts(m, "#                  _------=> CPU#            \n"
27507 -                   "#                 / _-----=> irqs-off        \n"
27508 -                   "#                | / _----=> need-resched    \n"
27509 -                   "#                || / _---=> hardirq/softirq \n"
27510 -                   "#                ||| / _--=> preempt-depth   \n"
27511 -                   "#                |||| /     delay            \n"
27512 -                   "#  cmd     pid   ||||| time  |   caller      \n"
27513 -                   "#     \\   /      |||||  \\    |   /         \n");
27514 +       seq_puts(m, "#                   _--------=> CPU#              \n"
27515 +                   "#                  / _-------=> irqs-off          \n"
27516 +                   "#                 | / _------=> need-resched      \n"
27517 +                   "#                 || / _-----=> need-resched_lazy \n"
27518 +                   "#                 ||| / _----=> hardirq/softirq   \n"
27519 +                   "#                 |||| / _---=> preempt-depth     \n"
27520 +                   "#                 ||||| / _--=> preempt-lazy-depth\n"
27521 +                   "#                 |||||| / _-=> migrate-disable   \n"
27522 +                   "#                 ||||||| /     delay             \n"
27523 +                   "#  cmd     pid    |||||||| time  |   caller       \n"
27524 +                   "#     \\   /      ||||||||  \\   |   /            \n");
27525  }
27526  
27527  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
27528 @@ -2588,11 +2595,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
27529         print_event_info(buf, m);
27530         seq_puts(m, "#                              _-----=> irqs-off\n"
27531                     "#                             / _----=> need-resched\n"
27532 -                   "#                            | / _---=> hardirq/softirq\n"
27533 -                   "#                            || / _--=> preempt-depth\n"
27534 -                   "#                            ||| /     delay\n"
27535 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
27536 -                   "#              | |       |   ||||       |         |\n");
27537 +                   "#                            |/  _-----=> need-resched_lazy\n"
27538 +                   "#                            || / _---=> hardirq/softirq\n"
27539 +                   "#                            ||| / _--=> preempt-depth\n"
27540 +                   "#                            |||| /_--=> preempt-lazy-depth\n"
27541 +                   "#                            |||||  _-=> migrate-disable   \n"
27542 +                   "#                            ||||| /    delay\n"
27543 +                   "#           TASK-PID   CPU#  ||||||    TIMESTAMP  FUNCTION\n"
27544 +                   "#              | |       |   ||||||       |         |\n");
27545  }
27546  
27547  void
27548 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27549 index 919d9d07686f..3bf86ece683c 100644
27550 --- a/kernel/trace/trace.h
27551 +++ b/kernel/trace/trace.h
27552 @@ -117,6 +117,7 @@ struct kretprobe_trace_entry_head {
27553   *  NEED_RESCHED       - reschedule is requested
27554   *  HARDIRQ            - inside an interrupt handler
27555   *  SOFTIRQ            - inside a softirq handler
27556 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
27557   */
27558  enum trace_flag_type {
27559         TRACE_FLAG_IRQS_OFF             = 0x01,
27560 @@ -125,6 +126,7 @@ enum trace_flag_type {
27561         TRACE_FLAG_HARDIRQ              = 0x08,
27562         TRACE_FLAG_SOFTIRQ              = 0x10,
27563         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
27564 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x40,
27565  };
27566  
27567  #define TRACE_BUF_SIZE         1024
27568 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
27569 index 996f0fd34312..5bd79b347398 100644
27570 --- a/kernel/trace/trace_events.c
27571 +++ b/kernel/trace/trace_events.c
27572 @@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
27573         __common_field(unsigned char, flags);
27574         __common_field(unsigned char, preempt_count);
27575         __common_field(int, pid);
27576 +       __common_field(unsigned short, migrate_disable);
27577 +       __common_field(unsigned short, padding);
27578  
27579         return ret;
27580  }
27581 @@ -244,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
27582  
27583         local_save_flags(fbuffer->flags);
27584         fbuffer->pc = preempt_count();
27585 +       /*
27586 +        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
27587 +        * preemption (adding one to the preempt_count). Since we are
27588 +        * interested in the preempt_count at the time the tracepoint was
27589 +        * hit, we need to subtract one to offset the increment.
27590 +        */
27591 +       if (IS_ENABLED(CONFIG_PREEMPT))
27592 +               fbuffer->pc--;
27593         fbuffer->trace_file = trace_file;
27594  
27595         fbuffer->event =
27596 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
27597 index be3222b7d72e..553e71254ad6 100644
27598 --- a/kernel/trace/trace_irqsoff.c
27599 +++ b/kernel/trace/trace_irqsoff.c
27600 @@ -13,6 +13,7 @@
27601  #include <linux/uaccess.h>
27602  #include <linux/module.h>
27603  #include <linux/ftrace.h>
27604 +#include <trace/events/hist.h>
27605  
27606  #include "trace.h"
27607  
27608 @@ -424,11 +425,13 @@ void start_critical_timings(void)
27609  {
27610         if (preempt_trace() || irq_trace())
27611                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27612 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
27613  }
27614  EXPORT_SYMBOL_GPL(start_critical_timings);
27615  
27616  void stop_critical_timings(void)
27617  {
27618 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
27619         if (preempt_trace() || irq_trace())
27620                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27621  }
27622 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
27623  #ifdef CONFIG_PROVE_LOCKING
27624  void time_hardirqs_on(unsigned long a0, unsigned long a1)
27625  {
27626 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
27627         if (!preempt_trace() && irq_trace())
27628                 stop_critical_timing(a0, a1);
27629  }
27630 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
27631  {
27632         if (!preempt_trace() && irq_trace())
27633                 start_critical_timing(a0, a1);
27634 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
27635  }
27636  
27637  #else /* !CONFIG_PROVE_LOCKING */
27638 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
27639   */
27640  void trace_hardirqs_on(void)
27641  {
27642 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27643         if (!preempt_trace() && irq_trace())
27644                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27645  }
27646 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
27647  {
27648         if (!preempt_trace() && irq_trace())
27649                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27650 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27651  }
27652  EXPORT_SYMBOL(trace_hardirqs_off);
27653  
27654  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
27655  {
27656 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27657         if (!preempt_trace() && irq_trace())
27658                 stop_critical_timing(CALLER_ADDR0, caller_addr);
27659  }
27660 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
27661  {
27662         if (!preempt_trace() && irq_trace())
27663                 start_critical_timing(CALLER_ADDR0, caller_addr);
27664 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27665  }
27666  EXPORT_SYMBOL(trace_hardirqs_off_caller);
27667  
27668 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
27669  #ifdef CONFIG_PREEMPT_TRACER
27670  void trace_preempt_on(unsigned long a0, unsigned long a1)
27671  {
27672 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
27673         if (preempt_trace() && !irq_trace())
27674                 stop_critical_timing(a0, a1);
27675  }
27676  
27677  void trace_preempt_off(unsigned long a0, unsigned long a1)
27678  {
27679 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
27680         if (preempt_trace() && !irq_trace())
27681                 start_critical_timing(a0, a1);
27682  }
27683 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
27684 index 282982195e09..9f19d839a756 100644
27685 --- a/kernel/trace/trace_output.c
27686 +++ b/kernel/trace/trace_output.c
27687 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27688  {
27689         char hardsoft_irq;
27690         char need_resched;
27691 +       char need_resched_lazy;
27692         char irqs_off;
27693         int hardirq;
27694         int softirq;
27695 @@ -413,6 +414,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27696                 need_resched = '.';
27697                 break;
27698         }
27699 +       need_resched_lazy =
27700 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
27701  
27702         hardsoft_irq =
27703                 (hardirq && softirq) ? 'H' :
27704 @@ -420,14 +423,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27705                 softirq ? 's' :
27706                 '.';
27707  
27708 -       trace_seq_printf(s, "%c%c%c",
27709 -                        irqs_off, need_resched, hardsoft_irq);
27710 +       trace_seq_printf(s, "%c%c%c%c",
27711 +                        irqs_off, need_resched, need_resched_lazy,
27712 +                        hardsoft_irq);
27713  
27714         if (entry->preempt_count)
27715                 trace_seq_printf(s, "%x", entry->preempt_count);
27716         else
27717                 trace_seq_putc(s, '.');
27718  
27719 +       if (entry->preempt_lazy_count)
27720 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
27721 +       else
27722 +               trace_seq_putc(s, '.');
27723 +
27724 +       if (entry->migrate_disable)
27725 +               trace_seq_printf(s, "%x", entry->migrate_disable);
27726 +       else
27727 +               trace_seq_putc(s, '.');
27728 +
27729         return !trace_seq_has_overflowed(s);
27730  }
27731  
27732 diff --git a/kernel/user.c b/kernel/user.c
27733 index b069ccbfb0b0..1a2e88e98b5e 100644
27734 --- a/kernel/user.c
27735 +++ b/kernel/user.c
27736 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
27737         if (!up)
27738                 return;
27739  
27740 -       local_irq_save(flags);
27741 +       local_irq_save_nort(flags);
27742         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
27743                 free_user(up, flags);
27744         else
27745 -               local_irq_restore(flags);
27746 +               local_irq_restore_nort(flags);
27747  }
27748  
27749  struct user_struct *alloc_uid(kuid_t uid)
27750 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
27751 index 198137b1cadc..47d143740774 100644
27752 --- a/kernel/watchdog.c
27753 +++ b/kernel/watchdog.c
27754 @@ -299,6 +299,8 @@ static int is_softlockup(unsigned long touch_ts)
27755  
27756  #ifdef CONFIG_HARDLOCKUP_DETECTOR
27757  
27758 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
27759 +
27760  static struct perf_event_attr wd_hw_attr = {
27761         .type           = PERF_TYPE_HARDWARE,
27762         .config         = PERF_COUNT_HW_CPU_CYCLES,
27763 @@ -333,6 +335,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
27764                 /* only print hardlockups once */
27765                 if (__this_cpu_read(hard_watchdog_warn) == true)
27766                         return;
27767 +               /*
27768 +                * If early-printk is enabled then make sure we do not
27769 +                * lock up in printk() and kill console logging:
27770 +                */
27771 +               printk_kill();
27772 +
27773 +               raw_spin_lock(&watchdog_output_lock);
27774  
27775                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
27776                 print_modules();
27777 @@ -350,8 +359,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
27778                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
27779                         trigger_allbutself_cpu_backtrace();
27780  
27781 +               raw_spin_unlock(&watchdog_output_lock);
27782                 if (hardlockup_panic)
27783 -                       panic("Hard LOCKUP");
27784 +                       nmi_panic(regs, "Hard LOCKUP");
27785  
27786                 __this_cpu_write(hard_watchdog_warn, true);
27787                 return;
27788 @@ -497,6 +507,7 @@ static void watchdog_enable(unsigned int cpu)
27789         /* kick off the timer for the hardlockup detector */
27790         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
27791         hrtimer->function = watchdog_timer_fn;
27792 +       hrtimer->irqsafe = 1;
27793  
27794         /* Enable the perf event */
27795         watchdog_nmi_enable(cpu);
27796 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
27797 index 2c2f971f3e75..965d5f65e847 100644
27798 --- a/kernel/workqueue.c
27799 +++ b/kernel/workqueue.c
27800 @@ -48,6 +48,8 @@
27801  #include <linux/nodemask.h>
27802  #include <linux/moduleparam.h>
27803  #include <linux/uaccess.h>
27804 +#include <linux/locallock.h>
27805 +#include <linux/delay.h>
27806  
27807  #include "workqueue_internal.h"
27808  
27809 @@ -121,11 +123,16 @@ enum {
27810   *    cpu or grabbing pool->lock is enough for read access.  If
27811   *    POOL_DISASSOCIATED is set, it's identical to L.
27812   *
27813 + *    On RT we need the extra protection of rt_lock_idle_list() to
27814 + *    serialize the idle_list manipulations against the read access
27815 + *    from wq_worker_sleeping(). All other places are nicely
27816 + *    serialized via pool->lock.
27817 + *
27818   * A: pool->attach_mutex protected.
27819   *
27820   * PL: wq_pool_mutex protected.
27821   *
27822 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
27823 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
27824   *
27825   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
27826   *
27827 @@ -134,7 +141,7 @@ enum {
27828   *
27829   * WQ: wq->mutex protected.
27830   *
27831 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
27832 + * WR: wq->mutex protected for writes.  RCU protected for reads.
27833   *
27834   * MD: wq_mayday_lock protected.
27835   */
27836 @@ -183,7 +190,7 @@ struct worker_pool {
27837         atomic_t                nr_running ____cacheline_aligned_in_smp;
27838  
27839         /*
27840 -        * Destruction of pool is sched-RCU protected to allow dereferences
27841 +        * Destruction of pool is RCU protected to allow dereferences
27842          * from get_work_pool().
27843          */
27844         struct rcu_head         rcu;
27845 @@ -212,7 +219,7 @@ struct pool_workqueue {
27846         /*
27847          * Release of unbound pwq is punted to system_wq.  See put_pwq()
27848          * and pwq_unbound_release_workfn() for details.  pool_workqueue
27849 -        * itself is also sched-RCU protected so that the first pwq can be
27850 +        * itself is also RCU protected so that the first pwq can be
27851          * determined without grabbing wq->mutex.
27852          */
27853         struct work_struct      unbound_release_work;
27854 @@ -331,6 +338,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
27855  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
27856  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
27857  
27858 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
27859 +
27860  static int worker_thread(void *__worker);
27861  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27862  
27863 @@ -338,20 +347,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27864  #include <trace/events/workqueue.h>
27865  
27866  #define assert_rcu_or_pool_mutex()                                     \
27867 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27868 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27869                          !lockdep_is_held(&wq_pool_mutex),              \
27870 -                        "sched RCU or wq_pool_mutex should be held")
27871 +                        "RCU or wq_pool_mutex should be held")
27872  
27873  #define assert_rcu_or_wq_mutex(wq)                                     \
27874 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27875 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27876                          !lockdep_is_held(&wq->mutex),                  \
27877 -                        "sched RCU or wq->mutex should be held")
27878 +                        "RCU or wq->mutex should be held")
27879  
27880  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
27881 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27882 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27883                          !lockdep_is_held(&wq->mutex) &&                \
27884                          !lockdep_is_held(&wq_pool_mutex),              \
27885 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
27886 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
27887  
27888  #define for_each_cpu_worker_pool(pool, cpu)                            \
27889         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
27890 @@ -363,7 +372,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27891   * @pool: iteration cursor
27892   * @pi: integer used for iteration
27893   *
27894 - * This must be called either with wq_pool_mutex held or sched RCU read
27895 + * This must be called either with wq_pool_mutex held or RCU read
27896   * locked.  If the pool needs to be used beyond the locking in effect, the
27897   * caller is responsible for guaranteeing that the pool stays online.
27898   *
27899 @@ -395,7 +404,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27900   * @pwq: iteration cursor
27901   * @wq: the target workqueue
27902   *
27903 - * This must be called either with wq->mutex held or sched RCU read locked.
27904 + * This must be called either with wq->mutex held or RCU read locked.
27905   * If the pwq needs to be used beyond the locking in effect, the caller is
27906   * responsible for guaranteeing that the pwq stays online.
27907   *
27908 @@ -407,6 +416,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27909                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
27910                 else
27911  
27912 +#ifdef CONFIG_PREEMPT_RT_BASE
27913 +static inline void rt_lock_idle_list(struct worker_pool *pool)
27914 +{
27915 +       preempt_disable();
27916 +}
27917 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
27918 +{
27919 +       preempt_enable();
27920 +}
27921 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
27922 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
27923 +#else
27924 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
27925 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
27926 +static inline void sched_lock_idle_list(struct worker_pool *pool)
27927 +{
27928 +       spin_lock_irq(&pool->lock);
27929 +}
27930 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
27931 +{
27932 +       spin_unlock_irq(&pool->lock);
27933 +}
27934 +#endif
27935 +
27936 +
27937  #ifdef CONFIG_DEBUG_OBJECTS_WORK
27938  
27939  static struct debug_obj_descr work_debug_descr;
27940 @@ -557,7 +591,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
27941   * @wq: the target workqueue
27942   * @node: the node ID
27943   *
27944 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
27945 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
27946   * read locked.
27947   * If the pwq needs to be used beyond the locking in effect, the caller is
27948   * responsible for guaranteeing that the pwq stays online.
27949 @@ -701,8 +735,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
27950   * @work: the work item of interest
27951   *
27952   * Pools are created and destroyed under wq_pool_mutex, and allows read
27953 - * access under sched-RCU read lock.  As such, this function should be
27954 - * called under wq_pool_mutex or with preemption disabled.
27955 + * access under RCU read lock.  As such, this function should be
27956 + * called under wq_pool_mutex or inside an rcu_read_lock() region.
27957   *
27958   * All fields of the returned pool are accessible as long as the above
27959   * mentioned locking is in effect.  If the returned pool needs to be used
27960 @@ -839,51 +873,44 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
27961   */
27962  static void wake_up_worker(struct worker_pool *pool)
27963  {
27964 -       struct worker *worker = first_idle_worker(pool);
27965 +       struct worker *worker;
27966 +
27967 +       rt_lock_idle_list(pool);
27968 +
27969 +       worker = first_idle_worker(pool);
27970  
27971         if (likely(worker))
27972                 wake_up_process(worker->task);
27973 +
27974 +       rt_unlock_idle_list(pool);
27975  }
27976  
27977  /**
27978 - * wq_worker_waking_up - a worker is waking up
27979 - * @task: task waking up
27980 - * @cpu: CPU @task is waking up to
27981 + * wq_worker_running - a worker is running again
27982 + * @task: task returning from sleep
27983   *
27984 - * This function is called during try_to_wake_up() when a worker is
27985 - * being awoken.
27986 - *
27987 - * CONTEXT:
27988 - * spin_lock_irq(rq->lock)
27989 + * This function is called when a worker returns from schedule().
27990   */
27991 -void wq_worker_waking_up(struct task_struct *task, int cpu)
27992 +void wq_worker_running(struct task_struct *task)
27993  {
27994         struct worker *worker = kthread_data(task);
27995  
27996 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
27997 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
27998 +       if (!worker->sleeping)
27999 +               return;
28000 +       if (!(worker->flags & WORKER_NOT_RUNNING))
28001                 atomic_inc(&worker->pool->nr_running);
28002 -       }
28003 +       worker->sleeping = 0;
28004  }
28005  
28006  /**
28007   * wq_worker_sleeping - a worker is going to sleep
28008   * @task: task going to sleep
28009 - * @cpu: CPU in question, must be the current CPU number
28010 - *
28011 - * This function is called during schedule() when a busy worker is
28012 - * going to sleep.  Worker on the same cpu can be woken up by
28013 - * returning pointer to its task.
28014 - *
28015 - * CONTEXT:
28016 - * spin_lock_irq(rq->lock)
28017 - *
28018 - * Return:
28019 - * Worker task on @cpu to wake up, %NULL if none.
28020 + * This function is called from schedule() when a busy worker is
28021 + * going to sleep.
28022   */
28023 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28024 +void wq_worker_sleeping(struct task_struct *task)
28025  {
28026 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
28027 +       struct worker *worker = kthread_data(task);
28028         struct worker_pool *pool;
28029  
28030         /*
28031 @@ -892,29 +919,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28032          * checking NOT_RUNNING.
28033          */
28034         if (worker->flags & WORKER_NOT_RUNNING)
28035 -               return NULL;
28036 +               return;
28037  
28038         pool = worker->pool;
28039  
28040 -       /* this can only happen on the local cpu */
28041 -       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
28042 -               return NULL;
28043 +       if (WARN_ON_ONCE(worker->sleeping))
28044 +               return;
28045 +
28046 +       worker->sleeping = 1;
28047  
28048         /*
28049          * The counterpart of the following dec_and_test, implied mb,
28050          * worklist not empty test sequence is in insert_work().
28051          * Please read comment there.
28052 -        *
28053 -        * NOT_RUNNING is clear.  This means that we're bound to and
28054 -        * running on the local cpu w/ rq lock held and preemption
28055 -        * disabled, which in turn means that none else could be
28056 -        * manipulating idle_list, so dereferencing idle_list without pool
28057 -        * lock is safe.
28058          */
28059         if (atomic_dec_and_test(&pool->nr_running) &&
28060 -           !list_empty(&pool->worklist))
28061 -               to_wakeup = first_idle_worker(pool);
28062 -       return to_wakeup ? to_wakeup->task : NULL;
28063 +           !list_empty(&pool->worklist)) {
28064 +               sched_lock_idle_list(pool);
28065 +               wake_up_worker(pool);
28066 +               sched_unlock_idle_list(pool);
28067 +       }
28068  }
28069  
28070  /**
28071 @@ -1108,12 +1132,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
28072  {
28073         if (pwq) {
28074                 /*
28075 -                * As both pwqs and pools are sched-RCU protected, the
28076 +                * As both pwqs and pools are RCU protected, the
28077                  * following lock operations are safe.
28078                  */
28079 -               spin_lock_irq(&pwq->pool->lock);
28080 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
28081                 put_pwq(pwq);
28082 -               spin_unlock_irq(&pwq->pool->lock);
28083 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
28084         }
28085  }
28086  
28087 @@ -1215,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28088         struct worker_pool *pool;
28089         struct pool_workqueue *pwq;
28090  
28091 -       local_irq_save(*flags);
28092 +       local_lock_irqsave(pendingb_lock, *flags);
28093  
28094         /* try to steal the timer if it exists */
28095         if (is_dwork) {
28096 @@ -1234,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28097         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
28098                 return 0;
28099  
28100 +       rcu_read_lock();
28101         /*
28102          * The queueing is in progress, or it is already queued. Try to
28103          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
28104 @@ -1272,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28105                 set_work_pool_and_keep_pending(work, pool->id);
28106  
28107                 spin_unlock(&pool->lock);
28108 +               rcu_read_unlock();
28109                 return 1;
28110         }
28111         spin_unlock(&pool->lock);
28112  fail:
28113 -       local_irq_restore(*flags);
28114 +       rcu_read_unlock();
28115 +       local_unlock_irqrestore(pendingb_lock, *flags);
28116         if (work_is_canceling(work))
28117                 return -ENOENT;
28118 -       cpu_relax();
28119 +       cpu_chill();
28120         return -EAGAIN;
28121  }
28122  
28123 @@ -1348,7 +1375,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28124          * queued or lose PENDING.  Grabbing PENDING and queueing should
28125          * happen with IRQ disabled.
28126          */
28127 -       WARN_ON_ONCE(!irqs_disabled());
28128 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
28129  
28130         debug_work_activate(work);
28131  
28132 @@ -1356,6 +1383,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28133         if (unlikely(wq->flags & __WQ_DRAINING) &&
28134             WARN_ON_ONCE(!is_chained_work(wq)))
28135                 return;
28136 +
28137 +       rcu_read_lock();
28138  retry:
28139         if (req_cpu == WORK_CPU_UNBOUND)
28140                 cpu = raw_smp_processor_id();
28141 @@ -1412,10 +1441,8 @@ retry:
28142         /* pwq determined, queue */
28143         trace_workqueue_queue_work(req_cpu, pwq, work);
28144  
28145 -       if (WARN_ON(!list_empty(&work->entry))) {
28146 -               spin_unlock(&pwq->pool->lock);
28147 -               return;
28148 -       }
28149 +       if (WARN_ON(!list_empty(&work->entry)))
28150 +               goto out;
28151  
28152         pwq->nr_in_flight[pwq->work_color]++;
28153         work_flags = work_color_to_flags(pwq->work_color);
28154 @@ -1431,7 +1458,9 @@ retry:
28155  
28156         insert_work(pwq, work, worklist, work_flags);
28157  
28158 +out:
28159         spin_unlock(&pwq->pool->lock);
28160 +       rcu_read_unlock();
28161  }
28162  
28163  /**
28164 @@ -1451,14 +1480,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
28165         bool ret = false;
28166         unsigned long flags;
28167  
28168 -       local_irq_save(flags);
28169 +       local_lock_irqsave(pendingb_lock, flags);
28170  
28171         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28172                 __queue_work(cpu, wq, work);
28173                 ret = true;
28174         }
28175  
28176 -       local_irq_restore(flags);
28177 +       local_unlock_irqrestore(pendingb_lock, flags);
28178         return ret;
28179  }
28180  EXPORT_SYMBOL(queue_work_on);
28181 @@ -1525,14 +1554,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
28182         unsigned long flags;
28183  
28184         /* read the comment in __queue_work() */
28185 -       local_irq_save(flags);
28186 +       local_lock_irqsave(pendingb_lock, flags);
28187  
28188         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28189                 __queue_delayed_work(cpu, wq, dwork, delay);
28190                 ret = true;
28191         }
28192  
28193 -       local_irq_restore(flags);
28194 +       local_unlock_irqrestore(pendingb_lock, flags);
28195         return ret;
28196  }
28197  EXPORT_SYMBOL(queue_delayed_work_on);
28198 @@ -1567,7 +1596,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
28199  
28200         if (likely(ret >= 0)) {
28201                 __queue_delayed_work(cpu, wq, dwork, delay);
28202 -               local_irq_restore(flags);
28203 +               local_unlock_irqrestore(pendingb_lock, flags);
28204         }
28205  
28206         /* -ENOENT from try_to_grab_pending() becomes %true */
28207 @@ -1600,7 +1629,9 @@ static void worker_enter_idle(struct worker *worker)
28208         worker->last_active = jiffies;
28209  
28210         /* idle_list is LIFO */
28211 +       rt_lock_idle_list(pool);
28212         list_add(&worker->entry, &pool->idle_list);
28213 +       rt_unlock_idle_list(pool);
28214  
28215         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
28216                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
28217 @@ -1633,7 +1664,9 @@ static void worker_leave_idle(struct worker *worker)
28218                 return;
28219         worker_clr_flags(worker, WORKER_IDLE);
28220         pool->nr_idle--;
28221 +       rt_lock_idle_list(pool);
28222         list_del_init(&worker->entry);
28223 +       rt_unlock_idle_list(pool);
28224  }
28225  
28226  static struct worker *alloc_worker(int node)
28227 @@ -1799,7 +1832,9 @@ static void destroy_worker(struct worker *worker)
28228         pool->nr_workers--;
28229         pool->nr_idle--;
28230  
28231 +       rt_lock_idle_list(pool);
28232         list_del_init(&worker->entry);
28233 +       rt_unlock_idle_list(pool);
28234         worker->flags |= WORKER_DIE;
28235         wake_up_process(worker->task);
28236  }
28237 @@ -2716,14 +2751,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28238  
28239         might_sleep();
28240  
28241 -       local_irq_disable();
28242 +       rcu_read_lock();
28243         pool = get_work_pool(work);
28244         if (!pool) {
28245 -               local_irq_enable();
28246 +               rcu_read_unlock();
28247                 return false;
28248         }
28249  
28250 -       spin_lock(&pool->lock);
28251 +       spin_lock_irq(&pool->lock);
28252         /* see the comment in try_to_grab_pending() with the same code */
28253         pwq = get_work_pwq(work);
28254         if (pwq) {
28255 @@ -2750,10 +2785,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28256         else
28257                 lock_map_acquire_read(&pwq->wq->lockdep_map);
28258         lock_map_release(&pwq->wq->lockdep_map);
28259 -
28260 +       rcu_read_unlock();
28261         return true;
28262  already_gone:
28263         spin_unlock_irq(&pool->lock);
28264 +       rcu_read_unlock();
28265         return false;
28266  }
28267  
28268 @@ -2840,7 +2876,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
28269  
28270         /* tell other tasks trying to grab @work to back off */
28271         mark_work_canceling(work);
28272 -       local_irq_restore(flags);
28273 +       local_unlock_irqrestore(pendingb_lock, flags);
28274  
28275         flush_work(work);
28276         clear_work_data(work);
28277 @@ -2895,10 +2931,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
28278   */
28279  bool flush_delayed_work(struct delayed_work *dwork)
28280  {
28281 -       local_irq_disable();
28282 +       local_lock_irq(pendingb_lock);
28283         if (del_timer_sync(&dwork->timer))
28284                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
28285 -       local_irq_enable();
28286 +       local_unlock_irq(pendingb_lock);
28287         return flush_work(&dwork->work);
28288  }
28289  EXPORT_SYMBOL(flush_delayed_work);
28290 @@ -2933,7 +2969,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
28291  
28292         set_work_pool_and_clear_pending(&dwork->work,
28293                                         get_work_pool_id(&dwork->work));
28294 -       local_irq_restore(flags);
28295 +       local_unlock_irqrestore(pendingb_lock, flags);
28296         return ret;
28297  }
28298  EXPORT_SYMBOL(cancel_delayed_work);
28299 @@ -3161,7 +3197,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
28300   * put_unbound_pool - put a worker_pool
28301   * @pool: worker_pool to put
28302   *
28303 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
28304 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
28305   * safe manner.  get_unbound_pool() calls this function on its failure path
28306   * and this function should be able to release pools which went through,
28307   * successfully or not, init_worker_pool().
28308 @@ -3215,8 +3251,8 @@ static void put_unbound_pool(struct worker_pool *pool)
28309         del_timer_sync(&pool->idle_timer);
28310         del_timer_sync(&pool->mayday_timer);
28311  
28312 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
28313 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
28314 +       /* RCU protected to allow dereferences from get_work_pool() */
28315 +       call_rcu(&pool->rcu, rcu_free_pool);
28316  }
28317  
28318  /**
28319 @@ -3323,14 +3359,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
28320         put_unbound_pool(pool);
28321         mutex_unlock(&wq_pool_mutex);
28322  
28323 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
28324 +       call_rcu(&pwq->rcu, rcu_free_pwq);
28325  
28326         /*
28327          * If we're the last pwq going away, @wq is already dead and no one
28328          * is gonna access it anymore.  Schedule RCU free.
28329          */
28330         if (is_last)
28331 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28332 +               call_rcu(&wq->rcu, rcu_free_wq);
28333  }
28334  
28335  /**
28336 @@ -3983,7 +4019,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
28337                  * The base ref is never dropped on per-cpu pwqs.  Directly
28338                  * schedule RCU free.
28339                  */
28340 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28341 +               call_rcu(&wq->rcu, rcu_free_wq);
28342         } else {
28343                 /*
28344                  * We're the sole accessor of @wq at this point.  Directly
28345 @@ -4076,7 +4112,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28346         struct pool_workqueue *pwq;
28347         bool ret;
28348  
28349 -       rcu_read_lock_sched();
28350 +       rcu_read_lock();
28351 +       preempt_disable();
28352  
28353         if (cpu == WORK_CPU_UNBOUND)
28354                 cpu = smp_processor_id();
28355 @@ -4087,7 +4124,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28356                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
28357  
28358         ret = !list_empty(&pwq->delayed_works);
28359 -       rcu_read_unlock_sched();
28360 +       preempt_enable();
28361 +       rcu_read_unlock();
28362  
28363         return ret;
28364  }
28365 @@ -4113,15 +4151,15 @@ unsigned int work_busy(struct work_struct *work)
28366         if (work_pending(work))
28367                 ret |= WORK_BUSY_PENDING;
28368  
28369 -       local_irq_save(flags);
28370 +       rcu_read_lock();
28371         pool = get_work_pool(work);
28372         if (pool) {
28373 -               spin_lock(&pool->lock);
28374 +               spin_lock_irqsave(&pool->lock, flags);
28375                 if (find_worker_executing_work(pool, work))
28376                         ret |= WORK_BUSY_RUNNING;
28377 -               spin_unlock(&pool->lock);
28378 +               spin_unlock_irqrestore(&pool->lock, flags);
28379         }
28380 -       local_irq_restore(flags);
28381 +       rcu_read_unlock();
28382  
28383         return ret;
28384  }
28385 @@ -4310,7 +4348,7 @@ void show_workqueue_state(void)
28386         unsigned long flags;
28387         int pi;
28388  
28389 -       rcu_read_lock_sched();
28390 +       rcu_read_lock();
28391  
28392         pr_info("Showing busy workqueues and worker pools:\n");
28393  
28394 @@ -4361,7 +4399,7 @@ void show_workqueue_state(void)
28395                 spin_unlock_irqrestore(&pool->lock, flags);
28396         }
28397  
28398 -       rcu_read_unlock_sched();
28399 +       rcu_read_unlock();
28400  }
28401  
28402  /*
28403 @@ -4722,16 +4760,16 @@ bool freeze_workqueues_busy(void)
28404                  * nr_active is monotonically decreasing.  It's safe
28405                  * to peek without lock.
28406                  */
28407 -               rcu_read_lock_sched();
28408 +               rcu_read_lock();
28409                 for_each_pwq(pwq, wq) {
28410                         WARN_ON_ONCE(pwq->nr_active < 0);
28411                         if (pwq->nr_active) {
28412                                 busy = true;
28413 -                               rcu_read_unlock_sched();
28414 +                               rcu_read_unlock();
28415                                 goto out_unlock;
28416                         }
28417                 }
28418 -               rcu_read_unlock_sched();
28419 +               rcu_read_unlock();
28420         }
28421  out_unlock:
28422         mutex_unlock(&wq_pool_mutex);
28423 @@ -4921,7 +4959,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28424         const char *delim = "";
28425         int node, written = 0;
28426  
28427 -       rcu_read_lock_sched();
28428 +       get_online_cpus();
28429 +       rcu_read_lock();
28430         for_each_node(node) {
28431                 written += scnprintf(buf + written, PAGE_SIZE - written,
28432                                      "%s%d:%d", delim, node,
28433 @@ -4929,7 +4968,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28434                 delim = " ";
28435         }
28436         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
28437 -       rcu_read_unlock_sched();
28438 +       rcu_read_unlock();
28439 +       put_online_cpus();
28440  
28441         return written;
28442  }
28443 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
28444 index 45215870ac6c..f000c4d6917e 100644
28445 --- a/kernel/workqueue_internal.h
28446 +++ b/kernel/workqueue_internal.h
28447 @@ -43,6 +43,7 @@ struct worker {
28448         unsigned long           last_active;    /* L: last active timestamp */
28449         unsigned int            flags;          /* X: flags */
28450         int                     id;             /* I: worker id */
28451 +       int                     sleeping;       /* None */
28452  
28453         /*
28454          * Opaque string set with work_set_desc().  Printed out with task
28455 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
28456   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
28457   * sched/core.c and workqueue.c.
28458   */
28459 -void wq_worker_waking_up(struct task_struct *task, int cpu);
28460 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
28461 +void wq_worker_running(struct task_struct *task);
28462 +void wq_worker_sleeping(struct task_struct *task);
28463  
28464  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
28465 diff --git a/lib/Kconfig b/lib/Kconfig
28466 index 1a48744253d7..f75de578cca8 100644
28467 --- a/lib/Kconfig
28468 +++ b/lib/Kconfig
28469 @@ -397,6 +397,7 @@ config CHECK_SIGNATURE
28470  
28471  config CPUMASK_OFFSTACK
28472         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
28473 +       depends on !PREEMPT_RT_FULL
28474         help
28475           Use dynamic allocation for cpumask_var_t, instead of putting
28476           them on the stack.  This is a bit more expensive, but avoids
28477 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
28478 index 547f7f923dbc..8fcdbc2fc6d0 100644
28479 --- a/lib/debugobjects.c
28480 +++ b/lib/debugobjects.c
28481 @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
28482         struct debug_obj *obj;
28483         unsigned long flags;
28484  
28485 -       fill_pool();
28486 +#ifdef CONFIG_PREEMPT_RT_FULL
28487 +       if (preempt_count() == 0 && !irqs_disabled())
28488 +#endif
28489 +               fill_pool();
28490  
28491         db = get_bucket((unsigned long) addr);
28492  
28493 diff --git a/lib/idr.c b/lib/idr.c
28494 index 6098336df267..9decbe914595 100644
28495 --- a/lib/idr.c
28496 +++ b/lib/idr.c
28497 @@ -30,6 +30,7 @@
28498  #include <linux/idr.h>
28499  #include <linux/spinlock.h>
28500  #include <linux/percpu.h>
28501 +#include <linux/locallock.h>
28502  
28503  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
28504  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
28505 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
28506  static DEFINE_PER_CPU(int, idr_preload_cnt);
28507  static DEFINE_SPINLOCK(simple_ida_lock);
28508  
28509 +#ifdef CONFIG_PREEMPT_RT_FULL
28510 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
28511 +
28512 +static inline void idr_preload_lock(void)
28513 +{
28514 +       local_lock(idr_lock);
28515 +}
28516 +
28517 +static inline void idr_preload_unlock(void)
28518 +{
28519 +       local_unlock(idr_lock);
28520 +}
28521 +
28522 +void idr_preload_end(void)
28523 +{
28524 +       idr_preload_unlock();
28525 +}
28526 +EXPORT_SYMBOL(idr_preload_end);
28527 +#else
28528 +static inline void idr_preload_lock(void)
28529 +{
28530 +       preempt_disable();
28531 +}
28532 +
28533 +static inline void idr_preload_unlock(void)
28534 +{
28535 +       preempt_enable();
28536 +}
28537 +#endif
28538 +
28539 +
28540  /* the maximum ID which can be allocated given idr->layers */
28541  static int idr_max(int layers)
28542  {
28543 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
28544          * context.  See idr_preload() for details.
28545          */
28546         if (!in_interrupt()) {
28547 -               preempt_disable();
28548 +               idr_preload_lock();
28549                 new = __this_cpu_read(idr_preload_head);
28550                 if (new) {
28551                         __this_cpu_write(idr_preload_head, new->ary[0]);
28552                         __this_cpu_dec(idr_preload_cnt);
28553                         new->ary[0] = NULL;
28554                 }
28555 -               preempt_enable();
28556 +               idr_preload_unlock();
28557                 if (new)
28558                         return new;
28559         }
28560 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
28561         idr_mark_full(pa, id);
28562  }
28563  
28564 -
28565  /**
28566   * idr_preload - preload for idr_alloc()
28567   * @gfp_mask: allocation mask to use for preloading
28568 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
28569         WARN_ON_ONCE(in_interrupt());
28570         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
28571  
28572 -       preempt_disable();
28573 +       idr_preload_lock();
28574  
28575         /*
28576          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
28577 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
28578         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
28579                 struct idr_layer *new;
28580  
28581 -               preempt_enable();
28582 +               idr_preload_unlock();
28583                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
28584 -               preempt_disable();
28585 +               idr_preload_lock();
28586                 if (!new)
28587                         break;
28588  
28589 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
28590 index 872a15a2a637..b93a6103fa4d 100644
28591 --- a/lib/locking-selftest.c
28592 +++ b/lib/locking-selftest.c
28593 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
28594  #include "locking-selftest-spin-hardirq.h"
28595  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
28596  
28597 +#ifndef CONFIG_PREEMPT_RT_FULL
28598 +
28599  #include "locking-selftest-rlock-hardirq.h"
28600  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
28601  
28602 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
28603  #include "locking-selftest-wlock-softirq.h"
28604  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
28605  
28606 +#endif
28607 +
28608  #undef E1
28609  #undef E2
28610  
28611 +#ifndef CONFIG_PREEMPT_RT_FULL
28612  /*
28613   * Enabling hardirqs with a softirq-safe lock held:
28614   */
28615 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28616  #undef E1
28617  #undef E2
28618  
28619 +#endif
28620 +
28621  /*
28622   * Enabling irqs with an irq-safe lock held:
28623   */
28624 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28625  #include "locking-selftest-spin-hardirq.h"
28626  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
28627  
28628 +#ifndef CONFIG_PREEMPT_RT_FULL
28629 +
28630  #include "locking-selftest-rlock-hardirq.h"
28631  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
28632  
28633 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
28634  #include "locking-selftest-wlock-softirq.h"
28635  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28636  
28637 +#endif
28638 +
28639  #undef E1
28640  #undef E2
28641  
28642 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28643  #include "locking-selftest-spin-hardirq.h"
28644  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
28645  
28646 +#ifndef CONFIG_PREEMPT_RT_FULL
28647 +
28648  #include "locking-selftest-rlock-hardirq.h"
28649  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
28650  
28651 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
28652  #include "locking-selftest-wlock-softirq.h"
28653  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28654  
28655 +#endif
28656 +
28657  #undef E1
28658  #undef E2
28659  #undef E3
28660 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28661  #include "locking-selftest-spin-hardirq.h"
28662  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
28663  
28664 +#ifndef CONFIG_PREEMPT_RT_FULL
28665 +
28666  #include "locking-selftest-rlock-hardirq.h"
28667  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
28668  
28669 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
28670  #include "locking-selftest-wlock-softirq.h"
28671  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
28672  
28673 +#endif
28674 +
28675  #undef E1
28676  #undef E2
28677  #undef E3
28678  
28679 +#ifndef CONFIG_PREEMPT_RT_FULL
28680 +
28681  /*
28682   * read-lock / write-lock irq inversion.
28683   *
28684 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
28685  #undef E2
28686  #undef E3
28687  
28688 +#endif
28689 +
28690 +#ifndef CONFIG_PREEMPT_RT_FULL
28691 +
28692  /*
28693   * read-lock / write-lock recursion that is actually safe.
28694   */
28695 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
28696  #undef E2
28697  #undef E3
28698  
28699 +#endif
28700 +
28701  /*
28702   * read-lock / write-lock recursion that is unsafe.
28703   */
28704 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
28705  
28706         printk("  --------------------------------------------------------------------------\n");
28707  
28708 +#ifndef CONFIG_PREEMPT_RT_FULL
28709         /*
28710          * irq-context testcases:
28711          */
28712 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
28713  
28714         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
28715  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
28716 +#else
28717 +       /* On -rt, we only do hardirq context test for raw spinlock */
28718 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
28719 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
28720 +
28721 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
28722 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
28723 +
28724 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
28725 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
28726 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
28727 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
28728 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
28729 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
28730 +
28731 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
28732 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
28733 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
28734 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
28735 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
28736 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
28737 +#endif
28738  
28739         ww_tests();
28740  
28741 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
28742 index 6d40944960de..822a2c027e72 100644
28743 --- a/lib/percpu_ida.c
28744 +++ b/lib/percpu_ida.c
28745 @@ -26,6 +26,9 @@
28746  #include <linux/string.h>
28747  #include <linux/spinlock.h>
28748  #include <linux/percpu_ida.h>
28749 +#include <linux/locallock.h>
28750 +
28751 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
28752  
28753  struct percpu_ida_cpu {
28754         /*
28755 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28756         unsigned long flags;
28757         int tag;
28758  
28759 -       local_irq_save(flags);
28760 +       local_lock_irqsave(irq_off_lock, flags);
28761         tags = this_cpu_ptr(pool->tag_cpu);
28762  
28763         /* Fastpath */
28764         tag = alloc_local_tag(tags);
28765         if (likely(tag >= 0)) {
28766 -               local_irq_restore(flags);
28767 +               local_unlock_irqrestore(irq_off_lock, flags);
28768                 return tag;
28769         }
28770  
28771 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28772  
28773                 if (!tags->nr_free)
28774                         alloc_global_tags(pool, tags);
28775 +
28776                 if (!tags->nr_free)
28777                         steal_tags(pool, tags);
28778  
28779 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28780                 }
28781  
28782                 spin_unlock(&pool->lock);
28783 -               local_irq_restore(flags);
28784 +               local_unlock_irqrestore(irq_off_lock, flags);
28785  
28786                 if (tag >= 0 || state == TASK_RUNNING)
28787                         break;
28788 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28789  
28790                 schedule();
28791  
28792 -               local_irq_save(flags);
28793 +               local_lock_irqsave(irq_off_lock, flags);
28794                 tags = this_cpu_ptr(pool->tag_cpu);
28795         }
28796         if (state != TASK_RUNNING)
28797 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28798  
28799         BUG_ON(tag >= pool->nr_tags);
28800  
28801 -       local_irq_save(flags);
28802 +       local_lock_irqsave(irq_off_lock, flags);
28803         tags = this_cpu_ptr(pool->tag_cpu);
28804  
28805         spin_lock(&tags->lock);
28806 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28807                 spin_unlock(&pool->lock);
28808         }
28809  
28810 -       local_irq_restore(flags);
28811 +       local_unlock_irqrestore(irq_off_lock, flags);
28812  }
28813  EXPORT_SYMBOL_GPL(percpu_ida_free);
28814  
28815 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28816         struct percpu_ida_cpu *remote;
28817         unsigned cpu, i, err = 0;
28818  
28819 -       local_irq_save(flags);
28820 +       local_lock_irqsave(irq_off_lock, flags);
28821         for_each_possible_cpu(cpu) {
28822                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
28823                 spin_lock(&remote->lock);
28824 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28825         }
28826         spin_unlock(&pool->lock);
28827  out:
28828 -       local_irq_restore(flags);
28829 +       local_unlock_irqrestore(irq_off_lock, flags);
28830         return err;
28831  }
28832  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
28833 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
28834 index 6b79e9026e24..f27e0bcb74f7 100644
28835 --- a/lib/radix-tree.c
28836 +++ b/lib/radix-tree.c
28837 @@ -196,13 +196,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
28838                  * succeed in getting a node here (and never reach
28839                  * kmem_cache_alloc)
28840                  */
28841 -               rtp = this_cpu_ptr(&radix_tree_preloads);
28842 +               rtp = &get_cpu_var(radix_tree_preloads);
28843                 if (rtp->nr) {
28844                         ret = rtp->nodes;
28845                         rtp->nodes = ret->private_data;
28846                         ret->private_data = NULL;
28847                         rtp->nr--;
28848                 }
28849 +               put_cpu_var(radix_tree_preloads);
28850                 /*
28851                  * Update the allocation stack trace as this is more useful
28852                  * for debugging.
28853 @@ -242,6 +243,7 @@ radix_tree_node_free(struct radix_tree_node *node)
28854         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
28855  }
28856  
28857 +#ifndef CONFIG_PREEMPT_RT_FULL
28858  /*
28859   * Load up this CPU's radix_tree_node buffer with sufficient objects to
28860   * ensure that the addition of a single element in the tree cannot fail.  On
28861 @@ -310,6 +312,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
28862         return 0;
28863  }
28864  EXPORT_SYMBOL(radix_tree_maybe_preload);
28865 +#endif
28866  
28867  /*
28868   *     Return the maximum key which can be store into a
28869 diff --git a/lib/rbtree.c b/lib/rbtree.c
28870 index 1356454e36de..d15d6c4327f1 100644
28871 --- a/lib/rbtree.c
28872 +++ b/lib/rbtree.c
28873 @@ -23,6 +23,7 @@
28874  
28875  #include <linux/rbtree_augmented.h>
28876  #include <linux/export.h>
28877 +#include <linux/rcupdate.h>
28878  
28879  /*
28880   * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
28881 @@ -590,3 +591,13 @@ struct rb_node *rb_first_postorder(const struct rb_root *root)
28882         return rb_left_deepest_node(root->rb_node);
28883  }
28884  EXPORT_SYMBOL(rb_first_postorder);
28885 +
28886 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
28887 +                                   struct rb_node **rb_link)
28888 +{
28889 +       node->__rb_parent_color = (unsigned long)parent;
28890 +       node->rb_left = node->rb_right = NULL;
28891 +
28892 +       rcu_assign_pointer(*rb_link, node);
28893 +}
28894 +EXPORT_SYMBOL(rb_link_node_rcu);
28895 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
28896 index bafa9933fa76..ebe3b7edd086 100644
28897 --- a/lib/scatterlist.c
28898 +++ b/lib/scatterlist.c
28899 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
28900                         flush_kernel_dcache_page(miter->page);
28901  
28902                 if (miter->__flags & SG_MITER_ATOMIC) {
28903 -                       WARN_ON_ONCE(preemptible());
28904 +                       WARN_ON_ONCE(!pagefault_disabled());
28905                         kunmap_atomic(miter->addr);
28906                 } else
28907                         kunmap(miter->page);
28908 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28909         if (!sg_miter_skip(&miter, skip))
28910                 return false;
28911  
28912 -       local_irq_save(flags);
28913 +       local_irq_save_nort(flags);
28914  
28915         while (sg_miter_next(&miter) && offset < buflen) {
28916                 unsigned int len;
28917 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28918  
28919         sg_miter_stop(&miter);
28920  
28921 -       local_irq_restore(flags);
28922 +       local_irq_restore_nort(flags);
28923         return offset;
28924  }
28925  EXPORT_SYMBOL(sg_copy_buffer);
28926 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
28927 index 1afec32de6f2..11fa431046a8 100644
28928 --- a/lib/smp_processor_id.c
28929 +++ b/lib/smp_processor_id.c
28930 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
28931         if (!printk_ratelimit())
28932                 goto out_enable;
28933  
28934 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
28935 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
28936 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
28937 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
28938 +               current->comm, current->pid);
28939  
28940         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
28941         dump_stack();
28942 diff --git a/localversion-rt b/localversion-rt
28943 new file mode 100644
28944 index 000000000000..e1d836252017
28945 --- /dev/null
28946 +++ b/localversion-rt
28947 @@ -0,0 +1 @@
28948 +-rt33
28949 diff --git a/mm/Kconfig b/mm/Kconfig
28950 index 97a4e06b15c0..9614351e68b8 100644
28951 --- a/mm/Kconfig
28952 +++ b/mm/Kconfig
28953 @@ -392,7 +392,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
28954  
28955  config TRANSPARENT_HUGEPAGE
28956         bool "Transparent Hugepage Support"
28957 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
28958 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
28959         select COMPACTION
28960         help
28961           Transparent Hugepages allows the kernel to use huge pages and
28962 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
28963 index 9ef80bf441b3..826fed55c1cc 100644
28964 --- a/mm/backing-dev.c
28965 +++ b/mm/backing-dev.c
28966 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
28967  {
28968         unsigned long flags;
28969  
28970 -       local_irq_save(flags);
28971 +       local_irq_save_nort(flags);
28972         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
28973 -               local_irq_restore(flags);
28974 +               local_irq_restore_nort(flags);
28975                 return;
28976         }
28977  
28978 diff --git a/mm/compaction.c b/mm/compaction.c
28979 index dba02dec7195..51963f58a29b 100644
28980 --- a/mm/compaction.c
28981 +++ b/mm/compaction.c
28982 @@ -1430,10 +1430,12 @@ check_drain:
28983                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
28984  
28985                         if (cc->last_migrated_pfn < current_block_start) {
28986 -                               cpu = get_cpu();
28987 +                               cpu = get_cpu_light();
28988 +                               local_lock_irq(swapvec_lock);
28989                                 lru_add_drain_cpu(cpu);
28990 +                               local_unlock_irq(swapvec_lock);
28991                                 drain_local_pages(zone);
28992 -                               put_cpu();
28993 +                               put_cpu_light();
28994                                 /* No more flushing until we migrate again */
28995                                 cc->last_migrated_pfn = 0;
28996                         }
28997 diff --git a/mm/filemap.c b/mm/filemap.c
28998 index 1bb007624b53..44301361c100 100644
28999 --- a/mm/filemap.c
29000 +++ b/mm/filemap.c
29001 @@ -168,7 +168,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
29002         if (!workingset_node_pages(node) &&
29003             list_empty(&node->private_list)) {
29004                 node->private_data = mapping;
29005 -               list_lru_add(&workingset_shadow_nodes, &node->private_list);
29006 +               local_lock(workingset_shadow_lock);
29007 +               list_lru_add(&__workingset_shadow_nodes, &node->private_list);
29008 +               local_unlock(workingset_shadow_lock);
29009         }
29010  }
29011  
29012 @@ -597,9 +599,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
29013                  * node->private_list is protected by
29014                  * mapping->tree_lock.
29015                  */
29016 -               if (!list_empty(&node->private_list))
29017 -                       list_lru_del(&workingset_shadow_nodes,
29018 +               if (!list_empty(&node->private_list)) {
29019 +                       local_lock(workingset_shadow_lock);
29020 +                       list_lru_del(&__workingset_shadow_nodes,
29021                                      &node->private_list);
29022 +                       local_unlock(workingset_shadow_lock);
29023 +               }
29024         }
29025         return 0;
29026  }
29027 diff --git a/mm/highmem.c b/mm/highmem.c
29028 index 123bcd3ed4f2..16e8cf26d38a 100644
29029 --- a/mm/highmem.c
29030 +++ b/mm/highmem.c
29031 @@ -29,10 +29,11 @@
29032  #include <linux/kgdb.h>
29033  #include <asm/tlbflush.h>
29034  
29035 -
29036 +#ifndef CONFIG_PREEMPT_RT_FULL
29037  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
29038  DEFINE_PER_CPU(int, __kmap_atomic_idx);
29039  #endif
29040 +#endif
29041  
29042  /*
29043   * Virtual_count is not a pure "count".
29044 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
29045  unsigned long totalhigh_pages __read_mostly;
29046  EXPORT_SYMBOL(totalhigh_pages);
29047  
29048 -
29049 +#ifndef CONFIG_PREEMPT_RT_FULL
29050  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
29051 +#endif
29052  
29053  unsigned int nr_free_highpages (void)
29054  {
29055 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
29056 index 6b90d184e9c0..ed7aa011ad70 100644
29057 --- a/mm/memcontrol.c
29058 +++ b/mm/memcontrol.c
29059 @@ -67,6 +67,8 @@
29060  #include <net/sock.h>
29061  #include <net/ip.h>
29062  #include <net/tcp_memcontrol.h>
29063 +#include <linux/locallock.h>
29064 +
29065  #include "slab.h"
29066  
29067  #include <asm/uaccess.h>
29068 @@ -87,6 +89,7 @@ int do_swap_account __read_mostly;
29069  #define do_swap_account                0
29070  #endif
29071  
29072 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
29073  static const char * const mem_cgroup_stat_names[] = {
29074         "cache",
29075         "rss",
29076 @@ -1922,14 +1925,17 @@ static void drain_local_stock(struct work_struct *dummy)
29077   */
29078  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
29079  {
29080 -       struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
29081 +       struct memcg_stock_pcp *stock;
29082 +       int cpu = get_cpu_light();
29083 +
29084 +       stock = &per_cpu(memcg_stock, cpu);
29085  
29086         if (stock->cached != memcg) { /* reset if necessary */
29087                 drain_stock(stock);
29088                 stock->cached = memcg;
29089         }
29090         stock->nr_pages += nr_pages;
29091 -       put_cpu_var(memcg_stock);
29092 +       put_cpu_light();
29093  }
29094  
29095  /*
29096 @@ -1945,7 +1951,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29097                 return;
29098         /* Notify other cpus that system-wide "drain" is running */
29099         get_online_cpus();
29100 -       curcpu = get_cpu();
29101 +       curcpu = get_cpu_light();
29102         for_each_online_cpu(cpu) {
29103                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
29104                 struct mem_cgroup *memcg;
29105 @@ -1962,7 +1968,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29106                                 schedule_work_on(cpu, &stock->work);
29107                 }
29108         }
29109 -       put_cpu();
29110 +       put_cpu_light();
29111         put_online_cpus();
29112         mutex_unlock(&percpu_charge_mutex);
29113  }
29114 @@ -4700,12 +4706,12 @@ static int mem_cgroup_move_account(struct page *page,
29115  
29116         ret = 0;
29117  
29118 -       local_irq_disable();
29119 +       local_lock_irq(event_lock);
29120         mem_cgroup_charge_statistics(to, page, nr_pages);
29121         memcg_check_events(to, page);
29122         mem_cgroup_charge_statistics(from, page, -nr_pages);
29123         memcg_check_events(from, page);
29124 -       local_irq_enable();
29125 +       local_unlock_irq(event_lock);
29126  out_unlock:
29127         unlock_page(page);
29128  out:
29129 @@ -5495,10 +5501,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
29130                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
29131         }
29132  
29133 -       local_irq_disable();
29134 +       local_lock_irq(event_lock);
29135         mem_cgroup_charge_statistics(memcg, page, nr_pages);
29136         memcg_check_events(memcg, page);
29137 -       local_irq_enable();
29138 +       local_unlock_irq(event_lock);
29139  
29140         if (do_swap_account && PageSwapCache(page)) {
29141                 swp_entry_t entry = { .val = page_private(page) };
29142 @@ -5554,14 +5560,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
29143                 memcg_oom_recover(memcg);
29144         }
29145  
29146 -       local_irq_save(flags);
29147 +       local_lock_irqsave(event_lock, flags);
29148         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
29149         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
29150         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
29151         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
29152         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
29153         memcg_check_events(memcg, dummy_page);
29154 -       local_irq_restore(flags);
29155 +       local_unlock_irqrestore(event_lock, flags);
29156  
29157         if (!mem_cgroup_is_root(memcg))
29158                 css_put_many(&memcg->css, nr_pages);
29159 @@ -5753,6 +5759,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29160  {
29161         struct mem_cgroup *memcg, *swap_memcg;
29162         unsigned short oldid;
29163 +       unsigned long flags;
29164  
29165         VM_BUG_ON_PAGE(PageLRU(page), page);
29166         VM_BUG_ON_PAGE(page_count(page), page);
29167 @@ -5793,12 +5800,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29168          * important here to have the interrupts disabled because it is the
29169          * only synchronisation we have for udpating the per-CPU variables.
29170          */
29171 +       local_lock_irqsave(event_lock, flags);
29172 +#ifndef CONFIG_PREEMPT_RT_BASE
29173         VM_BUG_ON(!irqs_disabled());
29174 +#endif
29175         mem_cgroup_charge_statistics(memcg, page, -1);
29176         memcg_check_events(memcg, page);
29177  
29178         if (!mem_cgroup_is_root(memcg))
29179                 css_put(&memcg->css);
29180 +       local_unlock_irqrestore(event_lock, flags);
29181  }
29182  
29183  /**
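
The mm/memcontrol.c hunks above replace plain interrupt disabling around the per-CPU statistics and event checks with a named local lock (event_lock), and put_cpu() with put_cpu_light(), so on PREEMPT_RT the section stays preemptible while remaining serialized against remote access. A minimal userspace analogy of that shift, assuming nothing beyond POSIX threads; all names such as struct pcpu_stat and NSLOTS are invented here, and an ordinary pthread mutex stands in for the per-CPU local lock:

	/*
	 * Userspace analogy of the event_lock change: instead of relying on
	 * "interrupts off" to serialize per-CPU statistics, every per-CPU
	 * slot carries an explicit lock that both the local fast path and a
	 * remote drainer take.
	 */
	#include <pthread.h>
	#include <stdio.h>

	#define NSLOTS 4

	struct pcpu_stat {
		pthread_mutex_t lock;	/* stands in for the local lock */
		long nr_events;
	};

	static struct pcpu_stat stats[NSLOTS];

	/* Local fast path: update "this CPU's" counter under its own lock. */
	static void charge_event(int slot)
	{
		pthread_mutex_lock(&stats[slot].lock);
		stats[slot].nr_events++;
		pthread_mutex_unlock(&stats[slot].lock);
	}

	/* Remote path: another thread folds any slot without irq-off regions. */
	static long drain_all(void)
	{
		long sum = 0;

		for (int i = 0; i < NSLOTS; i++) {
			pthread_mutex_lock(&stats[i].lock);
			sum += stats[i].nr_events;
			stats[i].nr_events = 0;
			pthread_mutex_unlock(&stats[i].lock);
		}
		return sum;
	}

	static void *worker(void *arg)
	{
		int slot = (int)(long)arg;

		for (int i = 0; i < 100000; i++)
			charge_event(slot);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NSLOTS];

		for (int i = 0; i < NSLOTS; i++)
			pthread_mutex_init(&stats[i].lock, NULL);
		for (int i = 0; i < NSLOTS; i++)
			pthread_create(&tid[i], NULL, worker, (void *)(long)i);
		for (int i = 0; i < NSLOTS; i++)
			pthread_join(tid[i], NULL);
		printf("drained %ld events\n", drain_all());
		return 0;
	}

Build with "cc -pthread". The point is only that the local update path and the remote drain take the same small per-slot lock, which is what the local lock gives the RT kernel in place of an irq-off section.
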
29184 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
29185 index f802c2d216a7..b1b6f238e42d 100644
29186 --- a/mm/mmu_context.c
29187 +++ b/mm/mmu_context.c
29188 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
29189         struct task_struct *tsk = current;
29190  
29191         task_lock(tsk);
29192 +       preempt_disable_rt();
29193         active_mm = tsk->active_mm;
29194         if (active_mm != mm) {
29195                 atomic_inc(&mm->mm_count);
29196 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
29197         }
29198         tsk->mm = mm;
29199         switch_mm(active_mm, mm, tsk);
29200 +       preempt_enable_rt();
29201         task_unlock(tsk);
29202  #ifdef finish_arch_post_lock_switch
29203         finish_arch_post_lock_switch();
29204 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
29205 index 2bcdfbf8c36d..a500c9e740dd 100644
29206 --- a/mm/page_alloc.c
29207 +++ b/mm/page_alloc.c
29208 @@ -60,6 +60,7 @@
29209  #include <linux/page_ext.h>
29210  #include <linux/hugetlb.h>
29211  #include <linux/sched/rt.h>
29212 +#include <linux/locallock.h>
29213  #include <linux/page_owner.h>
29214  #include <linux/kthread.h>
29215  
29216 @@ -264,6 +265,18 @@ EXPORT_SYMBOL(nr_node_ids);
29217  EXPORT_SYMBOL(nr_online_nodes);
29218  #endif
29219  
29220 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
29221 +
29222 +#ifdef CONFIG_PREEMPT_RT_BASE
29223 +# define cpu_lock_irqsave(cpu, flags)          \
29224 +       local_lock_irqsave_on(pa_lock, flags, cpu)
29225 +# define cpu_unlock_irqrestore(cpu, flags)     \
29226 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
29227 +#else
29228 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
29229 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
29230 +#endif
29231 +
29232  int page_group_by_mobility_disabled __read_mostly;
29233  
29234  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
29235 @@ -786,7 +799,7 @@ static inline int free_pages_check(struct page *page)
29236  }
29237  
29238  /*
29239 - * Frees a number of pages from the PCP lists
29240 + * Frees a number of pages which have been collected from the pcp lists.
29241   * Assumes all pages on list are in same zone, and of same order.
29242   * count is the number of pages to free.
29243   *
29244 @@ -797,18 +810,53 @@ static inline int free_pages_check(struct page *page)
29245   * pinned" detection logic.
29246   */
29247  static void free_pcppages_bulk(struct zone *zone, int count,
29248 -                                       struct per_cpu_pages *pcp)
29249 +                              struct list_head *list)
29250  {
29251 -       int migratetype = 0;
29252 -       int batch_free = 0;
29253         int to_free = count;
29254         unsigned long nr_scanned;
29255 +       unsigned long flags;
29256 +
29257 +       spin_lock_irqsave(&zone->lock, flags);
29258  
29259 -       spin_lock(&zone->lock);
29260         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29261         if (nr_scanned)
29262                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29263  
29264 +       while (!list_empty(list)) {
29265 +               struct page *page = list_first_entry(list, struct page, lru);
29266 +               int mt; /* migratetype of the to-be-freed page */
29267 +
29268 +               /* must delete as __free_one_page list manipulates */
29269 +               list_del(&page->lru);
29270 +
29271 +               mt = get_pcppage_migratetype(page);
29272 +               /* MIGRATE_ISOLATE page should not go to pcplists */
29273 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29274 +               /* Pageblock could have been isolated meanwhile */
29275 +               if (unlikely(has_isolate_pageblock(zone)))
29276 +                       mt = get_pageblock_migratetype(page);
29277 +
29278 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29279 +               trace_mm_page_pcpu_drain(page, 0, mt);
29280 +               to_free--;
29281 +       }
29282 +       WARN_ON(to_free != 0);
29283 +       spin_unlock_irqrestore(&zone->lock, flags);
29284 +}
29285 +
29286 +/*
29287 + * Moves a number of pages from the PCP lists to free list which
29288 + * is freed outside of the locked region.
29289 + *
29290 + * Assumes all pages on list are in same zone, and of same order.
29291 + * count is the number of pages to free.
29292 + */
29293 +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
29294 +                             struct list_head *dst)
29295 +{
29296 +       int migratetype = 0;
29297 +       int batch_free = 0;
29298 +
29299         while (to_free) {
29300                 struct page *page;
29301                 struct list_head *list;
29302 @@ -824,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29303                         batch_free++;
29304                         if (++migratetype == MIGRATE_PCPTYPES)
29305                                 migratetype = 0;
29306 -                       list = &pcp->lists[migratetype];
29307 +                       list = &src->lists[migratetype];
29308                 } while (list_empty(list));
29309  
29310                 /* This is the only non-empty list. Free them all. */
29311 @@ -832,24 +880,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29312                         batch_free = to_free;
29313  
29314                 do {
29315 -                       int mt; /* migratetype of the to-be-freed page */
29316 -
29317 -                       page = list_entry(list->prev, struct page, lru);
29318 -                       /* must delete as __free_one_page list manipulates */
29319 +                       page = list_last_entry(list, struct page, lru);
29320                         list_del(&page->lru);
29321  
29322 -                       mt = get_pcppage_migratetype(page);
29323 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
29324 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29325 -                       /* Pageblock could have been isolated meanwhile */
29326 -                       if (unlikely(has_isolate_pageblock(zone)))
29327 -                               mt = get_pageblock_migratetype(page);
29328 -
29329 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29330 -                       trace_mm_page_pcpu_drain(page, 0, mt);
29331 +                       list_add(&page->lru, dst);
29332                 } while (--to_free && --batch_free && !list_empty(list));
29333         }
29334 -       spin_unlock(&zone->lock);
29335  }
29336  
29337  static void free_one_page(struct zone *zone,
29338 @@ -858,7 +894,9 @@ static void free_one_page(struct zone *zone,
29339                                 int migratetype)
29340  {
29341         unsigned long nr_scanned;
29342 -       spin_lock(&zone->lock);
29343 +       unsigned long flags;
29344 +
29345 +       spin_lock_irqsave(&zone->lock, flags);
29346         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29347         if (nr_scanned)
29348                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29349 @@ -868,7 +906,7 @@ static void free_one_page(struct zone *zone,
29350                 migratetype = get_pfnblock_migratetype(page, pfn);
29351         }
29352         __free_one_page(page, pfn, zone, order, migratetype);
29353 -       spin_unlock(&zone->lock);
29354 +       spin_unlock_irqrestore(&zone->lock, flags);
29355  }
29356  
29357  static int free_tail_pages_check(struct page *head_page, struct page *page)
29358 @@ -1019,10 +1057,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
29359                 return;
29360  
29361         migratetype = get_pfnblock_migratetype(page, pfn);
29362 -       local_irq_save(flags);
29363 +       local_lock_irqsave(pa_lock, flags);
29364         __count_vm_events(PGFREE, 1 << order);
29365         free_one_page(page_zone(page), page, pfn, order, migratetype);
29366 -       local_irq_restore(flags);
29367 +       local_unlock_irqrestore(pa_lock, flags);
29368  }
29369  
29370  static void __init __free_pages_boot_core(struct page *page,
29371 @@ -1879,16 +1917,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
29372  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
29373  {
29374         unsigned long flags;
29375 +       LIST_HEAD(dst);
29376         int to_drain, batch;
29377  
29378 -       local_irq_save(flags);
29379 +       local_lock_irqsave(pa_lock, flags);
29380         batch = READ_ONCE(pcp->batch);
29381         to_drain = min(pcp->count, batch);
29382         if (to_drain > 0) {
29383 -               free_pcppages_bulk(zone, to_drain, pcp);
29384 +               isolate_pcp_pages(to_drain, pcp, &dst);
29385                 pcp->count -= to_drain;
29386         }
29387 -       local_irq_restore(flags);
29388 +       local_unlock_irqrestore(pa_lock, flags);
29389 +       free_pcppages_bulk(zone, to_drain, &dst);
29390  }
29391  #endif
29392  
29393 @@ -1904,16 +1944,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
29394         unsigned long flags;
29395         struct per_cpu_pageset *pset;
29396         struct per_cpu_pages *pcp;
29397 +       LIST_HEAD(dst);
29398 +       int count;
29399  
29400 -       local_irq_save(flags);
29401 +       cpu_lock_irqsave(cpu, flags);
29402         pset = per_cpu_ptr(zone->pageset, cpu);
29403  
29404         pcp = &pset->pcp;
29405 -       if (pcp->count) {
29406 -               free_pcppages_bulk(zone, pcp->count, pcp);
29407 +       count = pcp->count;
29408 +       if (count) {
29409 +               isolate_pcp_pages(count, pcp, &dst);
29410                 pcp->count = 0;
29411         }
29412 -       local_irq_restore(flags);
29413 +       cpu_unlock_irqrestore(cpu, flags);
29414 +       if (count)
29415 +               free_pcppages_bulk(zone, count, &dst);
29416  }
29417  
29418  /*
29419 @@ -1999,8 +2044,17 @@ void drain_all_pages(struct zone *zone)
29420                 else
29421                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
29422         }
29423 +#ifndef CONFIG_PREEMPT_RT_BASE
29424         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
29425                                                                 zone, 1);
29426 +#else
29427 +       for_each_cpu(cpu, &cpus_with_pcps) {
29428 +               if (zone)
29429 +                       drain_pages_zone(cpu, zone);
29430 +               else
29431 +                       drain_pages(cpu);
29432 +       }
29433 +#endif
29434  }
29435  
29436  #ifdef CONFIG_HIBERNATION
29437 @@ -2056,7 +2110,7 @@ void free_hot_cold_page(struct page *page, bool cold)
29438  
29439         migratetype = get_pfnblock_migratetype(page, pfn);
29440         set_pcppage_migratetype(page, migratetype);
29441 -       local_irq_save(flags);
29442 +       local_lock_irqsave(pa_lock, flags);
29443         __count_vm_event(PGFREE);
29444  
29445         /*
29446 @@ -2082,12 +2136,17 @@ void free_hot_cold_page(struct page *page, bool cold)
29447         pcp->count++;
29448         if (pcp->count >= pcp->high) {
29449                 unsigned long batch = READ_ONCE(pcp->batch);
29450 -               free_pcppages_bulk(zone, batch, pcp);
29451 +               LIST_HEAD(dst);
29452 +
29453 +               isolate_pcp_pages(batch, pcp, &dst);
29454                 pcp->count -= batch;
29455 +               local_unlock_irqrestore(pa_lock, flags);
29456 +               free_pcppages_bulk(zone, batch, &dst);
29457 +               return;
29458         }
29459  
29460  out:
29461 -       local_irq_restore(flags);
29462 +       local_unlock_irqrestore(pa_lock, flags);
29463  }
29464  
29465  /*
29466 @@ -2222,7 +2281,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29467                 struct per_cpu_pages *pcp;
29468                 struct list_head *list;
29469  
29470 -               local_irq_save(flags);
29471 +               local_lock_irqsave(pa_lock, flags);
29472                 pcp = &this_cpu_ptr(zone->pageset)->pcp;
29473                 list = &pcp->lists[migratetype];
29474                 if (list_empty(list)) {
29475 @@ -2254,7 +2313,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29476                          */
29477                         WARN_ON_ONCE(order > 1);
29478                 }
29479 -               spin_lock_irqsave(&zone->lock, flags);
29480 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
29481  
29482                 page = NULL;
29483                 if (alloc_flags & ALLOC_HARDER) {
29484 @@ -2264,11 +2323,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29485                 }
29486                 if (!page)
29487                         page = __rmqueue(zone, order, migratetype, gfp_flags);
29488 -               spin_unlock(&zone->lock);
29489 -               if (!page)
29490 +               if (!page) {
29491 +                       spin_unlock(&zone->lock);
29492                         goto failed;
29493 +               }
29494                 __mod_zone_freepage_state(zone, -(1 << order),
29495                                           get_pcppage_migratetype(page));
29496 +               spin_unlock(&zone->lock);
29497         }
29498  
29499         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
29500 @@ -2278,13 +2339,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29501  
29502         __count_zone_vm_events(PGALLOC, zone, 1 << order);
29503         zone_statistics(preferred_zone, zone, gfp_flags);
29504 -       local_irq_restore(flags);
29505 +       local_unlock_irqrestore(pa_lock, flags);
29506  
29507         VM_BUG_ON_PAGE(bad_range(zone, page), page);
29508         return page;
29509  
29510  failed:
29511 -       local_irq_restore(flags);
29512 +       local_unlock_irqrestore(pa_lock, flags);
29513         return NULL;
29514  }
29515  
29516 @@ -5950,6 +6011,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
29517  void __init page_alloc_init(void)
29518  {
29519         hotcpu_notifier(page_alloc_cpu_notify, 0);
29520 +       local_irq_lock_init(pa_lock);
29521  }
29522  
29523  /*
29524 @@ -6844,7 +6906,7 @@ void zone_pcp_reset(struct zone *zone)
29525         struct per_cpu_pageset *pset;
29526  
29527         /* avoid races with drain_pages()  */
29528 -       local_irq_save(flags);
29529 +       local_lock_irqsave(pa_lock, flags);
29530         if (zone->pageset != &boot_pageset) {
29531                 for_each_online_cpu(cpu) {
29532                         pset = per_cpu_ptr(zone->pageset, cpu);
29533 @@ -6853,7 +6915,7 @@ void zone_pcp_reset(struct zone *zone)
29534                 free_percpu(zone->pageset);
29535                 zone->pageset = &boot_pageset;
29536         }
29537 -       local_irq_restore(flags);
29538 +       local_unlock_irqrestore(pa_lock, flags);
29539  }
29540  
29541  #ifdef CONFIG_MEMORY_HOTREMOVE
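
The mm/page_alloc.c changes above introduce pa_lock and split free_pcppages_bulk(): isolate_pcp_pages() only moves pages from the per-CPU lists onto a private list while the (local) lock is held, and the actual freeing then runs under zone->lock, taken with spin_lock_irqsave(), outside the per-CPU critical section. A rough userspace sketch of that isolate-then-free split, assuming only POSIX threads and with all names invented:

	/*
	 * Hold the hot lock only long enough to splice entries onto a
	 * private list, then do the expensive per-entry work after
	 * dropping it.
	 */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct node {
		struct node *next;
		int val;
	};

	static struct node *pcp_list;	/* the "per-CPU" list */
	static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;

	static void pcp_add(int val)
	{
		struct node *n = malloc(sizeof(*n));

		n->val = val;
		pthread_mutex_lock(&pcp_lock);
		n->next = pcp_list;
		pcp_list = n;
		pthread_mutex_unlock(&pcp_lock);
	}

	/* Counterpart of isolate_pcp_pages(): detach under the lock, nothing more. */
	static struct node *isolate_all(void)
	{
		struct node *head;

		pthread_mutex_lock(&pcp_lock);
		head = pcp_list;
		pcp_list = NULL;
		pthread_mutex_unlock(&pcp_lock);
		return head;
	}

	/* Counterpart of free_pcppages_bulk(): the slow work runs lock-free here. */
	static int free_isolated(struct node *head)
	{
		int n = 0;

		while (head) {
			struct node *next = head->next;

			free(head);
			head = next;
			n++;
		}
		return n;
	}

	int main(void)
	{
		for (int i = 0; i < 1000; i++)
			pcp_add(i);
		printf("freed %d entries outside the lock\n",
		       free_isolated(isolate_all()));
		return 0;
	}

The design point is that the hot lock is held only for pointer splicing, while the expensive per-page work happens after it is dropped, which keeps the worst-case time spent inside the locked region small.
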
29542 diff --git a/mm/slab.h b/mm/slab.h
29543 index 7b6087197997..afdc57941179 100644
29544 --- a/mm/slab.h
29545 +++ b/mm/slab.h
29546 @@ -324,7 +324,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
29547   * The slab lists for all objects.
29548   */
29549  struct kmem_cache_node {
29550 +#ifdef CONFIG_SLUB
29551 +       raw_spinlock_t list_lock;
29552 +#else
29553         spinlock_t list_lock;
29554 +#endif
29555  
29556  #ifdef CONFIG_SLAB
29557         struct list_head slabs_partial; /* partial list first, better asm code */
29558 diff --git a/mm/slub.c b/mm/slub.c
29559 index 65d5f92d51d2..feb4a445a546 100644
29560 --- a/mm/slub.c
29561 +++ b/mm/slub.c
29562 @@ -1075,7 +1075,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
29563         void *object = head;
29564         int cnt = 0;
29565  
29566 -       spin_lock_irqsave(&n->list_lock, *flags);
29567 +       raw_spin_lock_irqsave(&n->list_lock, *flags);
29568         slab_lock(page);
29569  
29570         if (!check_slab(s, page))
29571 @@ -1136,7 +1136,7 @@ out:
29572  
29573  fail:
29574         slab_unlock(page);
29575 -       spin_unlock_irqrestore(&n->list_lock, *flags);
29576 +       raw_spin_unlock_irqrestore(&n->list_lock, *flags);
29577         slab_fix(s, "Object at 0x%p not freed", object);
29578         return NULL;
29579  }
29580 @@ -1263,6 +1263,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
29581  
29582  #endif /* CONFIG_SLUB_DEBUG */
29583  
29584 +struct slub_free_list {
29585 +       raw_spinlock_t          lock;
29586 +       struct list_head        list;
29587 +};
29588 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
29589 +
29590  /*
29591   * Hooks for other subsystems that check memory allocations. In a typical
29592   * production configuration these hooks all should produce no code at all.
29593 @@ -1399,10 +1405,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29594         gfp_t alloc_gfp;
29595         void *start, *p;
29596         int idx, order;
29597 +       bool enableirqs = false;
29598  
29599         flags &= gfp_allowed_mask;
29600  
29601         if (gfpflags_allow_blocking(flags))
29602 +               enableirqs = true;
29603 +#ifdef CONFIG_PREEMPT_RT_FULL
29604 +       if (system_state == SYSTEM_RUNNING)
29605 +               enableirqs = true;
29606 +#endif
29607 +       if (enableirqs)
29608                 local_irq_enable();
29609  
29610         flags |= s->allocflags;
29611 @@ -1473,7 +1486,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29612         page->frozen = 1;
29613  
29614  out:
29615 -       if (gfpflags_allow_blocking(flags))
29616 +       if (enableirqs)
29617                 local_irq_disable();
29618         if (!page)
29619                 return NULL;
29620 @@ -1529,6 +1542,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
29621         __free_kmem_pages(page, order);
29622  }
29623  
29624 +static void free_delayed(struct list_head *h)
29625 +{
29626 +       while (!list_empty(h)) {
29627 +               struct page *page = list_first_entry(h, struct page, lru);
29628 +
29629 +               list_del(&page->lru);
29630 +               __free_slab(page->slab_cache, page);
29631 +       }
29632 +}
29633 +
29634  #define need_reserve_slab_rcu                                          \
29635         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
29636  
29637 @@ -1560,6 +1583,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
29638                 }
29639  
29640                 call_rcu(head, rcu_free_slab);
29641 +       } else if (irqs_disabled()) {
29642 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
29643 +
29644 +               raw_spin_lock(&f->lock);
29645 +               list_add(&page->lru, &f->list);
29646 +               raw_spin_unlock(&f->lock);
29647         } else
29648                 __free_slab(s, page);
29649  }
29650 @@ -1673,7 +1702,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29651         if (!n || !n->nr_partial)
29652                 return NULL;
29653  
29654 -       spin_lock(&n->list_lock);
29655 +       raw_spin_lock(&n->list_lock);
29656         list_for_each_entry_safe(page, page2, &n->partial, lru) {
29657                 void *t;
29658  
29659 @@ -1698,7 +1727,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29660                         break;
29661  
29662         }
29663 -       spin_unlock(&n->list_lock);
29664 +       raw_spin_unlock(&n->list_lock);
29665         return object;
29666  }
29667  
29668 @@ -1944,7 +1973,7 @@ redo:
29669                          * that acquire_slab() will see a slab page that
29670                          * is frozen
29671                          */
29672 -                       spin_lock(&n->list_lock);
29673 +                       raw_spin_lock(&n->list_lock);
29674                 }
29675         } else {
29676                 m = M_FULL;
29677 @@ -1955,7 +1984,7 @@ redo:
29678                          * slabs from diagnostic functions will not see
29679                          * any frozen slabs.
29680                          */
29681 -                       spin_lock(&n->list_lock);
29682 +                       raw_spin_lock(&n->list_lock);
29683                 }
29684         }
29685  
29686 @@ -1990,7 +2019,7 @@ redo:
29687                 goto redo;
29688  
29689         if (lock)
29690 -               spin_unlock(&n->list_lock);
29691 +               raw_spin_unlock(&n->list_lock);
29692  
29693         if (m == M_FREE) {
29694                 stat(s, DEACTIVATE_EMPTY);
29695 @@ -2022,10 +2051,10 @@ static void unfreeze_partials(struct kmem_cache *s,
29696                 n2 = get_node(s, page_to_nid(page));
29697                 if (n != n2) {
29698                         if (n)
29699 -                               spin_unlock(&n->list_lock);
29700 +                               raw_spin_unlock(&n->list_lock);
29701  
29702                         n = n2;
29703 -                       spin_lock(&n->list_lock);
29704 +                       raw_spin_lock(&n->list_lock);
29705                 }
29706  
29707                 do {
29708 @@ -2054,7 +2083,7 @@ static void unfreeze_partials(struct kmem_cache *s,
29709         }
29710  
29711         if (n)
29712 -               spin_unlock(&n->list_lock);
29713 +               raw_spin_unlock(&n->list_lock);
29714  
29715         while (discard_page) {
29716                 page = discard_page;
29717 @@ -2093,14 +2122,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
29718                         pobjects = oldpage->pobjects;
29719                         pages = oldpage->pages;
29720                         if (drain && pobjects > s->cpu_partial) {
29721 +                               struct slub_free_list *f;
29722                                 unsigned long flags;
29723 +                               LIST_HEAD(tofree);
29724                                 /*
29725                                  * partial array is full. Move the existing
29726                                  * set to the per node partial list.
29727                                  */
29728                                 local_irq_save(flags);
29729                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
29730 +                               f = this_cpu_ptr(&slub_free_list);
29731 +                               raw_spin_lock(&f->lock);
29732 +                               list_splice_init(&f->list, &tofree);
29733 +                               raw_spin_unlock(&f->lock);
29734                                 local_irq_restore(flags);
29735 +                               free_delayed(&tofree);
29736                                 oldpage = NULL;
29737                                 pobjects = 0;
29738                                 pages = 0;
29739 @@ -2172,7 +2208,22 @@ static bool has_cpu_slab(int cpu, void *info)
29740  
29741  static void flush_all(struct kmem_cache *s)
29742  {
29743 +       LIST_HEAD(tofree);
29744 +       int cpu;
29745 +
29746         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
29747 +       for_each_online_cpu(cpu) {
29748 +               struct slub_free_list *f;
29749 +
29750 +               if (!has_cpu_slab(cpu, s))
29751 +                       continue;
29752 +
29753 +               f = &per_cpu(slub_free_list, cpu);
29754 +               raw_spin_lock_irq(&f->lock);
29755 +               list_splice_init(&f->list, &tofree);
29756 +               raw_spin_unlock_irq(&f->lock);
29757 +               free_delayed(&tofree);
29758 +       }
29759  }
29760  
29761  /*
29762 @@ -2208,10 +2259,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
29763         unsigned long x = 0;
29764         struct page *page;
29765  
29766 -       spin_lock_irqsave(&n->list_lock, flags);
29767 +       raw_spin_lock_irqsave(&n->list_lock, flags);
29768         list_for_each_entry(page, &n->partial, lru)
29769                 x += get_count(page);
29770 -       spin_unlock_irqrestore(&n->list_lock, flags);
29771 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29772         return x;
29773  }
29774  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
29775 @@ -2349,8 +2400,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
29776   * already disabled (which is the case for bulk allocation).
29777   */
29778  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29779 -                         unsigned long addr, struct kmem_cache_cpu *c)
29780 +                         unsigned long addr, struct kmem_cache_cpu *c,
29781 +                         struct list_head *to_free)
29782  {
29783 +       struct slub_free_list *f;
29784         void *freelist;
29785         struct page *page;
29786  
29787 @@ -2410,6 +2463,13 @@ load_freelist:
29788         VM_BUG_ON(!c->page->frozen);
29789         c->freelist = get_freepointer(s, freelist);
29790         c->tid = next_tid(c->tid);
29791 +
29792 +out:
29793 +       f = this_cpu_ptr(&slub_free_list);
29794 +       raw_spin_lock(&f->lock);
29795 +       list_splice_init(&f->list, to_free);
29796 +       raw_spin_unlock(&f->lock);
29797 +
29798         return freelist;
29799  
29800  new_slab:
29801 @@ -2441,7 +2501,7 @@ new_slab:
29802         deactivate_slab(s, page, get_freepointer(s, freelist));
29803         c->page = NULL;
29804         c->freelist = NULL;
29805 -       return freelist;
29806 +       goto out;
29807  }
29808  
29809  /*
29810 @@ -2453,6 +2513,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29811  {
29812         void *p;
29813         unsigned long flags;
29814 +       LIST_HEAD(tofree);
29815  
29816         local_irq_save(flags);
29817  #ifdef CONFIG_PREEMPT
29818 @@ -2464,8 +2525,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29819         c = this_cpu_ptr(s->cpu_slab);
29820  #endif
29821  
29822 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
29823 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
29824         local_irq_restore(flags);
29825 +       free_delayed(&tofree);
29826         return p;
29827  }
29828  
29829 @@ -2652,7 +2714,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29830  
29831         do {
29832                 if (unlikely(n)) {
29833 -                       spin_unlock_irqrestore(&n->list_lock, flags);
29834 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29835                         n = NULL;
29836                 }
29837                 prior = page->freelist;
29838 @@ -2684,7 +2746,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29839                                  * Otherwise the list_lock will synchronize with
29840                                  * other processors updating the list of slabs.
29841                                  */
29842 -                               spin_lock_irqsave(&n->list_lock, flags);
29843 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
29844  
29845                         }
29846                 }
29847 @@ -2726,7 +2788,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29848                 add_partial(n, page, DEACTIVATE_TO_TAIL);
29849                 stat(s, FREE_ADD_PARTIAL);
29850         }
29851 -       spin_unlock_irqrestore(&n->list_lock, flags);
29852 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29853         return;
29854  
29855  slab_empty:
29856 @@ -2741,7 +2803,7 @@ slab_empty:
29857                 remove_full(s, n, page);
29858         }
29859  
29860 -       spin_unlock_irqrestore(&n->list_lock, flags);
29861 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29862         stat(s, FREE_SLAB);
29863         discard_slab(s, page);
29864  }
29865 @@ -2913,6 +2975,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29866                           void **p)
29867  {
29868         struct kmem_cache_cpu *c;
29869 +       LIST_HEAD(to_free);
29870         int i;
29871  
29872         /* memcg and kmem_cache debug support */
29873 @@ -2936,7 +2999,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29874                          * of re-populating per CPU c->freelist
29875                          */
29876                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
29877 -                                           _RET_IP_, c);
29878 +                                           _RET_IP_, c, &to_free);
29879                         if (unlikely(!p[i]))
29880                                 goto error;
29881  
29882 @@ -2948,6 +3011,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29883         }
29884         c->tid = next_tid(c->tid);
29885         local_irq_enable();
29886 +       free_delayed(&to_free);
29887  
29888         /* Clear memory outside IRQ disabled fastpath loop */
29889         if (unlikely(flags & __GFP_ZERO)) {
29890 @@ -3095,7 +3159,7 @@ static void
29891  init_kmem_cache_node(struct kmem_cache_node *n)
29892  {
29893         n->nr_partial = 0;
29894 -       spin_lock_init(&n->list_lock);
29895 +       raw_spin_lock_init(&n->list_lock);
29896         INIT_LIST_HEAD(&n->partial);
29897  #ifdef CONFIG_SLUB_DEBUG
29898         atomic_long_set(&n->nr_slabs, 0);
29899 @@ -3677,7 +3741,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29900                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
29901                         INIT_LIST_HEAD(promote + i);
29902  
29903 -               spin_lock_irqsave(&n->list_lock, flags);
29904 +               raw_spin_lock_irqsave(&n->list_lock, flags);
29905  
29906                 /*
29907                  * Build lists of slabs to discard or promote.
29908 @@ -3708,7 +3772,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29909                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
29910                         list_splice(promote + i, &n->partial);
29911  
29912 -               spin_unlock_irqrestore(&n->list_lock, flags);
29913 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
29914  
29915                 /* Release empty slabs */
29916                 list_for_each_entry_safe(page, t, &discard, lru)
29917 @@ -3884,6 +3948,12 @@ void __init kmem_cache_init(void)
29918  {
29919         static __initdata struct kmem_cache boot_kmem_cache,
29920                 boot_kmem_cache_node;
29921 +       int cpu;
29922 +
29923 +       for_each_possible_cpu(cpu) {
29924 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
29925 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
29926 +       }
29927  
29928         if (debug_guardpage_minorder())
29929                 slub_max_order = 0;
29930 @@ -4127,7 +4197,7 @@ static int validate_slab_node(struct kmem_cache *s,
29931         struct page *page;
29932         unsigned long flags;
29933  
29934 -       spin_lock_irqsave(&n->list_lock, flags);
29935 +       raw_spin_lock_irqsave(&n->list_lock, flags);
29936  
29937         list_for_each_entry(page, &n->partial, lru) {
29938                 validate_slab_slab(s, page, map);
29939 @@ -4149,7 +4219,7 @@ static int validate_slab_node(struct kmem_cache *s,
29940                        s->name, count, atomic_long_read(&n->nr_slabs));
29941  
29942  out:
29943 -       spin_unlock_irqrestore(&n->list_lock, flags);
29944 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29945         return count;
29946  }
29947  
29948 @@ -4337,12 +4407,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
29949                 if (!atomic_long_read(&n->nr_slabs))
29950                         continue;
29951  
29952 -               spin_lock_irqsave(&n->list_lock, flags);
29953 +               raw_spin_lock_irqsave(&n->list_lock, flags);
29954                 list_for_each_entry(page, &n->partial, lru)
29955                         process_slab(&t, s, page, alloc, map);
29956                 list_for_each_entry(page, &n->full, lru)
29957                         process_slab(&t, s, page, alloc, map);
29958 -               spin_unlock_irqrestore(&n->list_lock, flags);
29959 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
29960         }
29961  
29962         for (i = 0; i < t.count; i++) {
29963 diff --git a/mm/swap.c b/mm/swap.c
29964 index 39395fb549c0..ad16649221d7 100644
29965 --- a/mm/swap.c
29966 +++ b/mm/swap.c
29967 @@ -31,6 +31,7 @@
29968  #include <linux/memcontrol.h>
29969  #include <linux/gfp.h>
29970  #include <linux/uio.h>
29971 +#include <linux/locallock.h>
29972  #include <linux/hugetlb.h>
29973  #include <linux/page_idle.h>
29974  
29975 @@ -46,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
29976  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
29977  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
29978  
29979 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
29980 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
29981 +
29982  /*
29983   * This path almost never happens for VM activity - pages are normally
29984   * freed via pagevecs.  But it gets used by networking.
29985 @@ -481,11 +485,11 @@ void rotate_reclaimable_page(struct page *page)
29986                 unsigned long flags;
29987  
29988                 page_cache_get(page);
29989 -               local_irq_save(flags);
29990 +               local_lock_irqsave(rotate_lock, flags);
29991                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
29992                 if (!pagevec_add(pvec, page))
29993                         pagevec_move_tail(pvec);
29994 -               local_irq_restore(flags);
29995 +               local_unlock_irqrestore(rotate_lock, flags);
29996         }
29997  }
29998  
29999 @@ -536,12 +540,13 @@ static bool need_activate_page_drain(int cpu)
30000  void activate_page(struct page *page)
30001  {
30002         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
30003 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
30004 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30005 +                                                      activate_page_pvecs);
30006  
30007                 page_cache_get(page);
30008                 if (!pagevec_add(pvec, page))
30009                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
30010 -               put_cpu_var(activate_page_pvecs);
30011 +               put_locked_var(swapvec_lock, activate_page_pvecs);
30012         }
30013  }
30014  
30015 @@ -567,7 +572,7 @@ void activate_page(struct page *page)
30016  
30017  static void __lru_cache_activate_page(struct page *page)
30018  {
30019 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30020 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30021         int i;
30022  
30023         /*
30024 @@ -589,7 +594,7 @@ static void __lru_cache_activate_page(struct page *page)
30025                 }
30026         }
30027  
30028 -       put_cpu_var(lru_add_pvec);
30029 +       put_locked_var(swapvec_lock, lru_add_pvec);
30030  }
30031  
30032  /*
30033 @@ -630,13 +635,13 @@ EXPORT_SYMBOL(mark_page_accessed);
30034  
30035  static void __lru_cache_add(struct page *page)
30036  {
30037 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30038 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30039  
30040         page_cache_get(page);
30041         if (!pagevec_space(pvec))
30042                 __pagevec_lru_add(pvec);
30043         pagevec_add(pvec, page);
30044 -       put_cpu_var(lru_add_pvec);
30045 +       put_locked_var(swapvec_lock, lru_add_pvec);
30046  }
30047  
30048  /**
30049 @@ -816,9 +821,15 @@ void lru_add_drain_cpu(int cpu)
30050                 unsigned long flags;
30051  
30052                 /* No harm done if a racing interrupt already did this */
30053 -               local_irq_save(flags);
30054 +#ifdef CONFIG_PREEMPT_RT_BASE
30055 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
30056 +               pagevec_move_tail(pvec);
30057 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
30058 +#else
30059 +               local_lock_irqsave(rotate_lock, flags);
30060                 pagevec_move_tail(pvec);
30061 -               local_irq_restore(flags);
30062 +               local_unlock_irqrestore(rotate_lock, flags);
30063 +#endif
30064         }
30065  
30066         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
30067 @@ -846,26 +857,47 @@ void deactivate_file_page(struct page *page)
30068                 return;
30069  
30070         if (likely(get_page_unless_zero(page))) {
30071 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
30072 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30073 +                                                      lru_deactivate_file_pvecs);
30074  
30075                 if (!pagevec_add(pvec, page))
30076                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
30077 -               put_cpu_var(lru_deactivate_file_pvecs);
30078 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
30079         }
30080  }
30081  
30082  void lru_add_drain(void)
30083  {
30084 -       lru_add_drain_cpu(get_cpu());
30085 -       put_cpu();
30086 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
30087 +       local_unlock_cpu(swapvec_lock);
30088  }
30089  
30090 +
30091 +#ifdef CONFIG_PREEMPT_RT_BASE
30092 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30093 +{
30094 +       local_lock_on(swapvec_lock, cpu);
30095 +       lru_add_drain_cpu(cpu);
30096 +       local_unlock_on(swapvec_lock, cpu);
30097 +}
30098 +
30099 +#else
30100 +
30101  static void lru_add_drain_per_cpu(struct work_struct *dummy)
30102  {
30103         lru_add_drain();
30104  }
30105  
30106  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
30107 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30108 +{
30109 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30110 +
30111 +       INIT_WORK(work, lru_add_drain_per_cpu);
30112 +       schedule_work_on(cpu, work);
30113 +       cpumask_set_cpu(cpu, has_work);
30114 +}
30115 +#endif
30116  
30117  void lru_add_drain_all(void)
30118  {
30119 @@ -878,20 +910,17 @@ void lru_add_drain_all(void)
30120         cpumask_clear(&has_work);
30121  
30122         for_each_online_cpu(cpu) {
30123 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30124 -
30125                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
30126                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
30127                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
30128 -                   need_activate_page_drain(cpu)) {
30129 -                       INIT_WORK(work, lru_add_drain_per_cpu);
30130 -                       schedule_work_on(cpu, work);
30131 -                       cpumask_set_cpu(cpu, &has_work);
30132 -               }
30133 +                   need_activate_page_drain(cpu))
30134 +                       remote_lru_add_drain(cpu, &has_work);
30135         }
30136  
30137 +#ifndef CONFIG_PREEMPT_RT_BASE
30138         for_each_cpu(cpu, &has_work)
30139                 flush_work(&per_cpu(lru_add_drain_work, cpu));
30140 +#endif
30141  
30142         put_online_cpus();
30143         mutex_unlock(&lock);
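
In mm/swap.c the pagevecs are now reached through get_locked_var()/put_locked_var() on swapvec_lock, and on PREEMPT_RT lru_add_drain_all() drains a remote CPU by taking that CPU's lock directly (remote_lru_add_drain()) instead of scheduling a work item there. A userspace sketch of that per-slot-lock-plus-remote-drain arrangement, assuming only POSIX threads and C11 atomics; every name below is invented:

	/*
	 * Every per-CPU batch ("pagevec") is protected by its own small
	 * lock, so a drain can be done from any thread by taking that
	 * slot's lock rather than by running work on the owning CPU.
	 */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define NSLOTS	4
	#define BATCH	14	/* roughly a pagevec's capacity */

	struct batch {
		pthread_mutex_t lock;	/* stands in for swapvec_lock */
		int nr;
		int items[BATCH];
	};

	static struct batch batches[NSLOTS];
	static _Atomic long flushed;	/* total items committed */

	/* Must be called with the slot lock held. */
	static void flush_batch(struct batch *b)
	{
		atomic_fetch_add(&flushed, b->nr);
		b->nr = 0;
	}

	/* Local fast path: add to "this CPU's" batch under its own lock. */
	static void batch_add(int slot, int item)
	{
		struct batch *b = &batches[slot];

		pthread_mutex_lock(&b->lock);
		b->items[b->nr++] = item;
		if (b->nr == BATCH)
			flush_batch(b);
		pthread_mutex_unlock(&b->lock);
	}

	/* Remote drain: the analogue of remote_lru_add_drain() on RT. */
	static void drain_all(void)
	{
		for (int i = 0; i < NSLOTS; i++) {
			pthread_mutex_lock(&batches[i].lock);
			flush_batch(&batches[i]);
			pthread_mutex_unlock(&batches[i].lock);
		}
	}

	static void *producer(void *arg)
	{
		int slot = (int)(long)arg;

		for (int i = 0; i < 1000; i++)
			batch_add(slot, i);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NSLOTS];

		for (int i = 0; i < NSLOTS; i++)
			pthread_mutex_init(&batches[i].lock, NULL);
		for (int i = 0; i < NSLOTS; i++)
			pthread_create(&tid[i], NULL, producer, (void *)(long)i);
		for (int i = 0; i < NSLOTS; i++)
			pthread_join(tid[i], NULL);
		drain_all();
		printf("flushed %ld items\n", atomic_load(&flushed));
		return 0;
	}
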
30144 diff --git a/mm/truncate.c b/mm/truncate.c
30145 index 76e35ad97102..5f196420020c 100644
30146 --- a/mm/truncate.c
30147 +++ b/mm/truncate.c
30148 @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping,
30149          * protected by mapping->tree_lock.
30150          */
30151         if (!workingset_node_shadows(node) &&
30152 -           !list_empty(&node->private_list))
30153 -               list_lru_del(&workingset_shadow_nodes, &node->private_list);
30154 +           !list_empty(&node->private_list)) {
30155 +               local_lock(workingset_shadow_lock);
30156 +               list_lru_del(&__workingset_shadow_nodes, &node->private_list);
30157 +               local_unlock(workingset_shadow_lock);
30158 +       }
30159         __radix_tree_delete_node(&mapping->page_tree, node);
30160  unlock:
30161         spin_unlock_irq(&mapping->tree_lock);
30162 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
30163 index 8e3c9c5a3042..68740314ad54 100644
30164 --- a/mm/vmalloc.c
30165 +++ b/mm/vmalloc.c
30166 @@ -821,7 +821,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30167         struct vmap_block *vb;
30168         struct vmap_area *va;
30169         unsigned long vb_idx;
30170 -       int node, err;
30171 +       int node, err, cpu;
30172         void *vaddr;
30173  
30174         node = numa_node_id();
30175 @@ -864,11 +864,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30176         BUG_ON(err);
30177         radix_tree_preload_end();
30178  
30179 -       vbq = &get_cpu_var(vmap_block_queue);
30180 +       cpu = get_cpu_light();
30181 +       vbq = this_cpu_ptr(&vmap_block_queue);
30182         spin_lock(&vbq->lock);
30183         list_add_tail_rcu(&vb->free_list, &vbq->free);
30184         spin_unlock(&vbq->lock);
30185 -       put_cpu_var(vmap_block_queue);
30186 +       put_cpu_light();
30187  
30188         return vaddr;
30189  }
30190 @@ -937,6 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30191         struct vmap_block *vb;
30192         void *vaddr = NULL;
30193         unsigned int order;
30194 +       int cpu;
30195  
30196         BUG_ON(offset_in_page(size));
30197         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
30198 @@ -951,7 +953,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30199         order = get_order(size);
30200  
30201         rcu_read_lock();
30202 -       vbq = &get_cpu_var(vmap_block_queue);
30203 +       cpu = get_cpu_light();
30204 +       vbq = this_cpu_ptr(&vmap_block_queue);
30205         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
30206                 unsigned long pages_off;
30207  
30208 @@ -974,7 +977,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30209                 break;
30210         }
30211  
30212 -       put_cpu_var(vmap_block_queue);
30213 +       put_cpu_light();
30214         rcu_read_unlock();
30215  
30216         /* Allocate new block if nothing was found */
30217 diff --git a/mm/vmstat.c b/mm/vmstat.c
30218 index c54fd2924f25..64416fd7c209 100644
30219 --- a/mm/vmstat.c
30220 +++ b/mm/vmstat.c
30221 @@ -226,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30222         long x;
30223         long t;
30224  
30225 +       preempt_disable_rt();
30226         x = delta + __this_cpu_read(*p);
30227  
30228         t = __this_cpu_read(pcp->stat_threshold);
30229 @@ -235,6 +236,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30230                 x = 0;
30231         }
30232         __this_cpu_write(*p, x);
30233 +       preempt_enable_rt();
30234  }
30235  EXPORT_SYMBOL(__mod_zone_page_state);
30236  
30237 @@ -267,6 +269,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30238         s8 __percpu *p = pcp->vm_stat_diff + item;
30239         s8 v, t;
30240  
30241 +       preempt_disable_rt();
30242         v = __this_cpu_inc_return(*p);
30243         t = __this_cpu_read(pcp->stat_threshold);
30244         if (unlikely(v > t)) {
30245 @@ -275,6 +278,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30246                 zone_page_state_add(v + overstep, zone, item);
30247                 __this_cpu_write(*p, -overstep);
30248         }
30249 +       preempt_enable_rt();
30250  }
30251  
30252  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
30253 @@ -289,6 +293,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30254         s8 __percpu *p = pcp->vm_stat_diff + item;
30255         s8 v, t;
30256  
30257 +       preempt_disable_rt();
30258         v = __this_cpu_dec_return(*p);
30259         t = __this_cpu_read(pcp->stat_threshold);
30260         if (unlikely(v < - t)) {
30261 @@ -297,6 +302,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30262                 zone_page_state_add(v - overstep, zone, item);
30263                 __this_cpu_write(*p, overstep);
30264         }
30265 +       preempt_enable_rt();
30266  }
30267  
30268  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
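
The mm/vmstat.c hunks wrap the per-CPU counter updates in preempt_disable_rt()/preempt_enable_rt(), because the read-modify-write of vm_stat_diff and its fold into the zone counter at stat_threshold must not be preempted halfway once spinlocks no longer disable preemption. The folding scheme itself can be sketched in userspace as below; thread-local storage replaces the per-CPU variable (which sidesteps the preemption problem the kernel has to handle), and all names are invented:

	/*
	 * Each thread accumulates a small private delta and only folds it
	 * into the shared counter when it crosses a threshold, mirroring
	 * __mod_zone_page_state() above.
	 */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define THRESHOLD 32

	static _Atomic long zone_state;			/* shared counter */
	static _Thread_local long stat_diff;		/* "per-CPU" diff */

	static void mod_state(long delta)
	{
		long x = stat_diff + delta;

		/* Fold into the shared counter once the local diff gets large. */
		if (x > THRESHOLD || x < -THRESHOLD) {
			atomic_fetch_add(&zone_state, x);
			x = 0;
		}
		stat_diff = x;
	}

	static void flush_state(void)
	{
		atomic_fetch_add(&zone_state, stat_diff);
		stat_diff = 0;
	}

	static void *worker(void *arg)
	{
		(void)arg;
		for (int i = 0; i < 100000; i++)
			mod_state(1);
		flush_state();
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[4];

		for (int i = 0; i < 4; i++)
			pthread_create(&tid[i], NULL, worker, NULL);
		for (int i = 0; i < 4; i++)
			pthread_join(tid[i], NULL);
		printf("zone_state = %ld (expected %d)\n",
		       atomic_load(&zone_state), 4 * 100000);
		return 0;
	}
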
30269 diff --git a/mm/workingset.c b/mm/workingset.c
30270 index aa017133744b..263d0194734a 100644
30271 --- a/mm/workingset.c
30272 +++ b/mm/workingset.c
30273 @@ -264,7 +264,8 @@ void workingset_activation(struct page *page)
30274   * point where they would still be useful.
30275   */
30276  
30277 -struct list_lru workingset_shadow_nodes;
30278 +struct list_lru __workingset_shadow_nodes;
30279 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
30280  
30281  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30282                                         struct shrink_control *sc)
30283 @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30284         unsigned long pages;
30285  
30286         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30287 -       local_irq_disable();
30288 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
30289 -       local_irq_enable();
30290 +       local_lock_irq(workingset_shadow_lock);
30291 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
30292 +       local_unlock_irq(workingset_shadow_lock);
30293  
30294         pages = node_present_pages(sc->nid);
30295         /*
30296 @@ -363,9 +364,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
30297         spin_unlock(&mapping->tree_lock);
30298         ret = LRU_REMOVED_RETRY;
30299  out:
30300 -       local_irq_enable();
30301 +       local_unlock_irq(workingset_shadow_lock);
30302         cond_resched();
30303 -       local_irq_disable();
30304 +       local_lock_irq(workingset_shadow_lock);
30305         spin_lock(lru_lock);
30306         return ret;
30307  }
30308 @@ -376,10 +377,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
30309         unsigned long ret;
30310  
30311         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30312 -       local_irq_disable();
30313 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
30314 +       local_lock_irq(workingset_shadow_lock);
30315 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
30316                                     shadow_lru_isolate, NULL);
30317 -       local_irq_enable();
30318 +       local_unlock_irq(workingset_shadow_lock);
30319         return ret;
30320  }
30321  
30322 @@ -400,7 +401,7 @@ static int __init workingset_init(void)
30323  {
30324         int ret;
30325  
30326 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
30327 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
30328         if (ret)
30329                 goto err;
30330         ret = register_shrinker(&workingset_shadow_shrinker);
30331 @@ -408,7 +409,7 @@ static int __init workingset_init(void)
30332                 goto err_list_lru;
30333         return 0;
30334  err_list_lru:
30335 -       list_lru_destroy(&workingset_shadow_nodes);
30336 +       list_lru_destroy(&__workingset_shadow_nodes);
30337  err:
30338         return ret;
30339  }
30340 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
30341 index c1ea19478119..b552fd607df8 100644
30342 --- a/mm/zsmalloc.c
30343 +++ b/mm/zsmalloc.c
30344 @@ -1289,7 +1289,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
30345         class = pool->size_class[class_idx];
30346         off = obj_idx_to_offset(page, obj_idx, class->size);
30347  
30348 -       area = &get_cpu_var(zs_map_area);
30349 +       area = per_cpu_ptr(&zs_map_area, get_cpu_light());
30350         area->vm_mm = mm;
30351         if (off + class->size <= PAGE_SIZE) {
30352                 /* this object is contained entirely within a page */
30353 @@ -1342,7 +1342,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
30354  
30355                 __zs_unmap_object(area, pages, off, class->size);
30356         }
30357 -       put_cpu_var(zs_map_area);
30358 +       put_cpu_light();
30359         unpin_tag(handle);
30360  }
30361  EXPORT_SYMBOL_GPL(zs_unmap_object);
30362 diff --git a/net/core/dev.c b/net/core/dev.c
30363 index de4ed2b5a221..564933374c5f 100644
30364 --- a/net/core/dev.c
30365 +++ b/net/core/dev.c
30366 @@ -186,6 +186,7 @@ static unsigned int napi_gen_id;
30367  static DEFINE_HASHTABLE(napi_hash, 8);
30368  
30369  static seqcount_t devnet_rename_seq;
30370 +static DEFINE_MUTEX(devnet_rename_mutex);
30371  
30372  static inline void dev_base_seq_inc(struct net *net)
30373  {
30374 @@ -207,14 +208,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
30375  static inline void rps_lock(struct softnet_data *sd)
30376  {
30377  #ifdef CONFIG_RPS
30378 -       spin_lock(&sd->input_pkt_queue.lock);
30379 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
30380  #endif
30381  }
30382  
30383  static inline void rps_unlock(struct softnet_data *sd)
30384  {
30385  #ifdef CONFIG_RPS
30386 -       spin_unlock(&sd->input_pkt_queue.lock);
30387 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
30388  #endif
30389  }
30390  
30391 @@ -884,7 +885,8 @@ retry:
30392         strcpy(name, dev->name);
30393         rcu_read_unlock();
30394         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
30395 -               cond_resched();
30396 +               mutex_lock(&devnet_rename_mutex);
30397 +               mutex_unlock(&devnet_rename_mutex);
30398                 goto retry;
30399         }
30400  
30401 @@ -1153,20 +1155,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
30402         if (dev->flags & IFF_UP)
30403                 return -EBUSY;
30404  
30405 -       write_seqcount_begin(&devnet_rename_seq);
30406 +       mutex_lock(&devnet_rename_mutex);
30407 +       __raw_write_seqcount_begin(&devnet_rename_seq);
30408  
30409 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30410 -               write_seqcount_end(&devnet_rename_seq);
30411 -               return 0;
30412 -       }
30413 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
30414 +               goto outunlock;
30415  
30416         memcpy(oldname, dev->name, IFNAMSIZ);
30417  
30418         err = dev_get_valid_name(net, dev, newname);
30419 -       if (err < 0) {
30420 -               write_seqcount_end(&devnet_rename_seq);
30421 -               return err;
30422 -       }
30423 +       if (err < 0)
30424 +               goto outunlock;
30425  
30426         if (oldname[0] && !strchr(oldname, '%'))
30427                 netdev_info(dev, "renamed from %s\n", oldname);
30428 @@ -1179,11 +1178,12 @@ rollback:
30429         if (ret) {
30430                 memcpy(dev->name, oldname, IFNAMSIZ);
30431                 dev->name_assign_type = old_assign_type;
30432 -               write_seqcount_end(&devnet_rename_seq);
30433 -               return ret;
30434 +               err = ret;
30435 +               goto outunlock;
30436         }
30437  
30438 -       write_seqcount_end(&devnet_rename_seq);
30439 +       __raw_write_seqcount_end(&devnet_rename_seq);
30440 +       mutex_unlock(&devnet_rename_mutex);
30441  
30442         netdev_adjacent_rename_links(dev, oldname);
30443  
30444 @@ -1204,7 +1204,8 @@ rollback:
30445                 /* err >= 0 after dev_alloc_name() or stores the first errno */
30446                 if (err >= 0) {
30447                         err = ret;
30448 -                       write_seqcount_begin(&devnet_rename_seq);
30449 +                       mutex_lock(&devnet_rename_mutex);
30450 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
30451                         memcpy(dev->name, oldname, IFNAMSIZ);
30452                         memcpy(oldname, newname, IFNAMSIZ);
30453                         dev->name_assign_type = old_assign_type;
30454 @@ -1217,6 +1218,11 @@ rollback:
30455         }
30456  
30457         return err;
30458 +
30459 +outunlock:
30460 +       __raw_write_seqcount_end(&devnet_rename_seq);
30461 +       mutex_unlock(&devnet_rename_mutex);
30462 +       return err;
30463  }
30464  
30465  /**
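
The dev_change_name() hunks above pair devnet_rename_seq with devnet_rename_mutex: the write side takes the mutex and uses the __raw_ seqcount helpers, and the reader's retry path earlier in this file now does a mutex_lock()/mutex_unlock() pair instead of cond_resched(), so a reader that races with a rename waits on the mutex rather than looping against a possibly preempted writer. A condensed userspace sketch of that reader/writer arrangement, using C11 atomics for the sequence counter; memory ordering is kept simple, the demo main() is single-threaded, and all names are invented:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <string.h>

	static pthread_mutex_t rename_mutex = PTHREAD_MUTEX_INITIALIZER;
	static _Atomic unsigned int rename_seq;
	static char name[32] = "eth0";

	static void set_name(const char *new)
	{
		pthread_mutex_lock(&rename_mutex);
		/* odd: write in progress */
		atomic_fetch_add_explicit(&rename_seq, 1, memory_order_release);
		strncpy(name, new, sizeof(name) - 1);
		/* even: stable again */
		atomic_fetch_add_explicit(&rename_seq, 1, memory_order_release);
		pthread_mutex_unlock(&rename_mutex);
	}

	static void get_name(char *buf, size_t len)
	{
		unsigned int seq;

		for (;;) {
			seq = atomic_load_explicit(&rename_seq,
						   memory_order_acquire);
			if (!(seq & 1)) {
				strncpy(buf, name, len - 1);
				buf[len - 1] = '\0';
				if (atomic_load_explicit(&rename_seq,
						memory_order_acquire) == seq)
					return;
			}
			/* Retry path: wait for the writer instead of spinning. */
			pthread_mutex_lock(&rename_mutex);
			pthread_mutex_unlock(&rename_mutex);
		}
	}

	int main(void)
	{
		char buf[32];

		set_name("wan0");
		get_name(buf, sizeof(buf));
		printf("name is %s\n", buf);
		return 0;
	}
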
30466 @@ -2246,6 +2252,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
30467         sd->output_queue_tailp = &q->next_sched;
30468         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30469         local_irq_restore(flags);
30470 +       preempt_check_resched_rt();
30471  }
30472  
30473  void __netif_schedule(struct Qdisc *q)
30474 @@ -2327,6 +2334,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
30475         __this_cpu_write(softnet_data.completion_queue, skb);
30476         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30477         local_irq_restore(flags);
30478 +       preempt_check_resched_rt();
30479  }
30480  EXPORT_SYMBOL(__dev_kfree_skb_irq);
30481  
30482 @@ -2883,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
30483          * This permits __QDISC___STATE_RUNNING owner to get the lock more
30484          * often and dequeue packets faster.
30485          */
30486 +#ifdef CONFIG_PREEMPT_RT_FULL
30487 +       contended = true;
30488 +#else
30489         contended = qdisc_is_running(q);
30490 +#endif
30491         if (unlikely(contended))
30492                 spin_lock(&q->busylock);
30493  
30494 @@ -2943,9 +2955,44 @@ static void skb_update_prio(struct sk_buff *skb)
30495  #define skb_update_prio(skb)
30496  #endif
30497  
30498 +#ifdef CONFIG_PREEMPT_RT_FULL
30499 +
30500 +static inline int xmit_rec_read(void)
30501 +{
30502 +       return current->xmit_recursion;
30503 +}
30504 +
30505 +static inline void xmit_rec_inc(void)
30506 +{
30507 +       current->xmit_recursion++;
30508 +}
30509 +
30510 +static inline void xmit_rec_dec(void)
30511 +{
30512 +       current->xmit_recursion--;
30513 +}
30514 +
30515 +#else
30516 +
30517  DEFINE_PER_CPU(int, xmit_recursion);
30518  EXPORT_SYMBOL(xmit_recursion);
30519  
30520 +static inline int xmit_rec_read(void)
30521 +{
30522 +       return __this_cpu_read(xmit_recursion);
30523 +}
30524 +
30525 +static inline void xmit_rec_inc(void)
30526 +{
30527 +       __this_cpu_inc(xmit_recursion);
30528 +}
30529 +
30530 +static inline void xmit_rec_dec(void)
30531 +{
30532 +       __this_cpu_dec(xmit_recursion);
30533 +}
30534 +#endif
30535 +
30536  #define RECURSION_LIMIT 10
30537  
30538  /**
30539 @@ -3138,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30540  
30541                 if (txq->xmit_lock_owner != cpu) {
30542  
30543 -                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
30544 +                       if (xmit_rec_read() > RECURSION_LIMIT)
30545                                 goto recursion_alert;
30546  
30547                         skb = validate_xmit_skb(skb, dev);
30548 @@ -3148,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30549                         HARD_TX_LOCK(dev, txq, cpu);
30550  
30551                         if (!netif_xmit_stopped(txq)) {
30552 -                               __this_cpu_inc(xmit_recursion);
30553 +                               xmit_rec_inc();
30554                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
30555 -                               __this_cpu_dec(xmit_recursion);
30556 +                               xmit_rec_dec();
30557                                 if (dev_xmit_complete(rc)) {
30558                                         HARD_TX_UNLOCK(dev, txq);
30559                                         goto out;
30560 @@ -3524,6 +3571,7 @@ drop:
30561         rps_unlock(sd);
30562  
30563         local_irq_restore(flags);
30564 +       preempt_check_resched_rt();
30565  
30566         atomic_long_inc(&skb->dev->rx_dropped);
30567         kfree_skb(skb);
30568 @@ -3542,7 +3590,7 @@ static int netif_rx_internal(struct sk_buff *skb)
30569                 struct rps_dev_flow voidflow, *rflow = &voidflow;
30570                 int cpu;
30571  
30572 -               preempt_disable();
30573 +               migrate_disable();
30574                 rcu_read_lock();
30575  
30576                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
30577 @@ -3552,13 +3600,13 @@ static int netif_rx_internal(struct sk_buff *skb)
30578                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
30579  
30580                 rcu_read_unlock();
30581 -               preempt_enable();
30582 +               migrate_enable();
30583         } else
30584  #endif
30585         {
30586                 unsigned int qtail;
30587 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
30588 -               put_cpu();
30589 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
30590 +               put_cpu_light();
30591         }
30592         return ret;
30593  }
30594 @@ -3592,16 +3640,44 @@ int netif_rx_ni(struct sk_buff *skb)
30595  
30596         trace_netif_rx_ni_entry(skb);
30597  
30598 -       preempt_disable();
30599 +       local_bh_disable();
30600         err = netif_rx_internal(skb);
30601 -       if (local_softirq_pending())
30602 -               do_softirq();
30603 -       preempt_enable();
30604 +       local_bh_enable();
30605  
30606         return err;
30607  }
30608  EXPORT_SYMBOL(netif_rx_ni);
30609  
30610 +#ifdef CONFIG_PREEMPT_RT_FULL
30611 +/*
30612 + * RT runs ksoftirqd as a real time thread and the root_lock is a
30613 + * "sleeping spinlock". If the trylock fails then we can go into an
30614 + * infinite loop when ksoftirqd preempted the task which actually
30615 + * holds the lock, because we requeue q and raise NET_TX softirq
30616 + * causing ksoftirqd to loop forever.
30617 + *
30618 + * It's safe to use spin_lock on RT here as softirqs run in thread
30619 + * context and cannot deadlock against the thread which is holding
30620 + * root_lock.
30621 + *
30622 + * On !RT the trylock might fail, but there we bail out from the
30623 + * softirq loop after 10 attempts which we can't do on RT. And the
30624 + * task holding root_lock cannot be preempted, so the only downside of
30625 + * that trylock is that we need 10 loops to decide that we should have
30626 + * given up in the first one :)
30627 + */
30628 +static inline int take_root_lock(spinlock_t *lock)
30629 +{
30630 +       spin_lock(lock);
30631 +       return 1;
30632 +}
30633 +#else
30634 +static inline int take_root_lock(spinlock_t *lock)
30635 +{
30636 +       return spin_trylock(lock);
30637 +}
30638 +#endif
30639 +
30640  static void net_tx_action(struct softirq_action *h)
30641  {
30642         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
30643 @@ -3643,7 +3719,7 @@ static void net_tx_action(struct softirq_action *h)
30644                         head = head->next_sched;
30645  
30646                         root_lock = qdisc_lock(q);
30647 -                       if (spin_trylock(root_lock)) {
30648 +                       if (take_root_lock(root_lock)) {
30649                                 smp_mb__before_atomic();
30650                                 clear_bit(__QDISC_STATE_SCHED,
30651                                           &q->state);
30652 @@ -4065,7 +4141,7 @@ static void flush_backlog(void *arg)
30653         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
30654                 if (skb->dev == dev) {
30655                         __skb_unlink(skb, &sd->input_pkt_queue);
30656 -                       kfree_skb(skb);
30657 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30658                         input_queue_head_incr(sd);
30659                 }
30660         }
30661 @@ -4074,10 +4150,13 @@ static void flush_backlog(void *arg)
30662         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
30663                 if (skb->dev == dev) {
30664                         __skb_unlink(skb, &sd->process_queue);
30665 -                       kfree_skb(skb);
30666 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30667                         input_queue_head_incr(sd);
30668                 }
30669         }
30670 +
30671 +       if (!skb_queue_empty(&sd->tofree_queue))
30672 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
30673  }
30674  
30675  static int napi_gro_complete(struct sk_buff *skb)
30676 @@ -4531,6 +4610,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30677                 sd->rps_ipi_list = NULL;
30678  
30679                 local_irq_enable();
30680 +               preempt_check_resched_rt();
30681  
30682                 /* Send pending IPI's to kick RPS processing on remote cpus. */
30683                 while (remsd) {
30684 @@ -4544,6 +4624,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30685         } else
30686  #endif
30687                 local_irq_enable();
30688 +       preempt_check_resched_rt();
30689  }
30690  
30691  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
30692 @@ -4625,6 +4706,7 @@ void __napi_schedule(struct napi_struct *n)
30693         local_irq_save(flags);
30694         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
30695         local_irq_restore(flags);
30696 +       preempt_check_resched_rt();
30697  }
30698  EXPORT_SYMBOL(__napi_schedule);
30699  
30700 @@ -4901,7 +4983,7 @@ static void net_rx_action(struct softirq_action *h)
30701         list_splice_tail(&repoll, &list);
30702         list_splice(&list, &sd->poll_list);
30703         if (!list_empty(&sd->poll_list))
30704 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
30705 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
30706  
30707         net_rps_action_and_irq_enable(sd);
30708  }
30709 @@ -7234,7 +7316,7 @@ EXPORT_SYMBOL(free_netdev);
30710  void synchronize_net(void)
30711  {
30712         might_sleep();
30713 -       if (rtnl_is_locked())
30714 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
30715                 synchronize_rcu_expedited();
30716         else
30717                 synchronize_rcu();
30718 @@ -7475,16 +7557,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
30719  
30720         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30721         local_irq_enable();
30722 +       preempt_check_resched_rt();
30723  
30724         /* Process offline CPU's input_pkt_queue */
30725         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
30726                 netif_rx_ni(skb);
30727                 input_queue_head_incr(oldsd);
30728         }
30729 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
30730 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
30731                 netif_rx_ni(skb);
30732                 input_queue_head_incr(oldsd);
30733         }
30734 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
30735 +               kfree_skb(skb);
30736 +       }
30737  
30738         return NOTIFY_OK;
30739  }
30740 @@ -7786,8 +7872,9 @@ static int __init net_dev_init(void)
30741         for_each_possible_cpu(i) {
30742                 struct softnet_data *sd = &per_cpu(softnet_data, i);
30743  
30744 -               skb_queue_head_init(&sd->input_pkt_queue);
30745 -               skb_queue_head_init(&sd->process_queue);
30746 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
30747 +               skb_queue_head_init_raw(&sd->process_queue);
30748 +               skb_queue_head_init_raw(&sd->tofree_queue);
30749                 INIT_LIST_HEAD(&sd->poll_list);
30750                 sd->output_queue_tailp = &sd->output_queue;
30751  #ifdef CONFIG_RPS
30752 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
30753 index 4968b5ddea69..c8d778f405dc 100644
30754 --- a/net/core/skbuff.c
30755 +++ b/net/core/skbuff.c
30756 @@ -63,6 +63,7 @@
30757  #include <linux/errqueue.h>
30758  #include <linux/prefetch.h>
30759  #include <linux/if_vlan.h>
30760 +#include <linux/locallock.h>
30761  
30762  #include <net/protocol.h>
30763  #include <net/dst.h>
30764 @@ -351,6 +352,8 @@ EXPORT_SYMBOL(build_skb);
30765  
30766  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
30767  static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
30768 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
30769 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
30770  
30771  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30772  {
30773 @@ -358,10 +361,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30774         unsigned long flags;
30775         void *data;
30776  
30777 -       local_irq_save(flags);
30778 +       local_lock_irqsave(netdev_alloc_lock, flags);
30779         nc = this_cpu_ptr(&netdev_alloc_cache);
30780         data = __alloc_page_frag(nc, fragsz, gfp_mask);
30781 -       local_irq_restore(flags);
30782 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30783         return data;
30784  }
30785  
30786 @@ -380,9 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
30787  
30788  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30789  {
30790 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30791 +       struct page_frag_cache *nc;
30792 +       void *data;
30793  
30794 -       return __alloc_page_frag(nc, fragsz, gfp_mask);
30795 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30796 +       data = __alloc_page_frag(nc, fragsz, gfp_mask);
30797 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30798 +       return data;
30799  }
30800  
30801  void *napi_alloc_frag(unsigned int fragsz)
30802 @@ -429,13 +436,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
30803         if (sk_memalloc_socks())
30804                 gfp_mask |= __GFP_MEMALLOC;
30805  
30806 -       local_irq_save(flags);
30807 +       local_lock_irqsave(netdev_alloc_lock, flags);
30808  
30809         nc = this_cpu_ptr(&netdev_alloc_cache);
30810         data = __alloc_page_frag(nc, len, gfp_mask);
30811         pfmemalloc = nc->pfmemalloc;
30812  
30813 -       local_irq_restore(flags);
30814 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30815  
30816         if (unlikely(!data))
30817                 return NULL;
30818 @@ -476,9 +483,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
30819  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30820                                  gfp_t gfp_mask)
30821  {
30822 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30823 +       struct page_frag_cache *nc;
30824         struct sk_buff *skb;
30825         void *data;
30826 +       bool pfmemalloc;
30827  
30828         len += NET_SKB_PAD + NET_IP_ALIGN;
30829  
30830 @@ -496,7 +504,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30831         if (sk_memalloc_socks())
30832                 gfp_mask |= __GFP_MEMALLOC;
30833  
30834 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30835         data = __alloc_page_frag(nc, len, gfp_mask);
30836 +       pfmemalloc = nc->pfmemalloc;
30837 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30838 +
30839         if (unlikely(!data))
30840                 return NULL;
30841  
30842 @@ -507,7 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30843         }
30844  
30845         /* use OR instead of assignment to avoid clearing of bits in mask */
30846 -       if (nc->pfmemalloc)
30847 +       if (pfmemalloc)
30848                 skb->pfmemalloc = 1;
30849         skb->head_frag = 1;
30850  
30851 diff --git a/net/core/sock.c b/net/core/sock.c
30852 index 0d91f7dca751..9c3234299fc3 100644
30853 --- a/net/core/sock.c
30854 +++ b/net/core/sock.c
30855 @@ -2435,12 +2435,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
30856         if (sk->sk_lock.owned)
30857                 __lock_sock(sk);
30858         sk->sk_lock.owned = 1;
30859 -       spin_unlock(&sk->sk_lock.slock);
30860 +       spin_unlock_bh(&sk->sk_lock.slock);
30861         /*
30862          * The sk_lock has mutex_lock() semantics here:
30863          */
30864         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
30865 -       local_bh_enable();
30866  }
30867  EXPORT_SYMBOL(lock_sock_nested);
30868  
30869 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
30870 index 36e26977c908..ff2593269089 100644
30871 --- a/net/ipv4/icmp.c
30872 +++ b/net/ipv4/icmp.c
30873 @@ -69,6 +69,7 @@
30874  #include <linux/jiffies.h>
30875  #include <linux/kernel.h>
30876  #include <linux/fcntl.h>
30877 +#include <linux/sysrq.h>
30878  #include <linux/socket.h>
30879  #include <linux/in.h>
30880  #include <linux/inet.h>
30881 @@ -77,6 +78,7 @@
30882  #include <linux/string.h>
30883  #include <linux/netfilter_ipv4.h>
30884  #include <linux/slab.h>
30885 +#include <linux/locallock.h>
30886  #include <net/snmp.h>
30887  #include <net/ip.h>
30888  #include <net/route.h>
30889 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
30890   *
30891   *     On SMP we have one ICMP socket per-cpu.
30892   */
30893 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
30894 +
30895  static struct sock *icmp_sk(struct net *net)
30896  {
30897         return *this_cpu_ptr(net->ipv4.icmp_sk);
30898 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
30899  
30900         local_bh_disable();
30901  
30902 +       local_lock(icmp_sk_lock);
30903         sk = icmp_sk(net);
30904  
30905         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
30906                 /* This can happen if the output path signals a
30907                  * dst_link_failure() for an outgoing ICMP packet.
30908                  */
30909 +               local_unlock(icmp_sk_lock);
30910                 local_bh_enable();
30911                 return NULL;
30912         }
30913 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
30914  static inline void icmp_xmit_unlock(struct sock *sk)
30915  {
30916         spin_unlock_bh(&sk->sk_lock.slock);
30917 +       local_unlock(icmp_sk_lock);
30918  }
30919  
30920  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
30921 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
30922         struct sock *sk;
30923         struct sk_buff *skb;
30924  
30925 +       local_lock(icmp_sk_lock);
30926         sk = icmp_sk(dev_net((*rt)->dst.dev));
30927         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
30928                            icmp_param->data_len+icmp_param->head_len,
30929 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
30930                 skb->ip_summed = CHECKSUM_NONE;
30931                 ip_push_pending_frames(sk, fl4);
30932         }
30933 +       local_unlock(icmp_sk_lock);
30934  }
30935  
30936  /*
30937 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
30938  }
30939  
30940  /*
30941 + * 32bit and 64bit have different timestamp length, so we check for
30942 + * the cookie at offset 20 and verify it is repeated at offset 50
30943 + */
30944 +#define CO_POS0                20
30945 +#define CO_POS1                50
30946 +#define CO_SIZE                sizeof(int)
30947 +#define ICMP_SYSRQ_SIZE        57
30948 +
30949 +/*
30950 + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
30951 + * pattern and if it matches send the next byte as a trigger to sysrq.
30952 + */
30953 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
30954 +{
30955 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
30956 +       char *p = skb->data;
30957 +
30958 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
30959 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
30960 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
30961 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
30962 +}
30963 +
30964 +/*
30965   *     Handle ICMP_ECHO ("ping") requests.
30966   *
30967   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
30968 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
30969                 icmp_param.data_len        = skb->len;
30970                 icmp_param.head_len        = sizeof(struct icmphdr);
30971                 icmp_reply(&icmp_param, skb);
30972 +
30973 +               if (skb->len == ICMP_SYSRQ_SIZE &&
30974 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
30975 +                       icmp_check_sysrq(net, skb);
30976 +               }
30977         }
30978         /* should there be an ICMP stat for ignored echos? */
30979         return true;
30980 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
30981 index a0bd7a55193e..1866f910263f 100644
30982 --- a/net/ipv4/sysctl_net_ipv4.c
30983 +++ b/net/ipv4/sysctl_net_ipv4.c
30984 @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
30985                 .proc_handler   = proc_dointvec
30986         },
30987         {
30988 +               .procname       = "icmp_echo_sysrq",
30989 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
30990 +               .maxlen         = sizeof(int),
30991 +               .mode           = 0644,
30992 +               .proc_handler   = proc_dointvec
30993 +       },
30994 +       {
30995                 .procname       = "icmp_ignore_bogus_error_responses",
30996                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
30997                 .maxlen         = sizeof(int),
30998 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
30999 index b5853cac3269..de922d86ba2c 100644
31000 --- a/net/ipv4/tcp_ipv4.c
31001 +++ b/net/ipv4/tcp_ipv4.c
31002 @@ -62,6 +62,7 @@
31003  #include <linux/init.h>
31004  #include <linux/times.h>
31005  #include <linux/slab.h>
31006 +#include <linux/locallock.h>
31007  
31008  #include <net/net_namespace.h>
31009  #include <net/icmp.h>
31010 @@ -566,6 +567,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
31011  }
31012  EXPORT_SYMBOL(tcp_v4_send_check);
31013  
31014 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
31015  /*
31016   *     This routine will send an RST to the other tcp.
31017   *
31018 @@ -687,10 +689,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
31019                 arg.bound_dev_if = sk->sk_bound_dev_if;
31020  
31021         arg.tos = ip_hdr(skb)->tos;
31022 +
31023 +       local_lock(tcp_sk_lock);
31024         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31025                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31026                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31027                               &arg, arg.iov[0].iov_len);
31028 +       local_unlock(tcp_sk_lock);
31029  
31030         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31031         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
31032 @@ -772,10 +777,12 @@ static void tcp_v4_send_ack(struct net *net,
31033         if (oif)
31034                 arg.bound_dev_if = oif;
31035         arg.tos = tos;
31036 +       local_lock(tcp_sk_lock);
31037         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31038                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31039                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31040                               &arg, arg.iov[0].iov_len);
31041 +       local_unlock(tcp_sk_lock);
31042  
31043         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31044  }
31045 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
31046 index a3bb8f7f5fc5..3be977394a80 100644
31047 --- a/net/mac80211/rx.c
31048 +++ b/net/mac80211/rx.c
31049 @@ -3574,7 +3574,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
31050         struct ieee80211_supported_band *sband;
31051         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
31052  
31053 -       WARN_ON_ONCE(softirq_count() == 0);
31054 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
31055  
31056         if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
31057                 goto drop;
31058 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
31059 index f39276d1c2d7..10880c89d62f 100644
31060 --- a/net/netfilter/core.c
31061 +++ b/net/netfilter/core.c
31062 @@ -22,11 +22,17 @@
31063  #include <linux/proc_fs.h>
31064  #include <linux/mutex.h>
31065  #include <linux/slab.h>
31066 +#include <linux/locallock.h>
31067  #include <net/net_namespace.h>
31068  #include <net/sock.h>
31069  
31070  #include "nf_internals.h"
31071  
31072 +#ifdef CONFIG_PREEMPT_RT_BASE
31073 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
31074 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
31075 +#endif
31076 +
31077  static DEFINE_MUTEX(afinfo_mutex);
31078  
31079  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31080 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
31081 index a86f26d05bc2..77276e3ff2a2 100644
31082 --- a/net/packet/af_packet.c
31083 +++ b/net/packet/af_packet.c
31084 @@ -63,6 +63,7 @@
31085  #include <linux/if_packet.h>
31086  #include <linux/wireless.h>
31087  #include <linux/kernel.h>
31088 +#include <linux/delay.h>
31089  #include <linux/kmod.h>
31090  #include <linux/slab.h>
31091  #include <linux/vmalloc.h>
31092 @@ -695,7 +696,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
31093         if (BLOCK_NUM_PKTS(pbd)) {
31094                 while (atomic_read(&pkc->blk_fill_in_prog)) {
31095                         /* Waiting for skb_copy_bits to finish... */
31096 -                       cpu_relax();
31097 +                       cpu_chill();
31098                 }
31099         }
31100  
31101 @@ -957,7 +958,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
31102                 if (!(status & TP_STATUS_BLK_TMO)) {
31103                         while (atomic_read(&pkc->blk_fill_in_prog)) {
31104                                 /* Waiting for skb_copy_bits to finish... */
31105 -                               cpu_relax();
31106 +                               cpu_chill();
31107                         }
31108                 }
31109                 prb_close_block(pkc, pbd, po, status);
31110 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
31111 index a2340748ec86..19123a97b354 100644
31112 --- a/net/rds/ib_rdma.c
31113 +++ b/net/rds/ib_rdma.c
31114 @@ -34,6 +34,7 @@
31115  #include <linux/slab.h>
31116  #include <linux/rculist.h>
31117  #include <linux/llist.h>
31118 +#include <linux/delay.h>
31119  
31120  #include "rds.h"
31121  #include "ib.h"
31122 @@ -313,7 +314,7 @@ static inline void wait_clean_list_grace(void)
31123         for_each_online_cpu(cpu) {
31124                 flag = &per_cpu(clean_list_grace, cpu);
31125                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
31126 -                       cpu_relax();
31127 +                       cpu_chill();
31128         }
31129  }
31130  
31131 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
31132 index aa4725038f94..00b81cab28f3 100644
31133 --- a/net/sched/sch_generic.c
31134 +++ b/net/sched/sch_generic.c
31135 @@ -893,7 +893,7 @@ void dev_deactivate_many(struct list_head *head)
31136         /* Wait for outstanding qdisc_run calls. */
31137         list_for_each_entry(dev, head, close_list)
31138                 while (some_qdisc_is_busy(dev))
31139 -                       yield();
31140 +                       msleep(1);
31141  }
31142  
31143  void dev_deactivate(struct net_device *dev)
31144 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
31145 index a6cbb2104667..5b69bb580617 100644
31146 --- a/net/sunrpc/svc_xprt.c
31147 +++ b/net/sunrpc/svc_xprt.c
31148 @@ -340,7 +340,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
31149                 goto out;
31150         }
31151  
31152 -       cpu = get_cpu();
31153 +       cpu = get_cpu_light();
31154         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
31155  
31156         atomic_long_inc(&pool->sp_stats.packets);
31157 @@ -376,7 +376,7 @@ redo_search:
31158  
31159                 atomic_long_inc(&pool->sp_stats.threads_woken);
31160                 wake_up_process(rqstp->rq_task);
31161 -               put_cpu();
31162 +               put_cpu_light();
31163                 goto out;
31164         }
31165         rcu_read_unlock();
31166 @@ -397,7 +397,7 @@ redo_search:
31167                 goto redo_search;
31168         }
31169         rqstp = NULL;
31170 -       put_cpu();
31171 +       put_cpu_light();
31172  out:
31173         trace_svc_xprt_do_enqueue(xprt, rqstp);
31174  }
31175 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
31176 index 6fdc97ef6023..523e0420d7f0 100755
31177 --- a/scripts/mkcompile_h
31178 +++ b/scripts/mkcompile_h
31179 @@ -4,7 +4,8 @@ TARGET=$1
31180  ARCH=$2
31181  SMP=$3
31182  PREEMPT=$4
31183 -CC=$5
31184 +RT=$5
31185 +CC=$6
31186  
31187  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
31188  
31189 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
31190  CONFIG_FLAGS=""
31191  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
31192  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
31193 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
31194  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
31195  
31196  # Truncate to maximum length
31197 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
31198 index 4ba64fd49759..34e50186885d 100644
31199 --- a/sound/core/pcm_native.c
31200 +++ b/sound/core/pcm_native.c
31201 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
31202  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
31203  {
31204         if (!substream->pcm->nonatomic)
31205 -               local_irq_disable();
31206 +               local_irq_disable_nort();
31207         snd_pcm_stream_lock(substream);
31208  }
31209  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
31210 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
31211  {
31212         snd_pcm_stream_unlock(substream);
31213         if (!substream->pcm->nonatomic)
31214 -               local_irq_enable();
31215 +               local_irq_enable_nort();
31216  }
31217  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
31218  
31219 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
31220  {
31221         unsigned long flags = 0;
31222         if (!substream->pcm->nonatomic)
31223 -               local_irq_save(flags);
31224 +               local_irq_save_nort(flags);
31225         snd_pcm_stream_lock(substream);
31226         return flags;
31227  }
31228 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
31229  {
31230         snd_pcm_stream_unlock(substream);
31231         if (!substream->pcm->nonatomic)
31232 -               local_irq_restore(flags);
31233 +               local_irq_restore_nort(flags);
31234  }
31235  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
31236  
31237 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
31238 index 4f70d12e392d..9378d0919ed8 100644
31239 --- a/virt/kvm/async_pf.c
31240 +++ b/virt/kvm/async_pf.c
31241 @@ -98,8 +98,8 @@ static void async_pf_execute(struct work_struct *work)
31242          * This memory barrier pairs with prepare_to_wait's set_current_state()
31243          */
31244         smp_mb();
31245 -       if (waitqueue_active(&vcpu->wq))
31246 -               wake_up_interruptible(&vcpu->wq);
31247 +       if (swait_active(&vcpu->wq))
31248 +               swake_up(&vcpu->wq);
31249  
31250         mmput(mm);
31251         kvm_put_kvm(vcpu->kvm);
31252 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
31253 index 336ed267c407..7748ca386e60 100644
31254 --- a/virt/kvm/kvm_main.c
31255 +++ b/virt/kvm/kvm_main.c
31256 @@ -228,8 +228,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
31257         vcpu->kvm = kvm;
31258         vcpu->vcpu_id = id;
31259         vcpu->pid = NULL;
31260 -       vcpu->halt_poll_ns = 0;
31261 -       init_waitqueue_head(&vcpu->wq);
31262 +       init_swait_queue_head(&vcpu->wq);
31263         kvm_async_pf_vcpu_init(vcpu);
31264  
31265         vcpu->pre_pcpu = -1;
31266 @@ -2005,7 +2004,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
31267  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31268  {
31269         ktime_t start, cur;
31270 -       DEFINE_WAIT(wait);
31271 +       DECLARE_SWAITQUEUE(wait);
31272         bool waited = false;
31273         u64 block_ns;
31274  
31275 @@ -2030,7 +2029,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31276         kvm_arch_vcpu_blocking(vcpu);
31277  
31278         for (;;) {
31279 -               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31280 +               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31281  
31282                 if (kvm_vcpu_check_block(vcpu) < 0)
31283                         break;
31284 @@ -2039,7 +2038,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31285                 schedule();
31286         }
31287  
31288 -       finish_wait(&vcpu->wq, &wait);
31289 +       finish_swait(&vcpu->wq, &wait);
31290         cur = ktime_get();
31291  
31292         kvm_arch_vcpu_unblocking(vcpu);
31293 @@ -2071,11 +2070,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
31294  {
31295         int me;
31296         int cpu = vcpu->cpu;
31297 -       wait_queue_head_t *wqp;
31298 +       struct swait_queue_head *wqp;
31299  
31300         wqp = kvm_arch_vcpu_wq(vcpu);
31301 -       if (waitqueue_active(wqp)) {
31302 -               wake_up_interruptible(wqp);
31303 +       if (swait_active(wqp)) {
31304 +               swake_up(wqp);
31305                 ++vcpu->stat.halt_wakeup;
31306         }
31307  
31308 @@ -2176,7 +2175,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
31309                                 continue;
31310                         if (vcpu == me)
31311                                 continue;
31312 -                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31313 +                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31314                                 continue;
31315                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
31316                                 continue;